summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.h3
-rw-r--r--fs/9p/v9fs.h11
-rw-r--r--fs/9p/vfs_addr.c60
-rw-r--r--fs/9p/vfs_file.c2
-rw-r--r--fs/9p/vfs_inode.c44
-rw-r--r--fs/9p/vfs_inode_dotl.c28
-rw-r--r--fs/9p/vfs_super.c19
-rw-r--r--fs/afs/file.c8
-rw-r--r--fs/afs/internal.h6
-rw-r--r--fs/afs/rotate.c8
-rw-r--r--fs/afs/validation.c4
-rw-r--r--fs/afs/write.c189
-rw-r--r--fs/aio.c91
-rw-r--r--fs/anon_inodes.c33
-rw-r--r--fs/bcachefs/acl.c30
-rw-r--r--fs/bcachefs/alloc_background.c4
-rw-r--r--fs/bcachefs/alloc_background.h8
-rw-r--r--fs/bcachefs/backpointers.c10
-rw-r--r--fs/bcachefs/backpointers.h17
-rw-r--r--fs/bcachefs/bcachefs.h2
-rw-r--r--fs/bcachefs/bcachefs_format.h36
-rw-r--r--fs/bcachefs/bkey.h6
-rw-r--r--fs/bcachefs/bkey_methods.c8
-rw-r--r--fs/bcachefs/btree_cache.c73
-rw-r--r--fs/bcachefs/btree_gc.c29
-rw-r--r--fs/bcachefs/btree_io.c19
-rw-r--r--fs/bcachefs/btree_iter.h11
-rw-r--r--fs/bcachefs/btree_journal_iter.c67
-rw-r--r--fs/bcachefs/btree_key_cache.c39
-rw-r--r--fs/bcachefs/btree_locking.c28
-rw-r--r--fs/bcachefs/btree_node_scan.c39
-rw-r--r--fs/bcachefs/btree_node_scan_types.h1
-rw-r--r--fs/bcachefs/btree_trans_commit.c34
-rw-r--r--fs/bcachefs/btree_types.h16
-rw-r--r--fs/bcachefs/btree_update_interior.c213
-rw-r--r--fs/bcachefs/btree_update_interior.h3
-rw-r--r--fs/bcachefs/btree_write_buffer.c14
-rw-r--r--fs/bcachefs/buckets.c2
-rw-r--r--fs/bcachefs/buckets.h8
-rw-r--r--fs/bcachefs/chardev.c102
-rw-r--r--fs/bcachefs/checksum.c24
-rw-r--r--fs/bcachefs/checksum.h5
-rw-r--r--fs/bcachefs/compress.h8
-rw-r--r--fs/bcachefs/data_update.c17
-rw-r--r--fs/bcachefs/debug.c75
-rw-r--r--fs/bcachefs/ec.c54
-rw-r--r--fs/bcachefs/ec.h2
-rw-r--r--fs/bcachefs/errcode.h1
-rw-r--r--fs/bcachefs/extents.c11
-rw-r--r--fs/bcachefs/eytzinger.c8
-rw-r--r--fs/bcachefs/eytzinger.h26
-rw-r--r--fs/bcachefs/fs-io-direct.c19
-rw-r--r--fs/bcachefs/fs-io.c16
-rw-r--r--fs/bcachefs/fs.c14
-rw-r--r--fs/bcachefs/inode.c2
-rw-r--r--fs/bcachefs/io_write.c30
-rw-r--r--fs/bcachefs/journal.c8
-rw-r--r--fs/bcachefs/journal_io.c77
-rw-r--r--fs/bcachefs/journal_reclaim.c2
-rw-r--r--fs/bcachefs/journal_types.h1
-rw-r--r--fs/bcachefs/move.c22
-rw-r--r--fs/bcachefs/opts.c29
-rw-r--r--fs/bcachefs/opts.h10
-rw-r--r--fs/bcachefs/quota.c8
-rw-r--r--fs/bcachefs/recovery.c22
-rw-r--r--fs/bcachefs/recovery_passes.c2
-rw-r--r--fs/bcachefs/sb-clean.c22
-rw-r--r--fs/bcachefs/sb-downgrade.c5
-rw-r--r--fs/bcachefs/sb-errors_types.h6
-rw-r--r--fs/bcachefs/sb-members.c59
-rw-r--r--fs/bcachefs/sb-members.h25
-rw-r--r--fs/bcachefs/snapshot.c19
-rw-r--r--fs/bcachefs/super-io.c58
-rw-r--r--fs/bcachefs/super.c21
-rw-r--r--fs/bcachefs/super_types.h2
-rw-r--r--fs/bcachefs/sysfs.c17
-rw-r--r--fs/bcachefs/tests.c2
-rw-r--r--fs/bcachefs/thread_with_file.c15
-rw-r--r--fs/bcachefs/thread_with_file.h3
-rw-r--r--fs/bcachefs/util.h10
-rw-r--r--fs/btrfs/backref.c12
-rw-r--r--fs/btrfs/delayed-inode.c3
-rw-r--r--fs/btrfs/extent-tree.c8
-rw-r--r--fs/btrfs/extent_io.c21
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/file.c4
-rw-r--r--fs/btrfs/inode.c31
-rw-r--r--fs/btrfs/ioctl.c70
-rw-r--r--fs/btrfs/messages.c2
-rw-r--r--fs/btrfs/ordered-data.c1
-rw-r--r--fs/btrfs/qgroup.c25
-rw-r--r--fs/btrfs/root-tree.c10
-rw-r--r--fs/btrfs/root-tree.h2
-rw-r--r--fs/btrfs/scrub.c18
-rw-r--r--fs/btrfs/tests/extent-map-tests.c5
-rw-r--r--fs/btrfs/transaction.c19
-rw-r--r--fs/btrfs/tree-checker.c30
-rw-r--r--fs/btrfs/tree-checker.h1
-rw-r--r--fs/btrfs/volumes.c1
-rw-r--r--fs/cachefiles/io.c76
-rw-r--r--fs/ceph/addr.c28
-rw-r--r--fs/ceph/caps.c4
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/ceph/mds_client.c9
-rw-r--r--fs/ceph/mds_client.h3
-rw-r--r--fs/dcache.c2
-rw-r--r--fs/debugfs/inode.c198
-rw-r--r--fs/direct-io.c1
-rw-r--r--fs/ecryptfs/keystore.c4
-rw-r--r--fs/erofs/fscache.c2
-rw-r--r--fs/erofs/internal.h7
-rw-r--r--fs/erofs/super.c124
-rw-r--r--fs/eventpoll.c38
-rw-r--r--fs/exfat/dir.c2
-rw-r--r--fs/exfat/file.c7
-rw-r--r--fs/ext4/file.c6
-rw-r--r--fs/ext4/super.c4
-rw-r--r--fs/f2fs/file.c3
-rw-r--r--fs/fcntl.c20
-rw-r--r--fs/fhandle.c6
-rw-r--r--fs/freevxfs/vxfs_super.c69
-rw-r--r--fs/fs-writeback.c57
-rw-r--r--fs/fuse/cuse.c4
-rw-r--r--fs/fuse/dir.c1
-rw-r--r--fs/fuse/file.c12
-rw-r--r--fs/fuse/fuse_i.h7
-rw-r--r--fs/fuse/inode.c1
-rw-r--r--fs/fuse/iomode.c60
-rw-r--r--fs/fuse/passthrough.c2
-rw-r--r--fs/fuse/virtio_fs.c2
-rw-r--r--fs/hugetlbfs/inode.c5
-rw-r--r--fs/ioctl.c4
-rw-r--r--fs/iomap/buffered-io.c119
-rw-r--r--fs/jffs2/xattr.c3
-rw-r--r--fs/kernfs/file.c9
-rw-r--r--fs/libfs.c55
-rw-r--r--fs/minix/inode.c48
-rw-r--r--fs/namei.c19
-rw-r--r--fs/netfs/Makefile3
-rw-r--r--fs/netfs/buffered_read.c40
-rw-r--r--fs/netfs/buffered_write.c850
-rw-r--r--fs/netfs/direct_write.c56
-rw-r--r--fs/netfs/fscache_io.c14
-rw-r--r--fs/netfs/internal.h55
-rw-r--r--fs/netfs/io.c162
-rw-r--r--fs/netfs/main.c55
-rw-r--r--fs/netfs/misc.c10
-rw-r--r--fs/netfs/objects.c81
-rw-r--r--fs/netfs/output.c478
-rw-r--r--fs/netfs/stats.c17
-rw-r--r--fs/netfs/write_collect.c808
-rw-r--r--fs/netfs/write_issue.c684
-rw-r--r--fs/nfs/file.c8
-rw-r--r--fs/nfs/fscache.h6
-rw-r--r--fs/nfs/inode.c7
-rw-r--r--fs/nfs/write.c4
-rw-r--r--fs/nfsd/nfs4callback.c26
-rw-r--r--fs/nfsd/nfs4xdr.c49
-rw-r--r--fs/nfsd/state.h2
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/ioctl.c2
-rw-r--r--fs/ntfs3/Kconfig9
-rw-r--r--fs/ntfs3/dir.c7
-rw-r--r--fs/ntfs3/file.c8
-rw-r--r--fs/ntfs3/inode.c20
-rw-r--r--fs/ntfs3/ntfs_fs.h4
-rw-r--r--fs/ntfs3/super.c65
-rw-r--r--fs/openpromfs/inode.c8
-rw-r--r--fs/orangefs/dcache.c4
-rw-r--r--fs/orangefs/namei.c26
-rw-r--r--fs/orangefs/super.c17
-rw-r--r--fs/overlayfs/params.c4
-rw-r--r--fs/proc/bootconfig.c12
-rw-r--r--fs/proc/fd.c42
-rw-r--r--fs/proc/page.c7
-rw-r--r--fs/proc/proc_net.c1
-rw-r--r--fs/proc/task_mmu.c24
-rw-r--r--fs/qnx6/inode.c117
-rw-r--r--fs/read_write.c2
-rw-r--r--fs/seq_file.c13
-rw-r--r--fs/smb/client/Kconfig1
-rw-r--r--fs/smb/client/cached_dir.c4
-rw-r--r--fs/smb/client/cifsfs.c127
-rw-r--r--fs/smb/client/cifsfs.h11
-rw-r--r--fs/smb/client/cifsglob.h69
-rw-r--r--fs/smb/client/cifspdu.h4
-rw-r--r--fs/smb/client/cifsproto.h21
-rw-r--r--fs/smb/client/cifssmb.c120
-rw-r--r--fs/smb/client/connect.c29
-rw-r--r--fs/smb/client/file.c2720
-rw-r--r--fs/smb/client/fs_context.c33
-rw-r--r--fs/smb/client/fs_context.h4
-rw-r--r--fs/smb/client/fscache.c129
-rw-r--r--fs/smb/client/fscache.h54
-rw-r--r--fs/smb/client/inode.c48
-rw-r--r--fs/smb/client/misc.c14
-rw-r--r--fs/smb/client/smb2misc.c10
-rw-r--r--fs/smb/client/smb2ops.c111
-rw-r--r--fs/smb/client/smb2pdu.c205
-rw-r--r--fs/smb/client/smb2pdu.h2
-rw-r--r--fs/smb/client/smb2proto.h5
-rw-r--r--fs/smb/client/smb2transport.c2
-rw-r--r--fs/smb/client/trace.h236
-rw-r--r--fs/smb/client/transport.c24
-rw-r--r--fs/smb/common/smb2pdu.h2
-rw-r--r--fs/smb/server/ksmbd_netlink.h35
-rw-r--r--fs/smb/server/oplock.c65
-rw-r--r--fs/smb/server/server.c13
-rw-r--r--fs/smb/server/smb2pdu.c21
-rw-r--r--fs/smb/server/smb_common.c4
-rw-r--r--fs/smb/server/transport_tcp.c4
-rw-r--r--fs/smb/server/vfs.c5
-rw-r--r--fs/smb/server/vfs_cache.c28
-rw-r--r--fs/smb/server/vfs_cache.h2
-rw-r--r--fs/squashfs/inode.c5
-rw-r--r--fs/stat.c1
-rw-r--r--fs/sysfs/file.c2
-rw-r--r--fs/tracefs/event_inode.c162
-rw-r--r--fs/tracefs/inode.c288
-rw-r--r--fs/tracefs/internal.h14
-rw-r--r--fs/userfaultfd.c4
-rw-r--r--fs/xfs/xfs_file.c10
-rw-r--r--fs/zonefs/super.c2
223 files changed, 5731 insertions, 6098 deletions
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 29281b7c3887..0d6138bee2a3 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -49,9 +49,6 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
static inline void v9fs_fid_add_modes(struct p9_fid *fid, unsigned int s_flags,
unsigned int s_cache, unsigned int f_flags)
{
- if (fid->qid.type != P9_QTFILE)
- return;
-
if ((!s_cache) ||
((fid->qid.version == 0) && !(s_flags & V9FS_IGNORE_QV)) ||
(s_flags & V9FS_DIRECT_IO) || (f_flags & O_DIRECT)) {
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 9defa12208f9..1775fcc7f0e8 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -179,13 +179,14 @@ extern int v9fs_vfs_rename(struct mnt_idmap *idmap,
struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags);
-extern struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid);
+extern struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid,
+ bool new);
extern const struct inode_operations v9fs_dir_inode_operations_dotl;
extern const struct inode_operations v9fs_file_inode_operations_dotl;
extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
extern const struct netfs_request_ops v9fs_req_ops;
extern struct inode *v9fs_fid_iget_dotl(struct super_block *sb,
- struct p9_fid *fid);
+ struct p9_fid *fid, bool new);
/* other default globals */
#define V9FS_PORT 564
@@ -224,12 +225,12 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
*/
static inline struct inode *
v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
- struct super_block *sb)
+ struct super_block *sb, bool new)
{
if (v9fs_proto_dotl(v9ses))
- return v9fs_fid_iget_dotl(sb, fid);
+ return v9fs_fid_iget_dotl(sb, fid, new);
else
- return v9fs_fid_iget(sb, fid);
+ return v9fs_fid_iget(sb, fid, new);
}
#endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 047855033d32..a97ceb105cd8 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -26,36 +26,38 @@
#include "cache.h"
#include "fid.h"
-static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq)
+/*
+ * Writeback calls this when it finds a folio that needs uploading. This isn't
+ * called if writeback only has copy-to-cache to deal with.
+ */
+static void v9fs_begin_writeback(struct netfs_io_request *wreq)
{
- struct p9_fid *fid = subreq->rreq->netfs_priv;
- int err, len;
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
- len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
- netfs_write_subrequest_terminated(subreq, len ?: err, false);
-}
+ struct p9_fid *fid;
-static void v9fs_upload_to_server_worker(struct work_struct *work)
-{
- struct netfs_io_subrequest *subreq =
- container_of(work, struct netfs_io_subrequest, work);
+ fid = v9fs_fid_find_inode(wreq->inode, true, INVALID_UID, true);
+ if (!fid) {
+ WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n",
+ wreq->inode->i_ino);
+ return;
+ }
- v9fs_upload_to_server(subreq);
+ wreq->wsize = fid->clnt->msize - P9_IOHDRSZ;
+ if (fid->iounit)
+ wreq->wsize = min(wreq->wsize, fid->iounit);
+ wreq->netfs_priv = fid;
+ wreq->io_streams[0].avail = true;
}
/*
- * Set up write requests for a writeback slice. We need to add a write request
- * for each write we want to make.
+ * Issue a subrequest to write to the server.
*/
-static void v9fs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len)
+static void v9fs_issue_write(struct netfs_io_subrequest *subreq)
{
- struct netfs_io_subrequest *subreq;
+ struct p9_fid *fid = subreq->rreq->netfs_priv;
+ int err, len;
- subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER,
- start, len, v9fs_upload_to_server_worker);
- if (subreq)
- netfs_queue_write_request(subreq);
+ len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
+ netfs_write_subrequest_terminated(subreq, len ?: err, false);
}
/**
@@ -87,12 +89,16 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
{
struct p9_fid *fid;
bool writing = (rreq->origin == NETFS_READ_FOR_WRITE ||
- rreq->origin == NETFS_WRITEBACK ||
rreq->origin == NETFS_WRITETHROUGH ||
- rreq->origin == NETFS_LAUNDER_WRITE ||
rreq->origin == NETFS_UNBUFFERED_WRITE ||
rreq->origin == NETFS_DIO_WRITE);
+ if (rreq->origin == NETFS_WRITEBACK)
+ return 0; /* We don't get the write handle until we find we
+ * have actually dirty data and not just
+ * copy-to-cache data.
+ */
+
if (file) {
fid = file->private_data;
if (!fid)
@@ -104,6 +110,10 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
goto no_fid;
}
+ rreq->wsize = fid->clnt->msize - P9_IOHDRSZ;
+ if (fid->iounit)
+ rreq->wsize = min(rreq->wsize, fid->iounit);
+
/* we might need to read from a fid that was opened write-only
* for read-modify-write of page cache, use the writeback fid
* for that */
@@ -132,7 +142,8 @@ const struct netfs_request_ops v9fs_req_ops = {
.init_request = v9fs_init_request,
.free_request = v9fs_free_request,
.issue_read = v9fs_issue_read,
- .create_write_requests = v9fs_create_write_requests,
+ .begin_writeback = v9fs_begin_writeback,
+ .issue_write = v9fs_issue_write,
};
const struct address_space_operations v9fs_addr_operations = {
@@ -141,7 +152,6 @@ const struct address_space_operations v9fs_addr_operations = {
.dirty_folio = netfs_dirty_folio,
.release_folio = netfs_release_folio,
.invalidate_folio = netfs_invalidate_folio,
- .launder_folio = netfs_launder_folio,
.direct_IO = noop_direct_IO,
.writepages = netfs_writepages,
};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index abdbbaee5184..348cc90bf9c5 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -520,6 +520,7 @@ const struct file_operations v9fs_file_operations = {
.splice_read = v9fs_file_splice_read,
.splice_write = iter_file_splice_write,
.fsync = v9fs_file_fsync,
+ .setlease = simple_nosetlease,
};
const struct file_operations v9fs_file_operations_dotl = {
@@ -534,4 +535,5 @@ const struct file_operations v9fs_file_operations_dotl = {
.splice_read = v9fs_file_splice_read,
.splice_write = iter_file_splice_write,
.fsync = v9fs_file_fsync_dotl,
+ .setlease = simple_nosetlease,
};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b01b1bbf2493..7a3308d77606 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -83,7 +83,7 @@ static int p9mode2perm(struct v9fs_session_info *v9ses,
int res;
int mode = stat->mode;
- res = mode & S_IALLUGO;
+ res = mode & 0777; /* S_IRWXUGO */
if (v9fs_proto_dotu(v9ses)) {
if ((mode & P9_DMSETUID) == P9_DMSETUID)
res |= S_ISUID;
@@ -178,6 +178,9 @@ int v9fs_uflags2omode(int uflags, int extended)
break;
}
+ if (uflags & O_TRUNC)
+ ret |= P9_OTRUNC;
+
if (extended) {
if (uflags & O_EXCL)
ret |= P9_OEXCL;
@@ -361,7 +364,8 @@ void v9fs_evict_inode(struct inode *inode)
clear_inode(inode);
}
-struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid)
+struct inode *
+v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid, bool new)
{
dev_t rdev;
int retval;
@@ -373,8 +377,18 @@ struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid)
inode = iget_locked(sb, QID2INO(&fid->qid));
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
- return inode;
+ if (!(inode->i_state & I_NEW)) {
+ if (!new) {
+ goto done;
+ } else {
+ p9_debug(P9_DEBUG_VFS, "WARNING: Inode collision %ld\n",
+ inode->i_ino);
+ iput(inode);
+ remove_inode_hash(inode);
+ inode = iget_locked(sb, QID2INO(&fid->qid));
+ WARN_ON(!(inode->i_state & I_NEW));
+ }
+ }
/*
* initialize the inode with the stat info
@@ -398,11 +412,11 @@ struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid)
v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
+done:
return inode;
error:
iget_failed(inode);
return ERR_PTR(retval);
-
}
/**
@@ -434,8 +448,15 @@ static int v9fs_at_to_dotl_flags(int flags)
*/
static void v9fs_dec_count(struct inode *inode)
{
- if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
- drop_nlink(inode);
+ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) {
+ if (inode->i_nlink) {
+ drop_nlink(inode);
+ } else {
+ p9_debug(P9_DEBUG_VFS,
+ "WARNING: unexpected i_nlink zero %d inode %ld\n",
+ inode->i_nlink, inode->i_ino);
+ }
+ }
}
/**
@@ -486,6 +507,9 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
} else
v9fs_dec_count(inode);
+ if (inode->i_nlink <= 0) /* no more refs unhash it */
+ remove_inode_hash(inode);
+
v9fs_invalidate_inode_attr(inode);
v9fs_invalidate_inode_attr(dir);
@@ -551,7 +575,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
/*
* instantiate inode and assign the unopened fid to the dentry
*/
- inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+ inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb, true);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS,
@@ -680,7 +704,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
else if (IS_ERR(fid))
inode = ERR_CAST(fid);
else
- inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+ inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb, false);
/*
* If we had a rename on the server and a parallel lookup
* for the new name, then make sure we instantiate with
@@ -1061,8 +1085,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
struct v9fs_session_info *v9ses = sb->s_fs_info;
struct v9fs_inode *v9inode = V9FS_I(inode);
- set_nlink(inode, 1);
-
inode_set_atime(inode, stat->atime, 0);
inode_set_mtime(inode, stat->mtime, 0);
inode_set_ctime(inode, stat->mtime, 0);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 55dde186041a..c61b97bd13b9 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -52,7 +52,10 @@ static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
return current_fsgid();
}
-struct inode *v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid)
+
+
+struct inode *
+v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid, bool new)
{
int retval;
struct inode *inode;
@@ -62,8 +65,18 @@ struct inode *v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid)
inode = iget_locked(sb, QID2INO(&fid->qid));
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
- return inode;
+ if (!(inode->i_state & I_NEW)) {
+ if (!new) {
+ goto done;
+ } else { /* deal with race condition in inode number reuse */
+ p9_debug(P9_DEBUG_ERROR, "WARNING: Inode collision %lx\n",
+ inode->i_ino);
+ iput(inode);
+ remove_inode_hash(inode);
+ inode = iget_locked(sb, QID2INO(&fid->qid));
+ WARN_ON(!(inode->i_state & I_NEW));
+ }
+ }
/*
* initialize the inode with the stat info
@@ -90,12 +103,11 @@ struct inode *v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid)
goto error;
unlock_new_inode(inode);
-
+done:
return inode;
error:
iget_failed(inode);
return ERR_PTR(retval);
-
}
struct dotl_openflag_map {
@@ -247,7 +259,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
goto out;
}
- inode = v9fs_fid_iget_dotl(dir->i_sb, fid);
+ inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -340,7 +352,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
}
/* instantiate inode and assign the unopened fid to the dentry */
- inode = v9fs_fid_iget_dotl(dir->i_sb, fid);
+ inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -776,7 +788,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
err);
goto error;
}
- inode = v9fs_fid_iget_dotl(dir->i_sb, fid);
+ inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 4236058c7bbd..f52fdf42945c 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -139,7 +139,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
else
sb->s_d_op = &v9fs_dentry_operations;
- inode = v9fs_get_inode_from_fid(v9ses, fid, sb);
+ inode = v9fs_get_inode_from_fid(v9ses, fid, sb, true);
if (IS_ERR(inode)) {
retval = PTR_ERR(inode);
goto release_sb;
@@ -244,6 +244,21 @@ done:
return res;
}
+static int v9fs_drop_inode(struct inode *inode)
+{
+ struct v9fs_session_info *v9ses;
+
+ v9ses = v9fs_inode2v9ses(inode);
+ if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
+ return generic_drop_inode(inode);
+ /*
+ * in case of non cached mode always drop the
+ * inode because we want the inode attribute
+ * to always match that on the server.
+ */
+ return 1;
+}
+
static int v9fs_write_inode(struct inode *inode,
struct writeback_control *wbc)
{
@@ -268,6 +283,7 @@ static const struct super_operations v9fs_super_ops = {
.alloc_inode = v9fs_alloc_inode,
.free_inode = v9fs_free_inode,
.statfs = simple_statfs,
+ .drop_inode = v9fs_drop_inode,
.evict_inode = v9fs_evict_inode,
.show_options = v9fs_show_options,
.umount_begin = v9fs_umount_begin,
@@ -278,6 +294,7 @@ static const struct super_operations v9fs_super_ops_dotl = {
.alloc_inode = v9fs_alloc_inode,
.free_inode = v9fs_free_inode,
.statfs = v9fs_statfs,
+ .drop_inode = v9fs_drop_inode,
.evict_inode = v9fs_evict_inode,
.show_options = v9fs_show_options,
.umount_begin = v9fs_umount_begin,
diff --git a/fs/afs/file.c b/fs/afs/file.c
index ef2cc8f565d2..c3f0c45ae9a9 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -54,7 +54,6 @@ const struct address_space_operations afs_file_aops = {
.read_folio = netfs_read_folio,
.readahead = netfs_readahead,
.dirty_folio = netfs_dirty_folio,
- .launder_folio = netfs_launder_folio,
.release_folio = netfs_release_folio,
.invalidate_folio = netfs_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
@@ -354,7 +353,7 @@ static int afs_init_request(struct netfs_io_request *rreq, struct file *file)
if (file)
rreq->netfs_priv = key_get(afs_file_key(file));
rreq->rsize = 256 * 1024;
- rreq->wsize = 256 * 1024;
+ rreq->wsize = 256 * 1024 * 1024;
return 0;
}
@@ -369,6 +368,7 @@ static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
static void afs_free_request(struct netfs_io_request *rreq)
{
key_put(rreq->netfs_priv);
+ afs_put_wb_key(rreq->netfs_priv2);
}
static void afs_update_i_size(struct inode *inode, loff_t new_i_size)
@@ -400,7 +400,9 @@ const struct netfs_request_ops afs_req_ops = {
.issue_read = afs_issue_read,
.update_i_size = afs_update_i_size,
.invalidate_cache = afs_netfs_invalidate_cache,
- .create_write_requests = afs_create_write_requests,
+ .begin_writeback = afs_begin_writeback,
+ .prepare_write = afs_prepare_write,
+ .issue_write = afs_issue_write,
};
static void afs_add_open_mmap(struct afs_vnode *vnode)
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6ce5a612937c..6e1d3c4daf72 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -916,7 +916,6 @@ struct afs_operation {
loff_t pos;
loff_t size;
loff_t i_size;
- bool laundering; /* Laundering page, PG_writeback not set */
} store;
struct {
struct iattr *attr;
@@ -1599,11 +1598,14 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
/*
* write.c
*/
+void afs_prepare_write(struct netfs_io_subrequest *subreq);
+void afs_issue_write(struct netfs_io_subrequest *subreq);
+void afs_begin_writeback(struct netfs_io_request *wreq);
+void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream);
extern int afs_writepages(struct address_space *, struct writeback_control *);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf);
extern void afs_prune_wb_keys(struct afs_vnode *);
-void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len);
/*
* xattr.c
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index ed04bd1eeae8..ed09d4d4c211 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -541,11 +541,13 @@ pick_server:
test_bit(AFS_SE_EXCLUDED, &se->flags) ||
!test_bit(AFS_SERVER_FL_RESPONDING, &s->flags))
continue;
- es = op->server_states->endpoint_state;
+ es = op->server_states[i].endpoint_state;
sal = es->addresses;
afs_get_address_preferences_rcu(op->net, sal);
for (j = 0; j < sal->nr_addrs; j++) {
+ if (es->failed_set & (1 << j))
+ continue;
if (!sal->addrs[j].peer)
continue;
if (sal->addrs[j].prio > best_prio) {
@@ -605,6 +607,8 @@ iterate_address:
best_prio = -1;
addr_index = 0;
for (i = 0; i < alist->nr_addrs; i++) {
+ if (!(set & (1 << i)))
+ continue;
if (alist->addrs[i].prio > best_prio) {
addr_index = i;
best_prio = alist->addrs[i].prio;
@@ -674,7 +678,7 @@ no_more_servers:
for (i = 0; i < op->server_list->nr_servers; i++) {
struct afs_endpoint_state *estate;
- estate = op->server_states->endpoint_state;
+ estate = op->server_states[i].endpoint_state;
error = READ_ONCE(estate->error);
if (error < 0)
afs_op_accumulate_error(op, error, estate->abort_code);
diff --git a/fs/afs/validation.c b/fs/afs/validation.c
index 32a53fc8dfb2..bef8af12ebe2 100644
--- a/fs/afs/validation.c
+++ b/fs/afs/validation.c
@@ -365,9 +365,9 @@ static void afs_zap_data(struct afs_vnode *vnode)
* written back in a regular file and completely discard the pages in a
* directory or symlink */
if (S_ISREG(vnode->netfs.inode.i_mode))
- invalidate_remote_inode(&vnode->netfs.inode);
+ filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX);
else
- invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
+ filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX);
}
/*
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 74402d95a884..e959640694c2 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -29,43 +29,39 @@ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsign
/*
* Find a key to use for the writeback. We cached the keys used to author the
- * writes on the vnode. *_wbk will contain the last writeback key used or NULL
- * and we need to start from there if it's set.
+ * writes on the vnode. wreq->netfs_priv2 will contain the last writeback key
+ * record used or NULL and we need to start from there if it's set.
+ * wreq->netfs_priv will be set to the key itself or NULL.
*/
-static int afs_get_writeback_key(struct afs_vnode *vnode,
- struct afs_wb_key **_wbk)
+static void afs_get_writeback_key(struct netfs_io_request *wreq)
{
- struct afs_wb_key *wbk = NULL;
- struct list_head *p;
- int ret = -ENOKEY, ret2;
+ struct afs_wb_key *wbk, *old = wreq->netfs_priv2;
+ struct afs_vnode *vnode = AFS_FS_I(wreq->inode);
+
+ key_put(wreq->netfs_priv);
+ wreq->netfs_priv = NULL;
+ wreq->netfs_priv2 = NULL;
spin_lock(&vnode->wb_lock);
- if (*_wbk)
- p = (*_wbk)->vnode_link.next;
+ if (old)
+ wbk = list_next_entry(old, vnode_link);
else
- p = vnode->wb_keys.next;
+ wbk = list_first_entry(&vnode->wb_keys, struct afs_wb_key, vnode_link);
- while (p != &vnode->wb_keys) {
- wbk = list_entry(p, struct afs_wb_key, vnode_link);
+ list_for_each_entry_from(wbk, &vnode->wb_keys, vnode_link) {
_debug("wbk %u", key_serial(wbk->key));
- ret2 = key_validate(wbk->key);
- if (ret2 == 0) {
+ if (key_validate(wbk->key) == 0) {
refcount_inc(&wbk->usage);
+ wreq->netfs_priv = key_get(wbk->key);
+ wreq->netfs_priv2 = wbk;
_debug("USE WB KEY %u", key_serial(wbk->key));
break;
}
-
- wbk = NULL;
- if (ret == -ENOKEY)
- ret = ret2;
- p = p->next;
}
spin_unlock(&vnode->wb_lock);
- if (*_wbk)
- afs_put_wb_key(*_wbk);
- *_wbk = wbk;
- return 0;
+
+ afs_put_wb_key(old);
}
static void afs_store_data_success(struct afs_operation *op)
@@ -75,8 +71,7 @@ static void afs_store_data_success(struct afs_operation *op)
op->ctime = op->file[0].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[0]);
if (!afs_op_error(op)) {
- if (!op->store.laundering)
- afs_pages_written_back(vnode, op->store.pos, op->store.size);
+ afs_pages_written_back(vnode, op->store.pos, op->store.size);
afs_stat_v(vnode, n_stores);
atomic_long_add(op->store.size, &afs_v2net(vnode)->n_store_bytes);
}
@@ -89,113 +84,125 @@ static const struct afs_operation_ops afs_store_data_operation = {
};
/*
- * write to a file
+ * Prepare a subrequest to write to the server. This sets the max_len
+ * parameter.
+ */
+void afs_prepare_write(struct netfs_io_subrequest *subreq)
+{
+ //if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags))
+ // subreq->max_len = 512 * 1024;
+ //else
+ subreq->max_len = 256 * 1024 * 1024;
+}
+
+/*
+ * Issue a subrequest to write to the server.
*/
-static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos,
- bool laundering)
+static void afs_issue_write_worker(struct work_struct *work)
{
+ struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work);
+ struct netfs_io_request *wreq = subreq->rreq;
struct afs_operation *op;
- struct afs_wb_key *wbk = NULL;
- loff_t size = iov_iter_count(iter);
+ struct afs_vnode *vnode = AFS_FS_I(wreq->inode);
+ unsigned long long pos = subreq->start + subreq->transferred;
+ size_t len = subreq->len - subreq->transferred;
int ret = -ENOKEY;
- _enter("%s{%llx:%llu.%u},%llx,%llx",
+ _enter("R=%x[%x],%s{%llx:%llu.%u},%llx,%zx",
+ wreq->debug_id, subreq->debug_index,
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
vnode->fid.unique,
- size, pos);
+ pos, len);
- ret = afs_get_writeback_key(vnode, &wbk);
- if (ret) {
- _leave(" = %d [no keys]", ret);
- return ret;
- }
+#if 0 // Error injection
+ if (subreq->debug_index == 3)
+ return netfs_write_subrequest_terminated(subreq, -ENOANO, false);
- op = afs_alloc_operation(wbk->key, vnode->volume);
- if (IS_ERR(op)) {
- afs_put_wb_key(wbk);
- return -ENOMEM;
+ if (!test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) {
+ set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
}
+#endif
+
+ op = afs_alloc_operation(wreq->netfs_priv, vnode->volume);
+ if (IS_ERR(op))
+ return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
afs_op_set_vnode(op, 0, vnode);
- op->file[0].dv_delta = 1;
+ op->file[0].dv_delta = 1;
op->file[0].modification = true;
- op->store.pos = pos;
- op->store.size = size;
- op->store.laundering = laundering;
- op->flags |= AFS_OPERATION_UNINTR;
- op->ops = &afs_store_data_operation;
+ op->store.pos = pos;
+ op->store.size = len;
+ op->flags |= AFS_OPERATION_UNINTR;
+ op->ops = &afs_store_data_operation;
-try_next_key:
afs_begin_vnode_operation(op);
- op->store.write_iter = iter;
- op->store.i_size = max(pos + size, vnode->netfs.remote_i_size);
- op->mtime = inode_get_mtime(&vnode->netfs.inode);
+ op->store.write_iter = &subreq->io_iter;
+ op->store.i_size = umax(pos + len, vnode->netfs.remote_i_size);
+ op->mtime = inode_get_mtime(&vnode->netfs.inode);
afs_wait_for_operation(op);
-
- switch (afs_op_error(op)) {
+ ret = afs_put_operation(op);
+ switch (ret) {
case -EACCES:
case -EPERM:
case -ENOKEY:
case -EKEYEXPIRED:
case -EKEYREJECTED:
case -EKEYREVOKED:
- _debug("next");
-
- ret = afs_get_writeback_key(vnode, &wbk);
- if (ret == 0) {
- key_put(op->key);
- op->key = key_get(wbk->key);
- goto try_next_key;
- }
+ /* If there are more keys we can try, use the retry algorithm
+ * to rotate the keys.
+ */
+ if (wreq->netfs_priv2)
+ set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
break;
}
- afs_put_wb_key(wbk);
- _leave(" = %d", afs_op_error(op));
- return afs_put_operation(op);
+ netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, false);
}
-static void afs_upload_to_server(struct netfs_io_subrequest *subreq)
+void afs_issue_write(struct netfs_io_subrequest *subreq)
{
- struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
- ssize_t ret;
-
- _enter("%x[%x],%zx",
- subreq->rreq->debug_id, subreq->debug_index, subreq->io_iter.count);
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
- ret = afs_store_data(vnode, &subreq->io_iter, subreq->start,
- subreq->rreq->origin == NETFS_LAUNDER_WRITE);
- netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len,
- false);
+ subreq->work.func = afs_issue_write_worker;
+ if (!queue_work(system_unbound_wq, &subreq->work))
+ WARN_ON_ONCE(1);
}
-static void afs_upload_to_server_worker(struct work_struct *work)
+/*
+ * Writeback calls this when it finds a folio that needs uploading. This isn't
+ * called if writeback only has copy-to-cache to deal with.
+ */
+void afs_begin_writeback(struct netfs_io_request *wreq)
{
- struct netfs_io_subrequest *subreq =
- container_of(work, struct netfs_io_subrequest, work);
-
- afs_upload_to_server(subreq);
+ afs_get_writeback_key(wreq);
+ wreq->io_streams[0].avail = true;
}
/*
- * Set up write requests for a writeback slice. We need to add a write request
- * for each write we want to make.
+ * Prepare to retry the writes in request. Use this to try rotating the
+ * available writeback keys.
*/
-void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len)
+void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream)
{
- struct netfs_io_subrequest *subreq;
-
- _enter("%x,%llx-%llx", wreq->debug_id, start, start + len);
+ struct netfs_io_subrequest *subreq =
+ list_first_entry(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
- subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER,
- start, len, afs_upload_to_server_worker);
- if (subreq)
- netfs_queue_write_request(subreq);
+ switch (subreq->error) {
+ case -EACCES:
+ case -EPERM:
+ case -ENOKEY:
+ case -EKEYEXPIRED:
+ case -EKEYREJECTED:
+ case -EKEYREVOKED:
+ afs_get_writeback_key(wreq);
+ if (!wreq->netfs_priv)
+ stream->failed = true;
+ break;
+ }
}
/*
diff --git a/fs/aio.c b/fs/aio.c
index 0f4f531c9780..6ed5507cd330 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -122,7 +122,7 @@ struct kioctx {
unsigned long mmap_base;
unsigned long mmap_size;
- struct page **ring_pages;
+ struct folio **ring_folios;
long nr_pages;
struct rcu_work free_rwork; /* see free_ioctx() */
@@ -160,7 +160,7 @@ struct kioctx {
spinlock_t completion_lock;
} ____cacheline_aligned_in_smp;
- struct page *internal_pages[AIO_RING_PAGES];
+ struct folio *internal_folios[AIO_RING_PAGES];
struct file *aio_ring_file;
unsigned id;
@@ -334,19 +334,20 @@ static void aio_free_ring(struct kioctx *ctx)
put_aio_ring_file(ctx);
for (i = 0; i < ctx->nr_pages; i++) {
- struct page *page;
- pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
- page_count(ctx->ring_pages[i]));
- page = ctx->ring_pages[i];
- if (!page)
+ struct folio *folio = ctx->ring_folios[i];
+
+ if (!folio)
continue;
- ctx->ring_pages[i] = NULL;
- put_page(page);
+
+ pr_debug("pid(%d) [%d] folio->count=%d\n", current->pid, i,
+ folio_ref_count(folio));
+ ctx->ring_folios[i] = NULL;
+ folio_put(folio);
}
- if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
- kfree(ctx->ring_pages);
- ctx->ring_pages = NULL;
+ if (ctx->ring_folios && ctx->ring_folios != ctx->internal_folios) {
+ kfree(ctx->ring_folios);
+ ctx->ring_folios = NULL;
}
}
@@ -441,7 +442,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
idx = src->index;
if (idx < (pgoff_t)ctx->nr_pages) {
/* Make sure the old folio hasn't already been changed */
- if (ctx->ring_pages[idx] != &src->page)
+ if (ctx->ring_folios[idx] != src)
rc = -EAGAIN;
} else
rc = -EINVAL;
@@ -465,8 +466,8 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
*/
spin_lock_irqsave(&ctx->completion_lock, flags);
folio_migrate_copy(dst, src);
- BUG_ON(ctx->ring_pages[idx] != &src->page);
- ctx->ring_pages[idx] = &dst->page;
+ BUG_ON(ctx->ring_folios[idx] != src);
+ ctx->ring_folios[idx] = dst;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
/* The old folio is no longer accessible. */
@@ -516,28 +517,30 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
/ sizeof(struct io_event);
- ctx->ring_pages = ctx->internal_pages;
+ ctx->ring_folios = ctx->internal_folios;
if (nr_pages > AIO_RING_PAGES) {
- ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
- GFP_KERNEL);
- if (!ctx->ring_pages) {
+ ctx->ring_folios = kcalloc(nr_pages, sizeof(struct folio *),
+ GFP_KERNEL);
+ if (!ctx->ring_folios) {
put_aio_ring_file(ctx);
return -ENOMEM;
}
}
for (i = 0; i < nr_pages; i++) {
- struct page *page;
- page = find_or_create_page(file->f_mapping,
- i, GFP_USER | __GFP_ZERO);
- if (!page)
+ struct folio *folio;
+
+ folio = __filemap_get_folio(file->f_mapping, i,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ GFP_USER | __GFP_ZERO);
+ if (IS_ERR(folio))
break;
- pr_debug("pid(%d) page[%d]->count=%d\n",
- current->pid, i, page_count(page));
- SetPageUptodate(page);
- unlock_page(page);
- ctx->ring_pages[i] = page;
+ pr_debug("pid(%d) [%d] folio->count=%d\n", current->pid, i,
+ folio_ref_count(folio));
+ folio_end_read(folio, true);
+
+ ctx->ring_folios[i] = folio;
}
ctx->nr_pages = i;
@@ -570,7 +573,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
ctx->user_id = ctx->mmap_base;
ctx->nr_events = nr_events; /* trusted copy */
- ring = page_address(ctx->ring_pages[0]);
+ ring = folio_address(ctx->ring_folios[0]);
ring->nr = nr_events; /* user copy */
ring->id = ~0U;
ring->head = ring->tail = 0;
@@ -578,7 +581,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
ring->compat_features = AIO_RING_COMPAT_FEATURES;
ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
ring->header_length = sizeof(struct aio_ring);
- flush_dcache_page(ctx->ring_pages[0]);
+ flush_dcache_folio(ctx->ring_folios[0]);
return 0;
}
@@ -689,9 +692,9 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
/* While kioctx setup is in progress,
* we are protected from page migration
- * changes ring_pages by ->ring_lock.
+ * changes ring_folios by ->ring_lock.
*/
- ring = page_address(ctx->ring_pages[0]);
+ ring = folio_address(ctx->ring_folios[0]);
ring->id = ctx->id;
return 0;
}
@@ -1033,7 +1036,7 @@ static void user_refill_reqs_available(struct kioctx *ctx)
* against ctx->completed_events below will make sure we do the
* safe/right thing.
*/
- ring = page_address(ctx->ring_pages[0]);
+ ring = folio_address(ctx->ring_folios[0]);
head = ring->head;
refill_reqs_available(ctx, head, ctx->tail);
@@ -1145,12 +1148,12 @@ static void aio_complete(struct aio_kiocb *iocb)
if (++tail >= ctx->nr_events)
tail = 0;
- ev_page = page_address(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+ ev_page = folio_address(ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE]);
event = ev_page + pos % AIO_EVENTS_PER_PAGE;
*event = iocb->ki_res;
- flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+ flush_dcache_folio(ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE]);
pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
(void __user *)(unsigned long)iocb->ki_res.obj,
@@ -1163,10 +1166,10 @@ static void aio_complete(struct aio_kiocb *iocb)
ctx->tail = tail;
- ring = page_address(ctx->ring_pages[0]);
+ ring = folio_address(ctx->ring_folios[0]);
head = ring->head;
ring->tail = tail;
- flush_dcache_page(ctx->ring_pages[0]);
+ flush_dcache_folio(ctx->ring_folios[0]);
ctx->completed_events++;
if (ctx->completed_events > 1)
@@ -1238,8 +1241,8 @@ static long aio_read_events_ring(struct kioctx *ctx,
sched_annotate_sleep();
mutex_lock(&ctx->ring_lock);
- /* Access to ->ring_pages here is protected by ctx->ring_lock. */
- ring = page_address(ctx->ring_pages[0]);
+ /* Access to ->ring_folios here is protected by ctx->ring_lock. */
+ ring = folio_address(ctx->ring_folios[0]);
head = ring->head;
tail = ring->tail;
@@ -1260,20 +1263,20 @@ static long aio_read_events_ring(struct kioctx *ctx,
while (ret < nr) {
long avail;
struct io_event *ev;
- struct page *page;
+ struct folio *folio;
avail = (head <= tail ? tail : ctx->nr_events) - head;
if (head == tail)
break;
pos = head + AIO_EVENTS_OFFSET;
- page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
+ folio = ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE];
pos %= AIO_EVENTS_PER_PAGE;
avail = min(avail, nr - ret);
avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);
- ev = page_address(page);
+ ev = folio_address(folio);
copy_ret = copy_to_user(event + ret, ev + pos,
sizeof(*ev) * avail);
@@ -1287,9 +1290,9 @@ static long aio_read_events_ring(struct kioctx *ctx,
head %= ctx->nr_events;
}
- ring = page_address(ctx->ring_pages[0]);
+ ring = folio_address(ctx->ring_folios[0]);
ring->head = head;
- flush_dcache_page(ctx->ring_pages[0]);
+ flush_dcache_folio(ctx->ring_folios[0]);
pr_debug("%li h%u t%u\n", ret, head, tail);
out:
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 0496cb5b6eab..42bd1cb7c9cd 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -149,6 +149,38 @@ struct file *anon_inode_getfile(const char *name,
EXPORT_SYMBOL_GPL(anon_inode_getfile);
/**
+ * anon_inode_getfile_fmode - creates a new file instance by hooking it up to an
+ * anonymous inode, and a dentry that describe the "class"
+ * of the file
+ *
+ * @name: [in] name of the "class" of the new file
+ * @fops: [in] file operations for the new file
+ * @priv: [in] private data for the new file (will be file's private_data)
+ * @flags: [in] flags
+ * @f_mode: [in] fmode
+ *
+ * Creates a new file by hooking it on a single inode. This is useful for files
+ * that do not need to have a full-fledged inode in order to operate correctly.
+ * All the files created with anon_inode_getfile() will share a single inode,
+ * hence saving memory and avoiding code duplication for the file/inode/dentry
+ * setup. Allows setting the fmode. Returns the newly created file* or an error
+ * pointer.
+ */
+struct file *anon_inode_getfile_fmode(const char *name,
+ const struct file_operations *fops,
+ void *priv, int flags, fmode_t f_mode)
+{
+ struct file *file;
+
+ file = __anon_inode_getfile(name, fops, priv, flags, NULL, false);
+ if (!IS_ERR(file))
+ file->f_mode |= f_mode;
+
+ return file;
+}
+EXPORT_SYMBOL_GPL(anon_inode_getfile_fmode);
+
+/**
* anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new
* !S_PRIVATE anon inode rather than reuse the
* singleton anon inode and calls the
@@ -271,6 +303,7 @@ int anon_inode_create_getfd(const char *name, const struct file_operations *fops
return __anon_inode_getfd(name, fops, priv, flags, context_inode, true);
}
+
static int __init anon_inode_init(void)
{
anon_inode_mnt = kern_mount(&anon_inode_fs_type);
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 3640f417cce1..5c180fdc3efb 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -281,7 +281,6 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
- struct bkey_s_c_xattr xattr;
struct posix_acl *acl = NULL;
struct bkey_s_c k;
int ret;
@@ -290,28 +289,27 @@ retry:
ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
&hash, inode_inum(inode), &search, 0);
- if (ret) {
- if (!bch2_err_matches(ret, ENOENT))
- acl = ERR_PTR(ret);
- goto out;
- }
+ if (ret)
+ goto err;
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
- if (ret) {
- acl = ERR_PTR(ret);
- goto out;
- }
+ if (ret)
+ goto err;
- xattr = bkey_s_c_to_xattr(k);
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
+ le16_to_cpu(xattr.v->x_val_len));
+ ret = PTR_ERR_OR_ZERO(acl);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
- if (!IS_ERR(acl))
+ if (ret)
+ acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;
+
+ if (!IS_ERR_OR_NULL(acl))
set_cached_acl(&inode->v, type, acl);
-out:
- if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
- goto retry;
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 4ff56fa4d539..534ba2b02bd6 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -244,10 +244,10 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
int ret = 0;
- bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err,
+ bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err,
alloc_v4_val_size_bad,
"bad val size (%u > %zu)",
- alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
+ alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k));
bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 052b2fac25d6..2790e516383d 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -126,13 +126,17 @@ static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_
return pos;
}
-static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a)
{
- unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+ return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
BCH_ALLOC_V4_U64s_V0) +
BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
(sizeof(struct bch_backpointer) / sizeof(u64));
+}
+static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+{
+ unsigned ret = alloc_v4_u64s_noerror(a);
BUG_ON(ret > U8_MAX - BKEY_U64s);
return ret;
}
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 114328acde72..af7a71de1bdf 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -49,13 +49,15 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
if (!bch2_dev_exists2(c, bp.k->p.inode))
return 0;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bp.k->p.inode);
struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
int ret = 0;
- bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
+ bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size ||
+ !bpos_eq(bp.k->p, bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset)),
c, err,
- backpointer_pos_wrong,
- "backpointer at wrong pos");
+ backpointer_bucket_offset_wrong,
+ "backpointer bucket_offset wrong");
fsck_err:
return ret;
}
@@ -468,7 +470,7 @@ found:
goto err;
}
- bio = bio_alloc(ca->disk_sb.bdev, 1, REQ_OP_READ, GFP_KERNEL);
+ bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL);
bio->bi_iter.bi_sector = p.ptr.offset;
bch2_bio_map(bio, data_buf, bytes);
ret = submit_bio_wait(bio);
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index da012ca7daee..c1b274eadda1 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -45,6 +45,15 @@ static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c,
return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
}
+static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca,
+ struct bpos bucket,
+ u64 bucket_offset)
+{
+ return POS(bucket.inode,
+ (bucket_to_sector(ca, bucket.offset) <<
+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+}
+
/*
* Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
*/
@@ -53,14 +62,8 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
u64 bucket_offset)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
- struct bpos ret;
-
- ret = POS(bucket.inode,
- (bucket_to_sector(ca, bucket.offset) <<
- MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
-
+ struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset);
EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret)));
-
return ret;
}
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index a31a5f706929..91c3c1fef233 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -709,6 +709,8 @@ struct btree_trans_buf {
x(stripe_delete) \
x(reflink) \
x(fallocate) \
+ x(fsync) \
+ x(dio_write) \
x(discard) \
x(discard_fast) \
x(invalidate) \
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 63102992d955..2e8b1a489c20 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -578,7 +578,8 @@ struct bch_member {
__le64 nbuckets; /* device size */
__le16 first_bucket; /* index of first bucket used */
__le16 bucket_size; /* sectors */
- __le32 pad;
+ __u8 btree_bitmap_shift;
+ __u8 pad[3];
__le64 last_mount; /* time_t */
__le64 flags;
@@ -587,8 +588,15 @@ struct bch_member {
__le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
__le64 errors_reset_time;
__le64 seq;
+ __le64 btree_allocated_bitmap;
};
+/*
+ * This limit comes from the bucket_gens array - it's a single allocation, and
+ * kernel allocation are limited to INT_MAX
+ */
+#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64)
+
#define BCH_MEMBER_V1_BYTES 56
LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
@@ -876,7 +884,8 @@ struct bch_sb_field_downgrade {
x(rebalance_work, BCH_VERSION(1, 3)) \
x(member_seq, BCH_VERSION(1, 4)) \
x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
- x(btree_subvolume_children, BCH_VERSION(1, 6))
+ x(btree_subvolume_children, BCH_VERSION(1, 6)) \
+ x(mi_btree_bitmap, BCH_VERSION(1, 7))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -894,6 +903,8 @@ unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_re
#define BCH_SB_SECTOR 8
#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */
+#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */
+
struct bch_sb_layout {
__uuid_t magic; /* bcachefs superblock UUID */
__u8 layout_type;
@@ -1314,7 +1325,7 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(write_buffer_keys, 11) \
x(datetime, 12)
-enum {
+enum bch_jset_entry_type {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
BCH_JSET_ENTRY_TYPES()
#undef x
@@ -1360,7 +1371,7 @@ struct jset_entry_blacklist_v2 {
x(inodes, 1) \
x(key_version, 2)
-enum {
+enum bch_fs_usage_type {
#define x(f, nr) BCH_FS_USAGE_##f = nr,
BCH_FS_USAGE_TYPES()
#undef x
@@ -1501,7 +1512,8 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_stripe)) \
x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \
BIT_ULL(KEY_TYPE_reflink_v)| \
- BIT_ULL(KEY_TYPE_indirect_inline_data)) \
+ BIT_ULL(KEY_TYPE_indirect_inline_data)| \
+ BIT_ULL(KEY_TYPE_error)) \
x(subvolumes, 8, 0, \
BIT_ULL(KEY_TYPE_subvolume)) \
x(snapshots, 9, 0, \
@@ -1535,6 +1547,20 @@ enum btree_id {
BTREE_ID_NR
};
+static inline bool btree_id_is_alloc(enum btree_id id)
+{
+ switch (id) {
+ case BTREE_ID_alloc:
+ case BTREE_ID_backpointers:
+ case BTREE_ID_need_discard:
+ case BTREE_ID_freespace:
+ case BTREE_ID_bucket_gens:
+ return true;
+ default:
+ return false;
+ }
+}
+
#define BTREE_MAX_DEPTH 4U
/* Btree nodes */
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index cf23ff47bed8..3a45d128f608 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -314,6 +314,12 @@ static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
}
+static inline bool bkeyp_u64s_valid(const struct bkey_format *f,
+ const struct bkey_packed *k)
+{
+ return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s);
+}
+
static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
const struct bkey_packed *k)
{
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 5e52684764eb..a275a9e8e341 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -171,11 +171,15 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
if (type >= BKEY_TYPE_NR)
return 0;
- bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) &&
+ bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
+ (type == BKEY_TYPE_btree || (flags & BKEY_INVALID_COMMIT)) &&
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
bkey_invalid_type_for_btree,
"invalid key type for btree %s (%s)",
- bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]);
+ bch2_btree_node_type_str(type),
+ k.k->type < KEY_TYPE_MAX
+ ? bch2_bkey_types[k.k->type]
+ : "(unknown)");
if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
bkey_fsck_err_on(k.k->size == 0, c, err,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 84474324dba9..02c70e813fac 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -709,9 +709,31 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- u32 seq;
- BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
+ if (unlikely(level >= BTREE_MAX_DEPTH)) {
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
+ level, BTREE_MAX_DEPTH);
+ return ERR_PTR(ret);
+ }
+
+ if (unlikely(!bkey_is_btree_ptr(&k->k))) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
+ printbuf_exit(&buf);
+ return ERR_PTR(ret);
+ }
+
+ if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
+ printbuf_exit(&buf);
+ return ERR_PTR(ret);
+ }
+
/*
* Parent node must be locked, else we could read in a btree node that's
* been freed:
@@ -752,34 +774,26 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
}
set_btree_node_read_in_flight(b);
-
six_unlock_write(&b->c.lock);
- seq = six_lock_seq(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- /* Unlock before doing IO: */
- if (path && sync)
- bch2_trans_unlock_noassert(trans);
-
- bch2_btree_node_read(trans, b, sync);
+ if (path) {
+ u32 seq = six_lock_seq(&b->c.lock);
- if (!sync)
- return NULL;
+ /* Unlock before doing IO: */
+ six_unlock_intent(&b->c.lock);
+ bch2_trans_unlock_noassert(trans);
- if (path) {
- int ret = bch2_trans_relock(trans) ?:
- bch2_btree_path_relock_intent(trans, path);
- if (ret) {
- BUG_ON(!trans->restarted);
- return ERR_PTR(ret);
- }
- }
+ bch2_btree_node_read(trans, b, sync);
- if (!six_relock_type(&b->c.lock, lock_type, seq)) {
- BUG_ON(!path);
+ if (!sync)
+ return NULL;
- trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
+ if (!six_relock_type(&b->c.lock, lock_type, seq))
+ b = NULL;
+ } else {
+ bch2_btree_node_read(trans, b, sync);
+ if (lock_type == SIX_LOCK_read)
+ six_lock_downgrade(&b->c.lock);
}
return b;
@@ -1112,18 +1126,19 @@ int bch2_btree_node_prefetch(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
BUG_ON(path && !btree_node_locked(path, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
- b = btree_cache_find(bc, k);
+ struct btree *b = btree_cache_find(bc, k);
if (b)
return 0;
b = bch2_btree_node_fill(trans, path, k, btree_id,
level, SIX_LOCK_read, false);
- return PTR_ERR_OR_ZERO(b);
+ if (!IS_ERR_OR_NULL(b))
+ six_unlock_read(&b->c.lock);
+ return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b);
}
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
@@ -1148,6 +1163,8 @@ wait_on_io:
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
+ goto out;
if (btree_node_dirty(b)) {
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
@@ -1162,7 +1179,7 @@ wait_on_io:
btree_node_data_free(c, b);
bch2_btree_node_hash_remove(bc, b);
mutex_unlock(&bc->lock);
-
+out:
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 6280da1244b5..791470b0c654 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -368,11 +368,16 @@ again:
buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k);
cur = NULL;
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?:
- bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
if (ret)
break;
+
+ if (!btree_id_is_alloc(b->c.btree_id)) {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ break;
+ }
continue;
}
@@ -544,12 +549,12 @@ reconstruct_root:
bch2_btree_root_alloc_fake(c, i, 0);
} else {
bch2_btree_root_alloc_fake(c, i, 1);
+ bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
if (ret)
break;
}
- bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
reconstructed_root = true;
}
@@ -823,6 +828,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
struct bch_fs *c = trans->c;
struct bkey deleted = KEY(0, 0, 0);
struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
+ struct printbuf buf = PRINTBUF;
int ret = 0;
deleted.p = k->k->p;
@@ -843,11 +849,23 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
if (ret)
goto err;
+ if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, *k),
+ c, btree_bitmap_not_marked,
+ "btree ptr not marked in member info btree allocated bitmap\n %s",
+ (bch2_bkey_val_to_text(&buf, c, *k),
+ buf.buf))) {
+ mutex_lock(&c->sb_lock);
+ bch2_dev_btree_bitmap_mark(c, *k);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
ret = commit_do(trans, NULL, NULL, 0,
bch2_key_trigger(trans, btree_id, level, old,
unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
fsck_err:
err:
+ printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
}
@@ -1569,7 +1587,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
- return ret;
+ goto out;
if (!r->refcount)
new->k.type = KEY_TYPE_deleted;
@@ -1577,6 +1595,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
ret = bch2_trans_update(trans, iter, new, 0);
}
+out:
fsck_err:
printbuf_exit(&buf);
return ret;
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index d7de82ac3893..debb0edc3455 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -831,7 +831,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b,
(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
}
-static bool __bkey_valid(struct bch_fs *c, struct btree *b,
+static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
struct bset *i, struct bkey_packed *k)
{
if (bkey_p_next(k) > vstruct_last(i))
@@ -840,7 +840,7 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b,
if (k->format > KEY_FORMAT_CURRENT)
return false;
- if (k->u64s < bkeyp_key_u64s(&b->format, k))
+ if (!bkeyp_u64s_valid(&b->format, k))
return false;
struct printbuf buf = PRINTBUF;
@@ -884,11 +884,13 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
"invalid bkey format %u", k->format))
goto drop_this_key;
- if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k),
+ if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_bad_u64s,
- "k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k)))
+ "bad k->u64s %u (min %u max %zu)", k->u64s,
+ bkeyp_key_u64s(&b->format, k),
+ U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
goto drop_this_key;
if (!write)
@@ -947,13 +949,12 @@ drop_this_key:
* do
*/
- if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
+ if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
for (next_good_key = 1;
next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
next_good_key++)
- if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
+ if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
goto got_good_key;
-
}
/*
@@ -1339,7 +1340,9 @@ start:
rb->start_time);
bio_put(&rb->bio);
- if (saw_error && !btree_node_read_error(b)) {
+ if (saw_error &&
+ !btree_node_read_error(b) &&
+ c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->key.k.p);
bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 24772538e4cc..1c70836dd7cc 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -498,8 +498,13 @@ static inline void set_btree_iter_dontneed(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
- if (!trans->restarted)
- btree_iter_path(trans, iter)->preserve = false;
+ if (!iter->path || trans->restarted)
+ return;
+
+ struct btree_path *path = btree_iter_path(trans, iter);
+ path->preserve = false;
+ if (path->ref == 1)
+ path->should_be_locked = false;
}
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
@@ -642,7 +647,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *);
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8)
+ if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
return __bch2_btree_trans_too_many_iters(trans);
return 0;
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 5cbcbfe85235..1e8cf49a6935 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -130,12 +130,30 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree
return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}
+static void journal_iter_verify(struct journal_iter *iter)
+{
+ struct journal_keys *keys = iter->keys;
+ size_t gap_size = keys->size - keys->nr;
+
+ BUG_ON(iter->idx >= keys->gap &&
+ iter->idx < keys->gap + gap_size);
+
+ if (iter->idx < keys->size) {
+ struct journal_key *k = keys->data + iter->idx;
+
+ int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
+ cmp_int(k->level, iter->level);
+ BUG_ON(cmp < 0);
+ }
+}
+
static void journal_iters_fix(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
/* The key we just inserted is immediately before the gap: */
size_t gap_end = keys->gap + (keys->size - keys->nr);
- struct btree_and_journal_iter *iter;
+ struct journal_key *new_key = &keys->data[keys->gap - 1];
+ struct journal_iter *iter;
/*
* If an iterator points one after the key we just inserted, decrement
@@ -143,9 +161,14 @@ static void journal_iters_fix(struct bch_fs *c)
* decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
* handle that:
*/
- list_for_each_entry(iter, &c->journal_iters, journal.list)
- if (iter->journal.idx == gap_end)
- iter->journal.idx = keys->gap - 1;
+ list_for_each_entry(iter, &c->journal_iters, list) {
+ journal_iter_verify(iter);
+ if (iter->idx == gap_end &&
+ new_key->btree_id == iter->btree_id &&
+ new_key->level == iter->level)
+ iter->idx = keys->gap - 1;
+ journal_iter_verify(iter);
+ }
}
static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
@@ -192,7 +215,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
if (idx > keys->gap)
idx -= keys->size - keys->nr;
+ size_t old_gap = keys->gap;
+
if (keys->nr == keys->size) {
+ journal_iters_move_gap(c, old_gap, keys->size);
+ old_gap = keys->size;
+
struct journal_keys new_keys = {
.nr = keys->nr,
.size = max_t(size_t, keys->size, 8) * 2,
@@ -216,7 +244,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
keys->gap = keys->nr;
}
- journal_iters_move_gap(c, keys->gap, idx);
+ journal_iters_move_gap(c, old_gap, idx);
move_gap(keys, idx);
@@ -301,16 +329,21 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
- struct journal_key *k = iter->keys->data + iter->idx;
+ journal_iter_verify(iter);
+
+ while (iter->idx < iter->keys->size) {
+ struct journal_key *k = iter->keys->data + iter->idx;
+
+ int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
+ cmp_int(k->level, iter->level);
+ if (cmp > 0)
+ break;
+ BUG_ON(cmp);
- while (k < iter->keys->data + iter->keys->size &&
- k->btree_id == iter->btree_id &&
- k->level == iter->level) {
if (!k->overwritten)
return bkey_i_to_s_c(k->k);
bch2_journal_iter_advance(iter);
- k = iter->keys->data + iter->idx;
}
return bkey_s_c_null;
@@ -330,6 +363,8 @@ static void bch2_journal_iter_init(struct bch_fs *c,
iter->level = level;
iter->keys = &c->journal_keys;
iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
+
+ journal_iter_verify(iter);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@@ -434,10 +469,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
iter->trans = trans;
iter->b = b;
iter->node_iter = node_iter;
- bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
- INIT_LIST_HEAD(&iter->journal.list);
iter->pos = b->data->min_key;
iter->at_end = false;
+ INIT_LIST_HEAD(&iter->journal.list);
+
+ if (trans->journal_replay_not_finished) {
+ bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
+ if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
+ list_add(&iter->journal.list, &trans->c->journal_iters);
+ }
}
/*
@@ -452,9 +492,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
bch2_btree_node_iter_init_from_start(&node_iter, b);
__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
- if (trans->journal_replay_not_finished &&
- !test_bit(BCH_FS_may_go_rw, &trans->c->flags))
- list_add(&iter->journal.list, &trans->c->journal_iters);
}
/* sort and dedup all keys in the journal: */
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 581edcb0911b..7dafa1accec2 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -169,6 +169,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
} else {
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_pcpu);
+ bc->nr_freed_pcpu++;
mutex_unlock(&bc->lock);
}
}
@@ -245,6 +246,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
if (!list_empty(&bc->freed_pcpu)) {
ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
list_del_init(&ck->list);
+ bc->nr_freed_pcpu--;
}
mutex_unlock(&bc->lock);
}
@@ -659,7 +661,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
commit_flags |= BCH_WATERMARK_reclaim;
if (ck->journal.seq != journal_last_seq(j) ||
- j->watermark == BCH_WATERMARK_stripe)
+ !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags))
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
ret = bch2_btree_iter_traverse(&b_iter) ?:
@@ -840,8 +842,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
- scanned += bc->nr_freed_nonpcpu;
-
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@@ -855,11 +855,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
bc->nr_freed_nonpcpu--;
}
- if (scanned >= nr)
- goto out;
-
- scanned += bc->nr_freed_pcpu;
-
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@@ -873,9 +868,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
bc->nr_freed_pcpu--;
}
- if (scanned >= nr)
- goto out;
-
rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
if (bc->shrink_iter >= tbl->size)
@@ -891,12 +883,12 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
ck = container_of(pos, struct bkey_cached, hash);
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
goto next;
-
- if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- else if (bkey_cached_lock_for_evict(ck)) {
+ goto next;
+ } else if (bkey_cached_lock_for_evict(ck)) {
bkey_cached_evict(bc, ck);
bkey_cached_free(bc, ck);
}
@@ -914,7 +906,6 @@ next:
} while (scanned < nr && bc->shrink_iter != start);
rcu_read_unlock();
-out:
memalloc_nofs_restore(flags);
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
mutex_unlock(&bc->lock);
@@ -965,13 +956,15 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
}
#ifdef __KERNEL__
- for_each_possible_cpu(cpu) {
- struct btree_key_cache_freelist *f =
- per_cpu_ptr(bc->pcpu_freed, cpu);
-
- for (i = 0; i < f->nr; i++) {
- ck = f->objs[i];
- list_add(&ck->list, &items);
+ if (bc->pcpu_freed) {
+ for_each_possible_cpu(cpu) {
+ struct btree_key_cache_freelist *f =
+ per_cpu_ptr(bc->pcpu_freed, cpu);
+
+ for (i = 0; i < f->nr; i++) {
+ ck = f->objs[i];
+ list_add(&ck->list, &items);
+ }
}
}
#endif
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index b9b151e693ed..f2caf491957e 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -440,33 +440,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
- struct btree_path *linked;
- unsigned i, iter;
- int ret;
-
- /*
- * XXX BIG FAT NOTICE
- *
- * Drop all read locks before taking a write lock:
- *
- * This is a hack, because bch2_btree_node_lock_write_nofail() is a
- * hack - but by dropping read locks first, this should never fail, and
- * we only use this in code paths where whatever read locks we've
- * already taken are no longer needed:
- */
-
- trans_for_each_path(trans, linked, iter) {
- if (!linked->nodes_locked)
- continue;
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- if (btree_node_read_locked(linked, i)) {
- btree_node_unlock(trans, linked, i);
- btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
- }
- }
-
- ret = __btree_node_lock_write(trans, path, b, true);
+ int ret = __btree_node_lock_write(trans, path, b, true);
BUG_ON(ret);
}
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 3f33be7e5e5c..45cb8149d374 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -57,13 +57,14 @@ static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_n
bp->v.seq = cpu_to_le64(f->cookie);
bp->v.sectors_written = 0;
bp->v.flags = 0;
+ bp->v.sectors_written = cpu_to_le16(f->sectors_written);
bp->v.min_key = f->min_key;
SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
}
static bool found_btree_node_is_readable(struct btree_trans *trans,
- const struct found_btree_node *f)
+ struct found_btree_node *f)
{
struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;
@@ -71,8 +72,10 @@ static bool found_btree_node_is_readable(struct btree_trans *trans,
struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
bool ret = !IS_ERR_OR_NULL(b);
- if (ret)
+ if (ret) {
+ f->sectors_written = b->written;
six_unlock_read(&b->c.lock);
+ }
/*
* We might update this node's range; if that happens, we need the node
@@ -133,6 +136,19 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
if (le64_to_cpu(bn->magic) != bset_magic(c))
return;
+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
+ struct nonce nonce = btree_nonce(&bn->keys, 0);
+ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+ bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
+ }
+
+ if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
+ return;
+
+ if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
+ return;
+
rcu_read_lock();
struct found_btree_node n = {
.btree_id = BTREE_NODE_ID(bn),
@@ -192,8 +208,13 @@ static int read_btree_nodes_worker(void *p)
last_print = jiffies;
}
- try_read_btree_node(w->f, ca, bio, buf,
- bucket * ca->mi.bucket_size + bucket_offset);
+ u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
+
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
+ !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
+ continue;
+
+ try_read_btree_node(w->f, ca, bio, buf, sector);
}
err:
bio_put(bio);
@@ -213,6 +234,9 @@ static int read_btree_nodes(struct find_btree_nodes *f)
closure_init_stack(&cl);
for_each_online_member(c, ca) {
+ if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
+ continue;
+
struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
struct task_struct *t;
@@ -281,6 +305,8 @@ again:
start->max_key = bpos_predecessor(n->min_key);
start->range_updated = true;
+ } else if (n->level) {
+ n->overwritten = true;
} else {
struct printbuf buf = PRINTBUF;
@@ -290,7 +316,7 @@ again:
found_btree_node_to_text(&buf, c, n);
bch_err(c, "%s", buf.buf);
printbuf_exit(&buf);
- return -1;
+ return -BCH_ERR_fsck_repair_unimplemented;
}
}
@@ -436,6 +462,9 @@ bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos node_min, struct bpos node_max)
{
+ if (btree_id_is_alloc(btree))
+ return 0;
+
struct find_btree_nodes *f = &c->found_btree_nodes;
int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
index abb7b27d556a..5cfaeb5ac831 100644
--- a/fs/bcachefs/btree_node_scan_types.h
+++ b/fs/bcachefs/btree_node_scan_types.h
@@ -9,6 +9,7 @@ struct found_btree_node {
bool overwritten:1;
u8 btree_id;
u8 level;
+ unsigned sectors_written;
u32 seq;
u64 cookie;
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index aa9da4970740..bbec91e8e650 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -397,12 +397,13 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
struct bkey_cached *ck = (void *) path->l[0].b;
unsigned new_u64s;
struct bkey_i *new_k;
+ unsigned watermark = flags & BCH_WATERMARK_MASK;
EBUG_ON(path->level);
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bch2_btree_key_cache_must_wait(c) &&
- !(flags & BCH_TRANS_COMMIT_journal_reclaim))
+ if (watermark < BCH_WATERMARK_reclaim &&
+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bch2_btree_key_cache_must_wait(c))
return -BCH_ERR_btree_insert_need_journal_reclaim;
/*
@@ -499,9 +500,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
}
static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
- struct btree_insert_entry *btree_id_start)
+ unsigned btree_id_start)
{
- struct btree_insert_entry *i;
bool trans_trigger_run;
int ret, overwrite;
@@ -514,13 +514,13 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
do {
trans_trigger_run = false;
- for (i = btree_id_start;
- i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+ for (unsigned i = btree_id_start;
+ i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
i++) {
- if (i->btree_id != btree_id)
+ if (trans->updates[i].btree_id != btree_id)
continue;
- ret = run_one_trans_trigger(trans, i, overwrite);
+ ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
if (ret < 0)
return ret;
if (ret)
@@ -534,8 +534,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
- struct btree_insert_entry *btree_id_start = trans->updates;
- unsigned btree_id = 0;
+ unsigned btree_id = 0, btree_id_start = 0;
int ret = 0;
/*
@@ -549,8 +548,8 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
if (btree_id == BTREE_ID_alloc)
continue;
- while (btree_id_start < trans->updates + trans->nr_updates &&
- btree_id_start->btree_id < btree_id)
+ while (btree_id_start < trans->nr_updates &&
+ trans->updates[btree_id_start].btree_id < btree_id)
btree_id_start++;
ret = run_btree_triggers(trans, btree_id, btree_id_start);
@@ -558,11 +557,13 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
return ret;
}
- trans_for_each_update(trans, i) {
+ for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+ struct btree_insert_entry *i = trans->updates + idx;
+
if (i->btree_id > BTREE_ID_alloc)
break;
if (i->btree_id == BTREE_ID_alloc) {
- ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
+ ret = run_btree_triggers(trans, BTREE_ID_alloc, idx);
if (ret)
return ret;
break;
@@ -826,7 +827,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
struct bch_fs *c = trans->c;
int ret = 0, u64s_delta = 0;
- trans_for_each_update(trans, i) {
+ for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+ struct btree_insert_entry *i = trans->updates + idx;
if (i->cached)
continue;
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 9404d96c38f3..c69b233c41bb 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -321,9 +321,9 @@ struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
+ unsigned long btree_trans_barrier_seq;
u16 u64s;
bool valid;
- u32 btree_trans_barrier_seq;
struct bkey_cached_key key;
struct rhash_head hash;
@@ -364,7 +364,21 @@ struct btree_insert_entry {
unsigned long ip_allocated;
};
+/* Number of btree paths we preallocate, usually enough */
#define BTREE_ITER_INITIAL 64
+/*
+ * Lmiit for btree_trans_too_many_iters(); this is enough that almost all code
+ * paths should run inside this limit, and if they don't it usually indicates a
+ * bug (leaking/duplicated btree paths).
+ *
+ * exception: some fsck paths
+ *
+ * bugs with excessive path usage seem to have possibly been eliminated now, so
+ * we might consider eliminating this (and btree_trans_too_many_iter()) at some
+ * point.
+ */
+#define BTREE_ITER_NORMAL_LIMIT 256
+/* never exceed limit */
#define BTREE_ITER_MAX (1U << 10)
struct btree_trans_commit_hook;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 32397b99752f..b4efd8cc4d1a 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -21,14 +21,15 @@
#include "keylist.h"
#include "recovery_passes.h"
#include "replicas.h"
+#include "sb-members.h"
#include "super-io.h"
#include "trace.h"
#include <linux/random.h>
-const char * const bch2_btree_update_modes[] = {
+static const char * const bch2_btree_update_modes[] = {
#define x(t) #t,
- BCH_WATERMARKS()
+ BTREE_UPDATE_MODES()
#undef x
NULL
};
@@ -605,6 +606,26 @@ static void btree_update_add_key(struct btree_update *as,
bch2_keylist_push(keys);
}
+static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
+{
+ for_each_keylist_key(&as->new_keys, k)
+ if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
+ return false;
+ return true;
+}
+
+static void btree_update_new_nodes_mark_sb(struct btree_update *as)
+{
+ struct bch_fs *c = as->c;
+
+ mutex_lock(&c->sb_lock);
+ for_each_keylist_key(&as->new_keys, k)
+ bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
+
/*
* The transactional part of an interior btree node update, where we journal the
* update we did to the interior node and update alloc info:
@@ -662,6 +683,9 @@ static void btree_update_nodes_written(struct btree_update *as)
if (ret)
goto err;
+ if (!btree_update_new_nodes_marked_sb(as))
+ btree_update_new_nodes_mark_sb(as);
+
/*
* Wait for any in flight writes to finish before we free the old nodes
* on disk:
@@ -704,9 +728,13 @@ static void btree_update_nodes_written(struct btree_update *as)
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
"%s", bch2_err_str(ret));
err:
- if (as->b) {
-
- b = as->b;
+ /*
+ * We have to be careful because another thread might be getting ready
+ * to free as->b and calling btree_update_reparent() on us - we'll
+ * recheck under btree_update_lock below:
+ */
+ b = READ_ONCE(as->b);
+ if (b) {
btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
as->btree_id, b->c.level, b->key.k.p);
struct btree_path *path = trans->paths + path_idx;
@@ -850,15 +878,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
BUG_ON(as->mode != BTREE_UPDATE_none);
+ BUG_ON(as->update_level_end < b->c.level);
BUG_ON(!btree_node_dirty(b));
BUG_ON(!b->c.level);
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+
as->mode = BTREE_UPDATE_node;
as->b = b;
+ as->update_level_end = b->c.level;
set_btree_node_write_blocked(b);
list_add(&as->write_blocked_list, &b->write_blocked);
@@ -1100,7 +1130,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
- unsigned level, bool split, unsigned flags)
+ unsigned level_start, bool split, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_update *as;
@@ -1108,7 +1138,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes[2] = { 0, 0 };
- unsigned update_level = level;
+ unsigned level_end = level_start;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
int ret = 0;
u32 restart_count = trans->restart_count;
@@ -1123,34 +1153,30 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
- if (watermark < c->journal.watermark) {
- struct journal_res res = { 0 };
- unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK;
+ if (watermark < BCH_WATERMARK_reclaim &&
+ test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) {
+ if (flags & BCH_TRANS_COMMIT_journal_reclaim)
+ return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
- if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark < BCH_WATERMARK_reclaim)
- journal_flags |= JOURNAL_RES_GET_NONBLOCK;
-
- ret = drop_locks_do(trans,
- bch2_journal_res_get(&c->journal, &res, 1, journal_flags));
- if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ bch2_trans_unlock(trans);
+ wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags));
+ ret = bch2_trans_relock(trans);
if (ret)
return ERR_PTR(ret);
}
while (1) {
- nr_nodes[!!update_level] += 1 + split;
- update_level++;
+ nr_nodes[!!level_end] += 1 + split;
+ level_end++;
- ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
+ ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
if (ret)
return ERR_PTR(ret);
- if (!btree_path_node(path, update_level)) {
+ if (!btree_path_node(path, level_end)) {
/* Allocating new root? */
nr_nodes[1] += split;
- update_level = BTREE_MAX_DEPTH;
+ level_end = BTREE_MAX_DEPTH;
break;
}
@@ -1158,11 +1184,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* Always check for space for two keys, even if we won't have to
* split at prior level - it might have been a merge instead:
*/
- if (bch2_btree_node_insert_fits(path->l[update_level].b,
+ if (bch2_btree_node_insert_fits(path->l[level_end].b,
BKEY_BTREE_PTR_U64s_MAX * 2))
break;
- split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
+ split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
}
if (!down_read_trylock(&c->gc_lock)) {
@@ -1176,14 +1202,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
memset(as, 0, sizeof(*as));
closure_init(&as->cl, NULL);
- as->c = c;
- as->start_time = start_time;
- as->ip_started = _RET_IP_;
- as->mode = BTREE_UPDATE_none;
- as->watermark = watermark;
- as->took_gc_lock = true;
- as->btree_id = path->btree_id;
- as->update_level = update_level;
+ as->c = c;
+ as->start_time = start_time;
+ as->ip_started = _RET_IP_;
+ as->mode = BTREE_UPDATE_none;
+ as->watermark = watermark;
+ as->took_gc_lock = true;
+ as->btree_id = path->btree_id;
+ as->update_level_start = level_start;
+ as->update_level_end = level_end;
INIT_LIST_HEAD(&as->list);
INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
@@ -1277,23 +1304,29 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
bch2_recalc_btree_reserve(c);
}
-static void bch2_btree_set_root(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
+static int bch2_btree_set_root(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ bool nofail)
{
struct bch_fs *c = as->c;
- struct btree *old;
trace_and_count(c, btree_node_set_root, trans, b);
- old = btree_node_root(c, b);
+ struct btree *old = btree_node_root(c, b);
/*
* Ensure no one is using the old root while we switch to the
* new root:
*/
- bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+ if (nofail) {
+ bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+ } else {
+ int ret = bch2_btree_node_lock_write(trans, path, &old->c);
+ if (ret)
+ return ret;
+ }
bch2_btree_set_root_inmem(c, b);
@@ -1307,6 +1340,7 @@ static void bch2_btree_set_root(struct btree_update *as,
* depend on the new root would have to update the new root.
*/
bch2_btree_node_unlock_write(trans, path, old);
+ return 0;
}
/* Interior node updates: */
@@ -1373,12 +1407,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
}
static void
-__bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter node_iter,
- struct keylist *keys)
+bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct keylist *keys)
{
struct bkey_i *insert = bch2_keylist_front(keys);
struct bkey_packed *k;
@@ -1534,7 +1568,7 @@ static void btree_split_insert_keys(struct btree_update *as,
bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
- __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
+ bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
BUG_ON(bch2_btree_node_check_topology(trans, b));
}
@@ -1649,15 +1683,16 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (parent) {
/* Split a non root node */
ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
- if (ret)
- goto err;
} else if (n3) {
- bch2_btree_set_root(as, trans, trans->paths + path, n3);
+ ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
} else {
/* Root filled up but didn't need to be split */
- bch2_btree_set_root(as, trans, trans->paths + path, n1);
+ ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
}
+ if (ret)
+ goto err;
+
if (n3) {
bch2_btree_update_get_open_buckets(as, n3);
bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
@@ -1714,27 +1749,6 @@ err:
goto out;
}
-static void
-bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct keylist *keys)
-{
- struct btree_path *linked;
- unsigned i;
-
- __bch2_btree_insert_keys_interior(as, trans, path, b,
- path->l[b->c.level].iter, keys);
-
- btree_update_updated_node(as, b);
-
- trans_for_each_path_with_node(trans, b, linked, i)
- bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
-
- bch2_trans_verify_paths(trans);
-}
-
/**
* bch2_btree_insert_node - insert bkeys into a given btree node
*
@@ -1755,7 +1769,8 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
struct keylist *keys)
{
struct bch_fs *c = as->c;
- struct btree_path *path = trans->paths + path_idx;
+ struct btree_path *path = trans->paths + path_idx, *linked;
+ unsigned i;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
@@ -1784,7 +1799,13 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
return ret;
}
- bch2_btree_insert_keys_interior(as, trans, path, b, keys);
+ bch2_btree_insert_keys_interior(as, trans, path, b,
+ path->l[b->c.level].iter, keys);
+
+ trans_for_each_path_with_node(trans, b, linked, i)
+ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
+
+ bch2_trans_verify_paths(trans);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
@@ -1798,6 +1819,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_maybe_compact_whiteouts(c, b))
bch2_trans_node_reinit_iter(trans, b);
+ btree_update_updated_node(as, b);
bch2_btree_node_unlock_write(trans, path, b);
BUG_ON(bch2_btree_node_check_topology(trans, b));
@@ -1807,7 +1829,7 @@ split:
* We could attempt to avoid the transaction restart, by calling
* bch2_btree_path_upgrade() and allocating more nodes:
*/
- if (b->c.level >= as->update_level) {
+ if (b->c.level >= as->update_level_end) {
trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
}
@@ -1873,7 +1895,9 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
bch2_keylist_add(&as->parent_keys, &b->key);
btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
- bch2_btree_set_root(as, trans, path, n);
+ int ret = bch2_btree_set_root(as, trans, path, n, true);
+ BUG_ON(ret);
+
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
bch2_trans_node_add(trans, path, n);
@@ -1926,6 +1950,22 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
BUG_ON(!trans->paths[path].should_be_locked);
BUG_ON(!btree_node_locked(&trans->paths[path], level));
+ /*
+ * Work around a deadlock caused by the btree write buffer not doing
+ * merges and leaving tons of merges for us to do - we really don't need
+ * to be doing merges at all from the interior update path, and if the
+ * interior update path is generating too many new interior updates we
+ * deadlock:
+ */
+ if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
+ return 0;
+
+ if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
+ flags &= ~BCH_WATERMARK_MASK;
+ flags |= BCH_WATERMARK_btree;
+ flags |= BCH_TRANS_COMMIT_journal_reclaim;
+ }
+
b = trans->paths[path].l[level].b;
if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
@@ -2071,6 +2111,10 @@ err:
bch2_path_put(trans, new_path, true);
bch2_path_put(trans, sib_path, true);
bch2_trans_verify_locks(trans);
+ if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
+ ret = 0;
+ if (!ret)
+ ret = bch2_trans_relock(trans);
return ret;
err_free_update:
bch2_btree_node_free_never_used(as, trans, n);
@@ -2116,12 +2160,13 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
- if (ret)
- goto err;
} else {
- bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n);
+ ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
}
+ if (ret)
+ goto err;
+
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
@@ -2519,9 +2564,11 @@ void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned lev
static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
{
- prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+ prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
(void *) as->ip_started,
bch2_btree_id_str(as->btree_id),
+ as->update_level_start,
+ as->update_level_end,
bch2_watermarks[as->watermark],
bch2_btree_update_modes[as->mode],
as->nodes_written,
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 88dcf5a22a3b..c1a479ebaad1 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -57,7 +57,8 @@ struct btree_update {
unsigned took_gc_lock:1;
enum btree_id btree_id;
- unsigned update_level;
+ unsigned update_level_start;
+ unsigned update_level_end;
struct disk_reservation disk_res;
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index baf63e2fddb6..36a6f42aba5e 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -316,6 +316,16 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
bch2_btree_node_unlock_write(trans, path, path->l[0].b);
write_locked = false;
+
+ ret = lockrestart_do(trans,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_foreground_maybe_merge(trans, iter.path, 0,
+ BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc));
+ if (ret)
+ goto err;
}
}
@@ -382,10 +392,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
ret = commit_do(trans, NULL, NULL,
BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_journal_res|
- BCH_TRANS_COMMIT_journal_reclaim,
+ BCH_TRANS_COMMIT_no_journal_res ,
btree_write_buffered_insert(trans, i));
if (ret)
goto err;
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 941401a210f5..82f179258867 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -525,7 +525,6 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
"different types of data in same bucket: %s, %s",
bch2_data_type_str(g->data_type),
bch2_data_type_str(data_type))) {
- BUG();
ret = -EIO;
goto err;
}
@@ -629,7 +628,6 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
bch2_data_type_str(ptr_data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- BUG();
ret = -EIO;
goto err;
}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 00aaf4bb5139..f9af5adabe83 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -395,14 +395,6 @@ static inline const char *bch2_data_type_str(enum bch_data_type type)
: "(invalid data type)";
}
-static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type)
-{
- if (type < BCH_DATA_NR)
- prt_str(out, __bch2_data_types[type]);
- else
- prt_printf(out, "(invalid data type %u)", type);
-}
-
/* disk reservations: */
static inline void bch2_disk_reservation_put(struct bch_fs *c,
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index cbfa6459bdbc..4d14f19f5185 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -134,42 +134,38 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
struct fsck_thread {
struct thread_with_stdio thr;
struct bch_fs *c;
- char **devs;
- size_t nr_devs;
struct bch_opts opts;
};
static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
{
struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
- if (thr->devs)
- for (size_t i = 0; i < thr->nr_devs; i++)
- kfree(thr->devs[i]);
- kfree(thr->devs);
kfree(thr);
}
static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
- struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
-
- if (IS_ERR(c))
- return PTR_ERR(c);
+ struct bch_fs *c = thr->c;
- int ret = 0;
- if (test_bit(BCH_FS_errors_fixed, &c->flags))
- ret |= 1;
- if (test_bit(BCH_FS_error, &c->flags))
- ret |= 4;
+ int ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
+ return ret;
- bch2_fs_stop(c);
+ ret = bch2_fs_start(thr->c);
+ if (ret)
+ goto err;
- if (ret & 1)
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
- if (ret & 4)
+ ret |= 1;
+ }
+ if (test_bit(BCH_FS_error, &c->flags)) {
bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
-
+ ret |= 4;
+ }
+err:
+ bch2_fs_stop(c);
return ret;
}
@@ -182,7 +178,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
{
struct bch_ioctl_fsck_offline arg;
struct fsck_thread *thr = NULL;
- u64 *devs = NULL;
+ darray_str(devs) = {};
long ret = 0;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
@@ -194,29 +190,32 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) ||
- !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) ||
- !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) {
- ret = -ENOMEM;
- goto err;
- }
+ for (size_t i = 0; i < arg.nr_devs; i++) {
+ u64 dev_u64;
+ ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
+ if (ret)
+ goto err;
- thr->opts = bch2_opts_empty();
- thr->nr_devs = arg.nr_devs;
+ char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(dev_str);
+ if (ret)
+ goto err;
- if (copy_from_user(devs, &user_arg->devs[0],
- array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) {
- ret = -EINVAL;
- goto err;
+ ret = darray_push(&devs, dev_str);
+ if (ret) {
+ kfree(dev_str);
+ goto err;
+ }
}
- for (size_t i = 0; i < arg.nr_devs; i++) {
- thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX);
- ret = PTR_ERR_OR_ZERO(thr->devs[i]);
- if (ret)
- goto err;
+ thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+ if (!thr) {
+ ret = -ENOMEM;
+ goto err;
}
+ thr->opts = bch2_opts_empty();
+
if (arg.opts) {
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
@@ -230,15 +229,28 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
- ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops);
-err:
- if (ret < 0) {
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- pr_err("ret %s", bch2_err_str(ret));
- }
- kfree(devs);
+ /* We need request_key() to be called before we punt to kthread: */
+ opt_set(thr->opts, nostart, true);
+
+ bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
+
+ thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
+
+ if (!IS_ERR(thr->c) &&
+ thr->c->opts.errors == BCH_ON_ERROR_panic)
+ thr->c->opts.errors = BCH_ON_ERROR_ro;
+
+ ret = __bch2_run_thread_with_stdio(&thr->thr);
+out:
+ darray_for_each(devs, i)
+ kfree(*i);
+ darray_exit(&devs);
return ret;
+err:
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ pr_err("ret %s", bch2_err_str(ret));
+ goto out;
}
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 4701457f6381..088fd2e7bdf1 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -102,6 +102,7 @@ static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
int ret;
skcipher_request_set_sync_tfm(req, tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
@@ -429,15 +430,20 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
extent_nonce(version, crc_old), bio);
if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
- bch_err(c, "checksum error in %s() (memory corruption or bug?)\n"
- "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
- __func__,
- crc_old.csum.hi,
- crc_old.csum.lo,
- merged.hi,
- merged.lo,
- bch2_csum_types[crc_old.csum_type],
- bch2_csum_types[new_csum_type]);
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
+ "expected %0llx:%0llx got %0llx:%0llx (old type ",
+ __func__,
+ crc_old.csum.hi,
+ crc_old.csum.lo,
+ merged.hi,
+ merged.lo);
+ bch2_prt_csum_type(&buf, crc_old.csum_type);
+ prt_str(&buf, " new type ");
+ bch2_prt_csum_type(&buf, new_csum_type);
+ prt_str(&buf, ")");
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
return -EIO;
}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 1b8c2c1016dc..e40499fde9a4 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -61,11 +61,12 @@ static inline void bch2_csum_err_msg(struct printbuf *out,
struct bch_csum expected,
struct bch_csum got)
{
- prt_printf(out, "checksum error: got ");
+ prt_str(out, "checksum error, type ");
+ bch2_prt_csum_type(out, type);
+ prt_str(out, ": got ");
bch2_csum_to_text(out, type, got);
prt_str(out, " should be ");
bch2_csum_to_text(out, type, expected);
- prt_printf(out, " type %s", bch2_csum_types[type]);
}
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index 58c2eb45570f..607fd5e232c9 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -47,14 +47,6 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
}
-static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type)
-{
- if (type < BCH_COMPRESSION_TYPE_NR)
- prt_str(out, __bch2_compression_types[type]);
- else
- prt_printf(out, "(invalid compression type %u)", type);
-}
-
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 34731ee0217f..0022b51ce3c0 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -598,6 +598,8 @@ int bch2_data_update_init(struct btree_trans *trans,
i++;
}
+ unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+
/*
* If current extent durability is less than io_opts.data_replicas,
* we're not trying to rereplicate the extent up to data_replicas here -
@@ -607,7 +609,7 @@ int bch2_data_update_init(struct btree_trans *trans,
* rereplicate, currently, so that users don't get an unexpected -ENOSPC
*/
if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
- durability_have >= io_opts.data_replicas) {
+ !durability_required) {
m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
@@ -616,11 +618,18 @@ int bch2_data_update_init(struct btree_trans *trans,
goto done;
}
- m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
+ m->op.nr_replicas = min(durability_removing, durability_required) +
m->data_opts.extra_replicas;
- m->op.nr_replicas_required = m->op.nr_replicas;
- BUG_ON(!m->op.nr_replicas);
+ /*
+ * If device(s) were set to durability=0 after data was written to them
+ * we can end up with a duribilty=0 extent, and the normal algorithm
+ * that tries not to increase durability doesn't work:
+ */
+ if (!(durability_have + durability_removing))
+ m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
+
+ m->op.nr_replicas_required = m->op.nr_replicas;
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 208ce6f0fc43..cd99b7399414 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -13,6 +13,7 @@
#include "btree_iter.h"
#include "btree_locking.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "buckets.h"
#include "debug.h"
#include "error.h"
@@ -668,7 +669,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
i->size = size;
i->ret = 0;
- do {
+ while (1) {
err = flush_buf(i);
if (err)
return err;
@@ -676,9 +677,12 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
if (!i->size)
break;
+ if (done)
+ break;
+
done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
i->iter++;
- } while (!done);
+ }
if (i->buf.allocation_failure)
return -ENOMEM;
@@ -693,13 +697,45 @@ static const struct file_operations journal_pins_ops = {
.read = bch2_journal_pins_read,
};
+static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (!i->iter) {
+ bch2_btree_updates_to_text(&i->buf, c);
+ i->iter++;
+ }
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
+}
+
+static const struct file_operations btree_updates_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_btree_updates_read,
+};
+
static int btree_transaction_stats_open(struct inode *inode, struct file *file)
{
struct bch_fs *c = inode->i_private;
struct dump_iter *i;
i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
-
if (!i)
return -ENOMEM;
@@ -866,6 +902,20 @@ void bch2_fs_debug_exit(struct bch_fs *c)
debugfs_remove_recursive(c->fs_debug_dir);
}
+static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd)
+{
+ struct dentry *d;
+
+ d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir);
+
+ debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops);
+
+ debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops);
+
+ debugfs_create_file("bfloat-failed", 0400, d, bd,
+ &bfloat_failed_debug_ops);
+}
+
void bch2_fs_debug_init(struct bch_fs *c)
{
struct btree_debug *bd;
@@ -888,6 +938,9 @@ void bch2_fs_debug_init(struct bch_fs *c)
debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
c->btree_debug, &journal_pins_ops);
+ debugfs_create_file("btree_updates", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_updates_ops);
+
debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
c, &btree_transaction_stats_op);
@@ -902,21 +955,7 @@ void bch2_fs_debug_init(struct bch_fs *c)
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
- debugfs_create_file(bch2_btree_id_str(bd->id),
- 0400, c->btree_debug_dir, bd,
- &btree_debug_ops);
-
- snprintf(name, sizeof(name), "%s-formats",
- bch2_btree_id_str(bd->id));
-
- debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
- &btree_format_debug_ops);
-
- snprintf(name, sizeof(name), "%s-bfloat-failed",
- bch2_btree_id_str(bd->id));
-
- debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
- &bfloat_failed_debug_ops);
+ bch2_fs_debug_btree_init(c, bd);
}
}
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 082075244e16..556a217108d3 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -131,29 +131,33 @@ fsck_err:
void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
+ const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
+ struct bch_stripe s = {};
+
+ memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
+
+ unsigned nr_data = s.nr_blocks - s.nr_redundant;
+
+ prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
+ s.algorithm,
+ le16_to_cpu(s.sectors),
+ nr_data,
+ s.nr_redundant);
+ bch2_prt_csum_type(out, s.csum_type);
+ prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
+
+ for (unsigned i = 0; i < s.nr_blocks; i++) {
+ const struct bch_extent_ptr *ptr = sp->ptrs + i;
+
+ if ((void *) ptr >= bkey_val_end(k))
+ break;
+
+ bch2_extent_ptr_to_text(out, c, ptr);
- prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
- s->algorithm,
- le16_to_cpu(s->sectors),
- nr_data,
- s->nr_redundant,
- s->csum_type,
- 1U << s->csum_granularity_bits);
-
- for (i = 0; i < s->nr_blocks; i++) {
- const struct bch_extent_ptr *ptr = s->ptrs + i;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- u32 offset;
- u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
- prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
- if (i < nr_data)
- prt_printf(out, "#%u", stripe_blockcount_get(s, i));
- prt_printf(out, " gen %u", ptr->gen);
- if (ptr_stale(ca, ptr))
- prt_printf(out, " stale");
+ if (s.csum_type < BCH_CSUM_NR &&
+ i < nr_data &&
+ stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
+ prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
}
}
@@ -607,10 +611,8 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct printbuf err = PRINTBUF;
struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
- prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
- want.hi, want.lo,
- got.hi, got.lo,
- bch2_csum_types[v->csum_type]);
+ prt_str(&err, "stripe ");
+ bch2_csum_err_msg(&err, v->csum_type, want, got);
prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
bch_err_ratelimited(ca, "%s", err.buf);
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index f4369b02e805..f042616888b0 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -32,6 +32,8 @@ static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
unsigned dev, unsigned csum_idx)
{
+ EBUG_ON(s->csum_type >= BCH_CSUM_NR);
+
unsigned csum_bytes = bch_crc_bytes[s->csum_type];
return sizeof(struct bch_stripe) +
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 01a79fa3eacb..dbe35b80bc0b 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -175,6 +175,7 @@
x(EINVAL, block_size_too_small) \
x(EINVAL, bucket_size_too_small) \
x(EINVAL, device_size_too_small) \
+ x(EINVAL, device_size_too_big) \
x(EINVAL, device_not_a_member_of_filesystem) \
x(EINVAL, device_has_been_removed) \
x(EINVAL, device_splitbrain) \
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 0e3ca99fbd2d..1a331e539204 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -998,7 +998,9 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
prt_str(out, " cached");
if (ptr->unwritten)
prt_str(out, " unwritten");
- if (ca && ptr_stale(ca, ptr))
+ if (b >= ca->mi.first_bucket &&
+ b < ca->mi.nbuckets &&
+ ptr_stale(ca, ptr))
prt_printf(out, " stale");
}
}
@@ -1028,11 +1030,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ",
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
crc.compressed_size,
crc.uncompressed_size,
- crc.offset, crc.nonce,
- bch2_csum_types[crc.csum_type]);
+ crc.offset, crc.nonce);
+ bch2_prt_csum_type(out, crc.csum_type);
+ prt_str(out, " compress ");
bch2_prt_compression_type(out, crc.compression_type);
break;
}
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
index 4ce5e957a6e9..0f955c3c76a7 100644
--- a/fs/bcachefs/eytzinger.c
+++ b/fs/bcachefs/eytzinger.c
@@ -115,7 +115,7 @@ static void swap_bytes(void *a, void *b, size_t n)
struct wrapper {
cmp_func_t cmp;
- swap_func_t swap;
+ swap_func_t swap_func;
};
/*
@@ -125,7 +125,7 @@ struct wrapper {
static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
{
if (swap_func == SWAP_WRAPPER) {
- ((const struct wrapper *)priv)->swap(a, b, (int)size);
+ ((const struct wrapper *)priv)->swap_func(a, b, (int)size);
return;
}
@@ -174,7 +174,7 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size,
int i, c, r;
/* called from 'sort' without swap function, let's pick the default */
- if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
+ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
swap_func = NULL;
if (!swap_func) {
@@ -227,7 +227,7 @@ void eytzinger0_sort(void *base, size_t n, size_t size,
{
struct wrapper w = {
.cmp = cmp_func,
- .swap = swap_func,
+ .swap_func = swap_func,
};
return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index ee0e2df33322..24840aee335c 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -242,8 +242,8 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) = eytzinger0_next((_i), (_size)))
/* return greatest node <= @search, or -1 if not found */
-static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
- cmp_func_t cmp, const void *search)
+static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
{
unsigned i, n = 0;
@@ -256,18 +256,32 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
} while (n < nr);
if (n & 1) {
- /* @i was greater than @search, return previous node: */
+ /*
+ * @i was greater than @search, return previous node:
+ *
+ * if @i was leftmost/smallest element,
+ * eytzinger0_prev(eytzinger0_first())) returns -1, as expected
+ */
return eytzinger0_prev(i, nr);
} else {
return i;
}
}
-static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size,
- cmp_func_t cmp, const void *search)
+static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
{
ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
- return eytzinger0_next(idx, size);
+
+ /*
+ * if eytitzinger0_find_le() returned -1 - no element was <= search - we
+ * want to return the first element; next/prev identities mean this work
+ * as expected
+ *
+ * similarly if find_le() returns last element, we should return -1;
+ * identities mean this all works out:
+ */
+ return eytzinger0_next(idx, nr);
}
#define eytzinger0_find(base, nr, size, _cmp, search) \
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index f49e6c0f0f68..b889370a5088 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -387,6 +387,8 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
ret = dio->op.error ?: ((long) dio->written << 9);
bio_put(&dio->op.wbio.bio);
+ bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write);
+
/* inode->i_dio_count is our ref on inode and thus bch_fs */
inode_dio_end(&inode->v);
@@ -590,22 +592,25 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
prefetch(&inode->ei_inode);
prefetch((void *) &inode->ei_inode + 64);
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
+ return -EROFS;
+
inode_lock(&inode->v);
ret = generic_write_checks(req, iter);
if (unlikely(ret <= 0))
- goto err;
+ goto err_put_write_ref;
ret = file_remove_privs(file);
if (unlikely(ret))
- goto err;
+ goto err_put_write_ref;
ret = file_update_time(file);
if (unlikely(ret))
- goto err;
+ goto err_put_write_ref;
if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
- goto err;
+ goto err_put_write_ref;
inode_dio_begin(&inode->v);
bch2_pagecache_block_get(inode);
@@ -645,7 +650,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
}
ret = bch2_dio_write_loop(dio);
-err:
+out:
if (locked)
inode_unlock(&inode->v);
return ret;
@@ -653,7 +658,9 @@ err_put_bio:
bch2_pagecache_block_put(inode);
bio_put(bio);
inode_dio_end(&inode->v);
- goto err;
+err_put_write_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+ goto out;
}
void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 8c70123b6a0c..20b40477425f 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -174,18 +174,18 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
static int bch2_flush_inode(struct bch_fs *c,
struct bch_inode_info *inode)
{
- struct bch_inode_unpacked u;
- int ret;
-
if (c->opts.journal_flush_disabled)
return 0;
- ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u);
- if (ret)
- return ret;
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
+ return -EROFS;
- return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
- bch2_inode_flush_nocow_writes(c, inode);
+ struct bch_inode_unpacked u;
+ int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?:
+ bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
+ bch2_inode_flush_nocow_writes(c, inode);
+ bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
+ return ret;
}
int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index b5ea9fa1259d..65b04b3c2679 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -188,7 +188,8 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
BUG_ON(!old);
if (unlikely(old != inode)) {
- discard_new_inode(&inode->v);
+ __destroy_inode(&inode->v);
+ kmem_cache_free(bch2_inode_cache, inode);
inode = old;
} else {
mutex_lock(&c->vfs_inodes_lock);
@@ -225,8 +226,10 @@ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
if (unlikely(!inode)) {
int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
- if (ret && inode)
- discard_new_inode(&inode->v);
+ if (ret && inode) {
+ __destroy_inode(&inode->v);
+ kmem_cache_free(bch2_inode_cache, inode);
+ }
if (ret)
return ERR_PTR(ret);
}
@@ -841,6 +844,9 @@ static int bch2_getattr(struct mnt_idmap *idmap,
stat->blksize = block_bytes(c);
stat->blocks = inode->v.i_blocks;
+ stat->subvol = inode->ei_subvol;
+ stat->result_mask |= STATX_SUBVOL;
+
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
@@ -961,7 +967,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
- struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
unsigned offset_into_extent, sectors;
bool have_extent = false;
u32 snapshot;
@@ -971,6 +976,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
if (ret)
return ret;
+ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
if (start + len < start)
return -EINVAL;
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index ca4a066e9a54..0f95d7fb5ec0 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -606,7 +606,7 @@ int bch2_trigger_inode(struct btree_trans *trans,
struct bkey_s new,
unsigned flags)
{
- s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
+ s64 nr = (s64) bkey_is_inode(new.k) - (s64) bkey_is_inode(old.k);
if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
if (nr) {
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index f137252bccc5..40d7df7607df 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -199,9 +199,6 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
u64 new_i_size,
s64 i_sectors_delta)
{
- struct btree_iter iter;
- struct bkey_i *k;
- struct bkey_i_inode_v3 *inode;
/*
* Crazy performance optimization:
* Every extent update needs to also update the inode: the inode trigger
@@ -214,25 +211,36 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
* lost, but that's fine.
*/
unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
- int ret;
- k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
SPOS(0,
extent_iter->pos.inode,
extent_iter->snapshot),
BTREE_ITER_CACHED);
- ret = PTR_ERR_OR_ZERO(k);
+ int ret = bkey_err(k);
if (unlikely(ret))
return ret;
- if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
- k = bch2_inode_to_v3(trans, k);
- ret = PTR_ERR_OR_ZERO(k);
+ /*
+ * varint_decode_fast(), in the inode .invalid method, reads up to 7
+ * bytes past the end of the buffer:
+ */
+ struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
+ ret = PTR_ERR_OR_ZERO(k_mut);
+ if (unlikely(ret))
+ goto err;
+
+ bkey_reassemble(k_mut, k);
+
+ if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
+ k_mut = bch2_inode_to_v3(trans, k_mut);
+ ret = PTR_ERR_OR_ZERO(k_mut);
if (unlikely(ret))
goto err;
}
- inode = bkey_i_to_inode_v3(k);
+ struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);
if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
new_i_size > le64_to_cpu(inode->v.bi_size)) {
@@ -1505,6 +1513,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
unsigned sectors;
int ret;
+ memset(&op->failed, 0, sizeof(op->failed));
+
op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
op->flags |= BCH_WRITE_DONE;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 9c9a25dbd613..a8b08e76d0d0 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -706,6 +706,12 @@ recheck_need_open:
spin_unlock(&j->lock);
+ /*
+ * We're called from bch2_journal_flush_seq() -> wait_event();
+ * but this might block. We won't usually block, so we won't
+ * livelock:
+ */
+ sched_annotate_sleep();
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
if (ret)
return ret;
@@ -870,6 +876,8 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
{
struct journal_buf *ret = NULL;
+ /* We're inside wait_event(), but using mutex_lock(: */
+ sched_annotate_sleep();
mutex_lock(&j->buf_lock);
spin_lock(&j->lock);
max_seq = min(max_seq, journal_cur_seq(j));
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 725fcf46f631..eb1f9d6f5a19 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -247,7 +247,7 @@ static void journal_entry_err_msg(struct printbuf *out,
if (entry) {
prt_str(out, " type=");
- prt_str(out, bch2_jset_entry_types[entry->type]);
+ bch2_prt_jset_entry_type(out, entry->type);
}
if (!jset) {
@@ -403,7 +403,8 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
jset_entry_for_each_key(entry, k) {
if (!first) {
prt_newline(out);
- prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ bch2_prt_jset_entry_type(out, entry->type);
+ prt_str(out, ": ");
}
prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
@@ -563,9 +564,9 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
- prt_printf(out, "type=%s v=%llu",
- bch2_fs_usage_types[u->entry.btree_id],
- le64_to_cpu(u->v));
+ prt_str(out, "type=");
+ bch2_prt_fs_usage_type(out, u->entry.btree_id);
+ prt_printf(out, " v=%llu", le64_to_cpu(u->v));
}
static int journal_entry_data_usage_validate(struct bch_fs *c,
@@ -827,11 +828,11 @@ int bch2_journal_entry_validate(struct bch_fs *c,
void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
+ bch2_prt_jset_entry_type(out, entry->type);
+
if (entry->type < BCH_JSET_ENTRY_NR) {
- prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ prt_str(out, ": ");
bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
- } else {
- prt_printf(out, "(unknown type %u)", entry->type);
}
}
@@ -1722,7 +1723,7 @@ static void journal_write_endio(struct bio *bio)
percpu_ref_put(&ca->io_ref);
}
-static CLOSURE_CALLBACK(do_journal_write)
+static CLOSURE_CALLBACK(journal_write_submit)
{
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
@@ -1767,6 +1768,44 @@ static CLOSURE_CALLBACK(do_journal_write)
continue_at(cl, journal_write_done, j->wq);
}
+static CLOSURE_CALLBACK(journal_write_preflush)
+{
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
+ spin_lock(&j->lock);
+ closure_wait(&j->async_wait, cl);
+ spin_unlock(&j->lock);
+
+ continue_at(cl, journal_write_preflush, j->wq);
+ return;
+ }
+
+ if (w->separate_flush) {
+ for_each_rw_member(c, ca) {
+ percpu_ref_get(&ca->io_ref);
+
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = &ja->bio[w->idx]->bio;
+ bio_reset(bio, ca->disk_sb.bdev,
+ REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ closure_bio_submit(bio, cl);
+ }
+
+ continue_at(cl, journal_write_submit, j->wq);
+ } else {
+ /*
+ * no need to punt to another work item if we're not waiting on
+ * preflushes
+ */
+ journal_write_submit(&cl->work);
+ }
+}
+
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -2032,23 +2071,9 @@ CLOSURE_CALLBACK(bch2_journal_write)
goto err;
if (!JSET_NO_FLUSH(w->data))
- closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq));
-
- if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
- for_each_rw_member(c, ca) {
- percpu_ref_get(&ca->io_ref);
-
- struct journal_device *ja = &ca->journal;
- struct bio *bio = &ja->bio[w->idx]->bio;
- bio_reset(bio, ca->disk_sb.bdev,
- REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- closure_bio_submit(bio, cl);
- }
- }
-
- continue_at(cl, do_journal_write, j->wq);
+ continue_at(cl, journal_write_preflush, j->wq);
+ else
+ continue_at(cl, journal_write_submit, j->wq);
return;
no_io:
continue_at(cl, journal_write_done, j->wq);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index ab811c0dad26..04a577848b01 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -67,6 +67,8 @@ void bch2_journal_set_watermark(struct journal *j)
track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
trace_and_count(c, journal_full, c);
+ mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin);
+
swap(watermark, j->watermark);
if (watermark > j->watermark)
journal_wake(j);
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 8c053cb64ca5..b5161b5d76a0 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -134,6 +134,7 @@ enum journal_flags {
JOURNAL_STARTED,
JOURNAL_MAY_SKIP_FLUSH,
JOURNAL_NEED_FLUSH_WRITE,
+ JOURNAL_SPACE_LOW,
};
/* Reasons we may fail to get a journal reservation: */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index bf68ea49447b..4d94b7742dbb 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -968,24 +968,30 @@ static bool migrate_btree_pred(struct bch_fs *c, void *arg,
return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
+/*
+ * Ancient versions of bcachefs produced packed formats which could represent
+ * keys that the in memory format cannot represent; this checks for those
+ * formats so we can get rid of them.
+ */
static bool bformat_needs_redo(struct bkey_format *f)
{
- unsigned i;
-
- for (i = 0; i < f->nr_fields; i++) {
+ for (unsigned i = 0; i < f->nr_fields; i++) {
+ unsigned f_bits = f->bits_per_field[i];
unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
u64 field_offset = le64_to_cpu(f->field_offset[i]);
- if (f->bits_per_field[i] > unpacked_bits)
+ if (f_bits > unpacked_bits)
return true;
- if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
+ if ((f_bits == unpacked_bits) && field_offset)
return true;
- if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
- unpacked_mask) <
- field_offset)
+ u64 f_mask = f_bits
+ ? ~((~0ULL << (f_bits - 1)) << 1)
+ : 0;
+
+ if (((field_offset + f_mask) & unpacked_mask) < field_offset)
return true;
}
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index e1800c4119b5..bb068fd72465 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -43,7 +43,7 @@ const char * const __bch2_btree_ids[] = {
NULL
};
-const char * const bch2_csum_types[] = {
+static const char * const __bch2_csum_types[] = {
BCH_CSUM_TYPES()
NULL
};
@@ -53,7 +53,7 @@ const char * const bch2_csum_opts[] = {
NULL
};
-const char * const __bch2_compression_types[] = {
+static const char * const __bch2_compression_types[] = {
BCH_COMPRESSION_TYPES()
NULL
};
@@ -83,18 +83,39 @@ const char * const bch2_member_states[] = {
NULL
};
-const char * const bch2_jset_entry_types[] = {
+static const char * const __bch2_jset_entry_types[] = {
BCH_JSET_ENTRY_TYPES()
NULL
};
-const char * const bch2_fs_usage_types[] = {
+static const char * const __bch2_fs_usage_types[] = {
BCH_FS_USAGE_TYPES()
NULL
};
#undef x
+static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
+ unsigned nr, const char *type, unsigned idx)
+{
+ if (idx < nr)
+ prt_str(out, opts[idx]);
+ else
+ prt_printf(out, "(unknown %s %u)", type, idx);
+}
+
+#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \
+void bch2_prt_##name(struct printbuf *out, type t) \
+{ \
+ prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\
+}
+
+PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type);
+PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type);
+PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type);
+PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
+PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
+
static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
struct printbuf *err)
{
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 1ac4135cca1c..84e452835a17 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -16,18 +16,20 @@ extern const char * const bch2_version_upgrade_opts[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
-extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
-extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
-extern const char * const bch2_jset_entry_types[];
-extern const char * const bch2_fs_usage_types[];
extern const char * const bch2_d_types[];
+void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type);
+void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type);
+void bch2_prt_data_type(struct printbuf *, enum bch_data_type);
+void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
+void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
+
static inline const char *bch2_d_type_str(unsigned d_type)
{
return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index e68b34eab90a..556da0738106 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -560,13 +560,11 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bch_inode_unpacked u;
struct bch_snapshot_tree s_t;
- int ret;
+ u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot);
- ret = bch2_snapshot_tree_lookup(trans,
- bch2_snapshot_tree(c, k.k->p.snapshot), &s_t);
+ int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "%s: snapshot tree %u not found", __func__,
- snapshot_t(c, k.k->p.snapshot)->tree);
+ "%s: snapshot tree %u not found", __func__, tree);
if (ret)
return ret;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index b76c16152579..8091d0686029 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -47,20 +47,6 @@ void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
}
}
-static bool btree_id_is_alloc(enum btree_id id)
-{
- switch (id) {
- case BTREE_ID_alloc:
- case BTREE_ID_backpointers:
- case BTREE_ID_need_discard:
- case BTREE_ID_freespace:
- case BTREE_ID_bucket_gens:
- return true;
- default:
- return false;
- }
-}
-
/* for -o reconstruct_alloc: */
static void bch2_reconstruct_alloc(struct bch_fs *c)
{
@@ -263,7 +249,10 @@ int bch2_journal_replay(struct bch_fs *c)
struct journal_key *k = *kp;
- replay_now_at(j, k->journal_seq);
+ if (k->journal_seq)
+ replay_now_at(j, k->journal_seq);
+ else
+ replay_now_at(j, j->replay_journal_seq_end);
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
@@ -913,7 +902,8 @@ out:
bch2_journal_keys_put_initial(c);
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
}
- kfree(clean);
+ if (!IS_ERR(clean))
+ kfree(clean);
if (!ret &&
test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index cb501460d615..0cec0f7d9703 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -44,7 +44,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c)
set_bit(BCH_FS_may_go_rw, &c->flags);
- if (keys->nr || c->opts.fsck || !c->sb.clean)
+ if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit)
return bch2_fs_read_write_early(c);
return 0;
}
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index 5980ba2563fe..194e55b11137 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -29,6 +29,14 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle
for (entry = clean->start;
entry < (struct jset_entry *) vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
+ if (vstruct_end(entry) > vstruct_end(&clean->field)) {
+ bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu",
+ le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s),
+ (u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field));
+ bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
ret = bch2_journal_entry_validate(c, NULL, entry,
le16_to_cpu(c->disk_sb.sb->version),
BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
@@ -270,6 +278,17 @@ static int bch2_sb_clean_validate(struct bch_sb *sb,
return -BCH_ERR_invalid_sb_clean;
}
+ for (struct jset_entry *entry = clean->start;
+ entry != vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) {
+ prt_str(err, "entry type ");
+ bch2_prt_jset_entry_type(err, le16_to_cpu(entry->type));
+ prt_str(err, " overruns end of section");
+ return -BCH_ERR_invalid_sb_clean;
+ }
+ }
+
return 0;
}
@@ -287,6 +306,9 @@ static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
for (entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
+ if ((void *) vstruct_next(entry) > vstruct_end(&clean->field))
+ break;
+
if (entry->type == BCH_JSET_ENTRY_btree_keys &&
!entry->u64s)
continue;
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index d6f81179c3a2..a98ef940b7a3 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -51,7 +51,10 @@
BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \
x(btree_subvolume_children, \
BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \
- BCH_FSCK_ERR_subvol_children_not_set)
+ BCH_FSCK_ERR_subvol_children_not_set) \
+ x(mi_btree_bitmap, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_btree_bitmap_not_marked)
#define DOWNGRADE_TABLE()
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index d7d609131030..06c7a644f4a4 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -130,7 +130,7 @@
x(bucket_gens_nonzero_for_invalid_buckets, 122) \
x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \
x(need_discard_freespace_key_bad, 124) \
- x(backpointer_pos_wrong, 125) \
+ x(backpointer_bucket_offset_wrong, 125) \
x(backpointer_to_missing_device, 126) \
x(backpointer_to_missing_alloc, 127) \
x(backpointer_to_missing_ptr, 128) \
@@ -270,7 +270,9 @@
x(btree_ptr_v2_min_key_bad, 262) \
x(btree_root_unreadable_and_scan_found_nothing, 263) \
x(snapshot_node_missing, 264) \
- x(dup_backpointer_to_bad_csum_extent, 265)
+ x(dup_backpointer_to_bad_csum_extent, 265) \
+ x(btree_bitmap_not_marked, 266) \
+ x(sb_clean_entry_overrun, 267)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index eff5ce18c69c..44b3f0cb7b49 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_cache.h"
#include "disk_groups.h"
#include "opts.h"
#include "replicas.h"
@@ -123,9 +124,9 @@ static int validate_member(struct printbuf *err,
struct bch_sb *sb,
int i)
{
- if (le64_to_cpu(m.nbuckets) > LONG_MAX) {
- prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
- i, le64_to_cpu(m.nbuckets), LONG_MAX);
+ if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) {
+ prt_printf(err, "device %u: too many buckets (got %llu, max %u)",
+ i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX);
return -BCH_ERR_invalid_sb_members;
}
@@ -426,3 +427,55 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
+
+/*
+ * Per member "range has btree nodes" bitmap:
+ *
+ * This is so that if we ever have to run the btree node scan to repair we don't
+ * have to scan full devices:
+ */
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
+{
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr)
+ if (!bch2_dev_btree_bitmap_marked_sectors(bch_dev_bkey_exists(c, ptr->dev),
+ ptr->offset, btree_sectors(c)))
+ return false;
+ return true;
+}
+
+static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
+ u64 start, unsigned sectors)
+{
+ struct bch_member *m = __bch2_members_v2_get_mut(mi, dev);
+ u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap);
+
+ u64 end = start + sectors;
+
+ int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6);
+ if (resize > 0) {
+ u64 new_bitmap = 0;
+
+ for (unsigned i = 0; i < 64; i++)
+ if (bitmap & BIT_ULL(i))
+ new_bitmap |= BIT_ULL(i >> resize);
+ bitmap = new_bitmap;
+ m->btree_bitmap_shift += resize;
+ }
+
+ for (unsigned bit = start >> m->btree_bitmap_shift;
+ (u64) bit << m->btree_bitmap_shift < end;
+ bit++)
+ bitmap |= BIT_ULL(bit);
+
+ m->btree_allocated_bitmap = cpu_to_le64(bitmap);
+}
+
+void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr)
+ __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
+}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index be0a94183271..5bf27d30ca29 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_SB_MEMBERS_H
#include "darray.h"
+#include "bkey_types.h"
extern char * const bch2_member_error_strs[];
@@ -106,10 +107,10 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *
static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
{
+ rcu_read_lock();
if (ca)
percpu_ref_put(&ca->ref);
- rcu_read_lock();
if ((ca = __bch2_next_dev(c, ca, NULL)))
percpu_ref_get(&ca->ref);
rcu_read_unlock();
@@ -131,10 +132,10 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
struct bch_dev *ca,
unsigned state_mask)
{
+ rcu_read_lock();
if (ca)
percpu_ref_put(&ca->io_ref);
- rcu_read_lock();
while ((ca = __bch2_next_dev(c, ca, NULL)) &&
(!((1 << ca->mi.state) & state_mask) ||
!percpu_ref_tryget(&ca->io_ref)))
@@ -220,6 +221,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
: 1,
.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
.valid = bch2_member_exists(mi),
+ .btree_bitmap_shift = mi->btree_bitmap_shift,
+ .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
};
}
@@ -228,4 +231,22 @@ void bch2_sb_members_from_cpu(struct bch_fs *);
void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
void bch2_dev_errors_reset(struct bch_dev *);
+static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors)
+{
+ u64 end = start + sectors;
+
+ if (end > 64ULL << ca->mi.btree_bitmap_shift)
+ return false;
+
+ for (unsigned bit = start >> ca->mi.btree_bitmap_shift;
+ (u64) bit << ca->mi.btree_bitmap_shift < end;
+ bit++)
+ if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit)))
+ return false;
+ return true;
+}
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
+void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
+
#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 0e806f04f3d7..544322d5c251 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -125,6 +125,15 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances
return s->parent;
}
+static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
+{
+ const struct snapshot_t *s = __snapshot_t(t, id);
+ if (!s)
+ return false;
+
+ return test_bit(ancestor - id - 1, s->is_ancestor);
+}
+
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
bool ret;
@@ -140,13 +149,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
while (id && id < ancestor - IS_ANCESTOR_BITMAP)
id = get_ancestor_below(t, id, ancestor);
- if (id && id < ancestor) {
- ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);
+ ret = id && id < ancestor
+ ? test_ancestor_bitmap(t, id, ancestor)
+ : id == ancestor;
- EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
- } else {
- ret = id == ancestor;
- }
+ EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
out:
rcu_read_unlock();
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 5eee055ee272..bfdb15e7d778 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -232,7 +232,7 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
struct bch_sb_handle *dev_sb = &ca->disk_sb;
if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
- percpu_ref_put(&ca->ref);
+ percpu_ref_put(&ca->io_ref);
return NULL;
}
}
@@ -649,7 +649,7 @@ reread:
bytes = vstruct_bytes(sb->sb);
- if (bytes > 512 << sb->sb->layout.sb_max_size_bits) {
+ if (bytes > 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits)) {
prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
bytes, 512UL << sb->sb->layout.sb_max_size_bits);
return -BCH_ERR_invalid_sb_too_big;
@@ -700,8 +700,11 @@ retry:
return -ENOMEM;
sb->sb_name = kstrdup(path, GFP_KERNEL);
- if (!sb->sb_name)
- return -ENOMEM;
+ if (!sb->sb_name) {
+ ret = -ENOMEM;
+ prt_printf(&err, "error allocating memory for sb_name");
+ goto err;
+ }
#ifndef __KERNEL__
if (opt_get(*opts, direct_io) == false)
@@ -920,6 +923,7 @@ int bch2_write_super(struct bch_fs *c)
struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
+ DARRAY(struct bch_dev *) online_devices = {};
int ret = 0;
trace_and_count(c, write_super, c, _RET_IP_);
@@ -932,6 +936,15 @@ int bch2_write_super(struct bch_fs *c)
closure_init_stack(cl);
memset(&sb_written, 0, sizeof(sb_written));
+ for_each_online_member(c, ca) {
+ ret = darray_push(&online_devices, ca);
+ if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
+ percpu_ref_put(&ca->io_ref);
+ goto out;
+ }
+ percpu_ref_get(&ca->io_ref);
+ }
+
/* Make sure we're using the new magic numbers: */
c->disk_sb.sb->magic = BCHFS_MAGIC;
c->disk_sb.sb->layout.magic = BCHFS_MAGIC;
@@ -939,8 +952,8 @@ int bch2_write_super(struct bch_fs *c)
le64_add_cpu(&c->disk_sb.sb->seq, 1);
struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- for_each_online_member(c, ca)
- __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq;
+ darray_for_each(online_devices, ca)
+ __bch2_members_v2_get_mut(mi, (*ca)->dev_idx)->seq = c->disk_sb.sb->seq;
c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
if (test_bit(BCH_FS_error, &c->flags))
@@ -956,16 +969,15 @@ int bch2_write_super(struct bch_fs *c)
bch2_sb_errors_from_cpu(c);
bch2_sb_downgrade_update(c);
- for_each_online_member(c, ca)
- bch2_sb_from_fs(c, ca);
+ darray_for_each(online_devices, ca)
+ bch2_sb_from_fs(c, (*ca));
- for_each_online_member(c, ca) {
+ darray_for_each(online_devices, ca) {
printbuf_reset(&err);
- ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
+ ret = bch2_sb_validate(&(*ca)->disk_sb, &err, WRITE);
if (ret) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
- percpu_ref_put(&ca->io_ref);
goto out;
}
}
@@ -992,16 +1004,18 @@ int bch2_write_super(struct bch_fs *c)
return -BCH_ERR_sb_not_downgraded;
}
- for_each_online_member(c, ca) {
- __set_bit(ca->dev_idx, sb_written.d);
- ca->sb_write_error = 0;
+ darray_for_each(online_devices, ca) {
+ __set_bit((*ca)->dev_idx, sb_written.d);
+ (*ca)->sb_write_error = 0;
}
- for_each_online_member(c, ca)
- read_back_super(c, ca);
+ darray_for_each(online_devices, ca)
+ read_back_super(c, *ca);
closure_sync(cl);
- for_each_online_member(c, ca) {
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
+
if (ca->sb_write_error)
continue;
@@ -1028,17 +1042,20 @@ int bch2_write_super(struct bch_fs *c)
do {
wrote = false;
- for_each_online_member(c, ca)
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
if (!ca->sb_write_error &&
sb < ca->disk_sb.sb->layout.nr_superblocks) {
write_one_super(c, ca, sb);
wrote = true;
}
+ }
closure_sync(cl);
sb++;
} while (wrote);
- for_each_online_member(c, ca) {
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
if (ca->sb_write_error)
__clear_bit(ca->dev_idx, sb_written.d);
else
@@ -1074,6 +1091,9 @@ int bch2_write_super(struct bch_fs *c)
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
+ darray_for_each(online_devices, ca)
+ percpu_ref_put(&(*ca)->io_ref);
+ darray_exit(&online_devices);
printbuf_exit(&err);
return ret;
}
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index ed63018f21be..dddf57ec4511 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -288,8 +288,13 @@ static void __bch2_fs_read_only(struct bch_fs *c)
if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
!test_bit(BCH_FS_emergency_ro, &c->flags))
set_bit(BCH_FS_clean_shutdown, &c->flags);
+
bch2_fs_journal_stop(&c->journal);
+ bch_info(c, "%sshutdown complete, journal seq %llu",
+ test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
+ c->journal.seq_ondisk);
+
/*
* After stopping journal:
*/
@@ -539,6 +544,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
+ bch2_fs_allocator_background_exit(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);
@@ -1953,6 +1959,13 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
goto err;
}
+ if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
+ bch_err(ca, "New device size too big (%llu greater than max %u)",
+ nbuckets, BCH_MEMBER_NBUCKETS_MAX);
+ ret = -BCH_ERR_device_size_too_big;
+ goto err;
+ }
+
if (bch2_dev_is_online(ca) &&
get_capacity(ca->disk_sb.bdev->bd_disk) <
ca->mi.bucket_size * nbuckets) {
@@ -1998,13 +2011,9 @@ err:
/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, NULL)
- if (!strcmp(name, ca->name)) {
- rcu_read_unlock();
+ for_each_member_device(c, ca)
+ if (!strcmp(name, ca->name))
return ca;
- }
- rcu_read_unlock();
return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index ec784d975f66..11bcef170c2c 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -37,6 +37,8 @@ struct bch_member_cpu {
u8 durability;
u8 freespace_initialized;
u8 valid;
+ u8 btree_bitmap_shift;
+ u64 btree_allocated_bitmap;
};
#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index c86a93a8d8fc..5be92fe3f4ea 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -17,7 +17,6 @@
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_update.h"
-#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
@@ -26,6 +25,7 @@
#include "ec.h"
#include "inode.h"
#include "journal.h"
+#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "movinggc.h"
@@ -139,6 +139,7 @@ do { \
write_attribute(trigger_gc);
write_attribute(trigger_discards);
write_attribute(trigger_invalidates);
+write_attribute(trigger_journal_flush);
write_attribute(prune_cache);
write_attribute(btree_wakeup);
rw_attribute(btree_gc_periodic);
@@ -166,7 +167,6 @@ read_attribute(btree_write_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(journal_debug);
-read_attribute(btree_updates);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(stripes_heap);
@@ -415,9 +415,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_journal_debug)
bch2_journal_debug_to_text(out, &c->journal);
- if (attr == &sysfs_btree_updates)
- bch2_btree_updates_to_text(out, c);
-
if (attr == &sysfs_btree_cache)
bch2_btree_cache_to_text(out, c);
@@ -505,7 +502,7 @@ STORE(bch2_fs)
/* Debugging: */
- if (!test_bit(BCH_FS_rw, &c->flags))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
return -EROFS;
if (attr == &sysfs_prune_cache) {
@@ -538,6 +535,11 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_invalidates)
bch2_do_invalidates(c);
+ if (attr == &sysfs_trigger_journal_flush) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_meta(&c->journal);
+ }
+
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -558,6 +560,7 @@ STORE(bch2_fs)
size = ret;
}
#endif
+ bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
return size;
}
SYSFS_OPS(bch2_fs);
@@ -639,7 +642,6 @@ SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
&sysfs_flags,
&sysfs_journal_debug,
- &sysfs_btree_updates,
&sysfs_btree_cache,
&sysfs_btree_key_cache,
&sysfs_new_stripes,
@@ -657,6 +659,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_gc,
&sysfs_trigger_discards,
&sysfs_trigger_invalidates,
+ &sysfs_trigger_journal_flush,
&sysfs_prune_cache,
&sysfs_btree_wakeup,
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index b3fe9fc57747..bfec656f94c0 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
index 940db15d6a93..b1af7ac430f6 100644
--- a/fs/bcachefs/thread_with_file.c
+++ b/fs/bcachefs/thread_with_file.c
@@ -294,16 +294,27 @@ static int thread_with_stdio_fn(void *arg)
return 0;
}
-int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
- const struct thread_with_stdio_ops *ops)
+void bch2_thread_with_stdio_init(struct thread_with_stdio *thr,
+ const struct thread_with_stdio_ops *ops)
{
stdio_buf_init(&thr->stdio.input);
stdio_buf_init(&thr->stdio.output);
thr->ops = ops;
+}
+int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr)
+{
return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
}
+int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
+ const struct thread_with_stdio_ops *ops)
+{
+ bch2_thread_with_stdio_init(thr, ops);
+
+ return __bch2_run_thread_with_stdio(thr);
+}
+
int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
const struct thread_with_stdio_ops *ops)
{
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
index af54ea8f5b0f..1d63d14d7dca 100644
--- a/fs/bcachefs/thread_with_file.h
+++ b/fs/bcachefs/thread_with_file.h
@@ -63,6 +63,9 @@ struct thread_with_stdio {
const struct thread_with_stdio_ops *ops;
};
+void bch2_thread_with_stdio_init(struct thread_with_stdio *,
+ const struct thread_with_stdio_ops *);
+int __bch2_run_thread_with_stdio(struct thread_with_stdio *);
int bch2_run_thread_with_stdio(struct thread_with_stdio *,
const struct thread_with_stdio_ops *);
int bch2_run_thread_with_stdout(struct thread_with_stdio *,
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index b7e7c29278fc..5cf885b09986 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -788,6 +788,14 @@ static inline int copy_from_user_errcode(void *to, const void __user *from, unsi
#endif
+static inline void mod_bit(long nr, volatile unsigned long *addr, bool v)
+{
+ if (v)
+ set_bit(nr, addr);
+ else
+ clear_bit(nr, addr);
+}
+
static inline void __set_bit_le64(size_t bit, __le64 *addr)
{
addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
@@ -795,7 +803,7 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr)
static inline void __clear_bit_le64(size_t bit, __le64 *addr)
{
- addr[bit / 64] &= !cpu_to_le64(BIT_ULL(bit % 64));
+ addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64));
}
static inline bool test_bit_le64(size_t bit, __le64 *addr)
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index c1e6a5bbeeaf..58110c968667 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2776,20 +2776,14 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
size_t alloc_bytes;
alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
- data = kvmalloc(alloc_bytes, GFP_KERNEL);
+ data = kvzalloc(alloc_bytes, GFP_KERNEL);
if (!data)
return ERR_PTR(-ENOMEM);
- if (total_bytes >= sizeof(*data)) {
+ if (total_bytes >= sizeof(*data))
data->bytes_left = total_bytes - sizeof(*data);
- data->bytes_missing = 0;
- } else {
+ else
data->bytes_missing = sizeof(*data) - total_bytes;
- data->bytes_left = 0;
- }
-
- data->elem_cnt = 0;
- data->elem_missed = 0;
return data;
}
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index dd6f566a383f..121ab890bd05 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1133,6 +1133,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
if (ret)
return ret;
+ ret = btrfs_record_root_in_trans(trans, node->root);
+ if (ret)
+ return ret;
ret = btrfs_update_delayed_inode(trans, node->root, path, node);
return ret;
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index beedd6ed64d3..257d044bca91 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3464,6 +3464,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (root_id != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_ref generic_ref = { 0 };
+ /*
+ * Assert that the extent buffer is not cleared due to
+ * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer
+ * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for
+ * detail.
+ */
+ ASSERT(btrfs_header_bytenr(buf) != 0);
+
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
buf->start, buf->len, parent,
btrfs_header_owner(buf));
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 61594eaf1f89..2776112dbdf8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -681,31 +681,21 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
gfp_t extra_gfp)
{
+ const gfp_t gfp = GFP_NOFS | extra_gfp;
unsigned int allocated;
for (allocated = 0; allocated < nr_pages;) {
unsigned int last = allocated;
- allocated = alloc_pages_bulk_array(GFP_NOFS | extra_gfp,
- nr_pages, page_array);
-
- if (allocated == nr_pages)
- return 0;
-
- /*
- * During this iteration, no page could be allocated, even
- * though alloc_pages_bulk_array() falls back to alloc_page()
- * if it could not bulk-allocate. So we must be out of memory.
- */
- if (allocated == last) {
+ allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array);
+ if (unlikely(allocated == last)) {
+ /* No progress, fail and do cleanup. */
for (int i = 0; i < allocated; i++) {
__free_page(page_array[i]);
page_array[i] = NULL;
}
return -ENOMEM;
}
-
- memalloc_retry_wait(GFP_NOFS);
}
return 0;
}
@@ -4154,7 +4144,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
* The actual zeroout of the buffer will happen later in
* btree_csum_one_bio.
*/
- if (btrfs_is_zoned(fs_info)) {
+ if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags);
return;
}
@@ -4193,6 +4183,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
num_folios = num_extent_folios(eb);
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+ WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags));
if (!was_dirty) {
bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 445f7716f1e2..24a048210b15 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -817,7 +817,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->block_len = em->block_len;
split->orig_start = em->orig_start;
} else {
- const u64 diff = start + len - em->start;
+ const u64 diff = end - em->start;
split->block_len = split->len;
split->block_start += diff;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f9d76072398d..1640c46f2153 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3719,8 +3719,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
{
int ret;
- filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
- FMODE_CAN_ODIRECT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
ret = fsverity_file_open(inode, filp);
if (ret)
@@ -3850,6 +3849,7 @@ const struct file_operations btrfs_file_operations = {
.compat_ioctl = btrfs_compat_ioctl,
#endif
.remap_file_range = btrfs_remap_file_range,
+ .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 37701531eeb1..f454ba349683 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1145,13 +1145,13 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
0, *alloc_hint, &ins, 1, 1);
if (ret) {
/*
- * Here we used to try again by going back to non-compressed
- * path for ENOSPC. But we can't reserve space even for
- * compressed size, how could it work for uncompressed size
- * which requires larger size? So here we directly go error
- * path.
+ * We can't reserve contiguous space for the compressed size.
+ * Unlikely, but it's possible that we could have enough
+ * non-contiguous space for the uncompressed size instead. So
+ * fall back to uncompressed.
*/
- goto out_free;
+ submit_uncompressed_range(inode, async_extent, locked_page);
+ goto done;
}
/* Here we're doing allocation and writeback of the compressed pages */
@@ -1203,7 +1203,6 @@ done:
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
-out_free:
mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
extent_clear_unlock_delalloc(inode, start, end,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
@@ -2533,7 +2532,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
*/
if (bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(inode, len, false);
+ btrfs_delalloc_release_metadata(inode, len, true);
/* For sanity tests. */
if (btrfs_is_testing(fs_info))
@@ -4503,6 +4502,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
struct btrfs_trans_handle *trans;
struct btrfs_block_rsv block_rsv;
u64 root_flags;
+ u64 qgroup_reserved = 0;
int ret;
down_write(&fs_info->subvol_sem);
@@ -4547,12 +4547,20 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
if (ret)
goto out_undead;
+ qgroup_reserved = block_rsv.qgroup_rsv_reserved;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_release;
}
+ ret = btrfs_record_root_in_trans(trans, root);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
+ qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
@@ -4611,7 +4619,9 @@ out_end_trans:
ret = btrfs_end_transaction(trans);
inode->i_flags |= S_DEAD;
out_release:
- btrfs_subvolume_release_metadata(root, &block_rsv);
+ btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
+ if (qgroup_reserved)
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
out_undead:
if (ret) {
spin_lock(&dest->root_item_lock);
@@ -8779,6 +8789,9 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
generic_fillattr(idmap, request_mask, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
+ stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
+ stat->result_mask |= STATX_SUBVOL;
+
spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
inode_bytes = inode_get_bytes(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 294e31edec9d..0493272a7668 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -613,6 +613,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
int ret;
dev_t anon_dev;
u64 objectid;
+ u64 qgroup_reserved = 0;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
@@ -650,13 +651,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
trans_num_items, false);
if (ret)
goto out_new_inode_args;
+ qgroup_reserved = block_rsv.qgroup_rsv_reserved;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_subvolume_release_metadata(root, &block_rsv);
- goto out_new_inode_args;
+ goto out_release_rsv;
}
+ ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
+ if (ret)
+ goto out;
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
+ qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
/* Tree log can't currently deal with an inode which is a new root. */
@@ -767,9 +773,11 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
out:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
- btrfs_subvolume_release_metadata(root, &block_rsv);
-
btrfs_end_transaction(trans);
+out_release_rsv:
+ btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
+ if (qgroup_reserved)
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
out_new_inode_args:
btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
@@ -791,6 +799,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_pending_snapshot *pending_snapshot;
unsigned int trans_num_items;
struct btrfs_trans_handle *trans;
+ struct btrfs_block_rsv *block_rsv;
+ u64 qgroup_reserved = 0;
int ret;
/* We do not support snapshotting right now. */
@@ -827,19 +837,19 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
goto free_pending;
}
- btrfs_init_block_rsv(&pending_snapshot->block_rsv,
- BTRFS_BLOCK_RSV_TEMP);
+ block_rsv = &pending_snapshot->block_rsv;
+ btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
* 1 to add dir item
* 1 to add dir index
* 1 to update parent inode item
*/
trans_num_items = create_subvol_num_items(inherit) + 3;
- ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
- &pending_snapshot->block_rsv,
+ ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv,
trans_num_items, false);
if (ret)
goto free_pending;
+ qgroup_reserved = block_rsv->qgroup_rsv_reserved;
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
@@ -852,6 +862,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
ret = PTR_ERR(trans);
goto fail;
}
+ ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ goto fail;
+ }
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
+ qgroup_reserved = 0;
trans->pending_snapshot = pending_snapshot;
@@ -881,7 +898,9 @@ fail:
if (ret && pending_snapshot->snap)
pending_snapshot->snap->anon_dev = 0;
btrfs_put_root(pending_snapshot->snap);
- btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
+ btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL);
+ if (qgroup_reserved)
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
free_pending:
if (pending_snapshot->anon_dev)
free_anon_bdev(pending_snapshot->anon_dev);
@@ -3739,15 +3758,43 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
goto drop_write;
}
- down_write(&fs_info->subvol_sem);
-
switch (sa->cmd) {
case BTRFS_QUOTA_CTL_ENABLE:
case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA:
+ down_write(&fs_info->subvol_sem);
ret = btrfs_quota_enable(fs_info, sa);
+ up_write(&fs_info->subvol_sem);
break;
case BTRFS_QUOTA_CTL_DISABLE:
+ /*
+ * Lock the cleaner mutex to prevent races with concurrent
+ * relocation, because relocation may be building backrefs for
+ * blocks of the quota root while we are deleting the root. This
+ * is like dropping fs roots of deleted snapshots/subvolumes, we
+ * need the same protection.
+ *
+ * This also prevents races between concurrent tasks trying to
+ * disable quotas, because we will unlock and relock
+ * qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+ *
+ * We take this here because we have the dependency of
+ *
+ * inode_lock -> subvol_sem
+ *
+ * because of rename. With relocation we can prealloc extents,
+ * so that makes the dependency chain
+ *
+ * cleaner_mutex -> inode_lock -> subvol_sem
+ *
+ * so we must take the cleaner_mutex here before we take the
+ * subvol_sem. The deadlock can't actually happen, but this
+ * quiets lockdep.
+ */
+ mutex_lock(&fs_info->cleaner_mutex);
+ down_write(&fs_info->subvol_sem);
ret = btrfs_quota_disable(fs_info);
+ up_write(&fs_info->subvol_sem);
+ mutex_unlock(&fs_info->cleaner_mutex);
break;
default:
ret = -EINVAL;
@@ -3755,7 +3802,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
}
kfree(sa);
- up_write(&fs_info->subvol_sem);
drop_write:
mnt_drop_write_file(file);
return ret;
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index c96dd66fd0f7..210d9c82e2ae 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -7,7 +7,7 @@
#ifdef CONFIG_PRINTK
-#define STATE_STRING_PREFACE ": state "
+#define STATE_STRING_PREFACE " state "
#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT + 1)
/*
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b749ba45da2b..c2a42bcde98e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -1188,6 +1188,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
ordered->disk_bytenr += len;
ordered->num_bytes -= len;
ordered->disk_num_bytes -= len;
+ ordered->ram_bytes -= len;
if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
ASSERT(ordered->bytes_left == 0);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5f90f0605b12..40e5f7f2fcb7 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1342,16 +1342,10 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
lockdep_assert_held_write(&fs_info->subvol_sem);
/*
- * Lock the cleaner mutex to prevent races with concurrent relocation,
- * because relocation may be building backrefs for blocks of the quota
- * root while we are deleting the root. This is like dropping fs roots
- * of deleted snapshots/subvolumes, we need the same protection.
- *
- * This also prevents races between concurrent tasks trying to disable
- * quotas, because we will unlock and relock qgroup_ioctl_lock across
- * BTRFS_FS_QUOTA_ENABLED changes.
+ * Relocation will mess with backrefs, so make sure we have the
+ * cleaner_mutex held to protect us from relocate.
*/
- mutex_lock(&fs_info->cleaner_mutex);
+ lockdep_assert_held(&fs_info->cleaner_mutex);
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
@@ -1373,9 +1367,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
btrfs_qgroup_wait_for_completion(fs_info, false);
+ /*
+ * We have nothing held here and no trans handle, just return the error
+ * if there is one.
+ */
ret = flush_reservations(fs_info);
if (ret)
- goto out_unlock_cleaner;
+ return ret;
/*
* 1 For the root item
@@ -1439,9 +1437,6 @@ out:
btrfs_end_transaction(trans);
else if (trans)
ret = btrfs_commit_transaction(trans);
-out_unlock_cleaner:
- mutex_unlock(&fs_info->cleaner_mutex);
-
return ret;
}
@@ -3050,6 +3045,8 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_inherit *inherit,
size_t size)
{
+ if (!btrfs_qgroup_enabled(fs_info))
+ return 0;
if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP)
return -EOPNOTSUPP;
if (size < sizeof(*inherit) || size > PAGE_SIZE)
@@ -4495,6 +4492,8 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
BTRFS_QGROUP_RSV_META_PREALLOC);
trace_qgroup_meta_convert(root, num_bytes);
qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
+ if (!sb_rdonly(fs_info->sb))
+ add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
}
/*
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 4bb538a372ce..7007f9e0c972 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -548,13 +548,3 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
}
return ret;
}
-
-void btrfs_subvolume_release_metadata(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- u64 qgroup_to_release;
-
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
- btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
-}
diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h
index 6f929cf3bd49..8f5739e732b9 100644
--- a/fs/btrfs/root-tree.h
+++ b/fs/btrfs/root-tree.h
@@ -18,8 +18,6 @@ struct btrfs_trans_handle;
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
-void btrfs_subvolume_release_metadata(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv);
int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
u64 ref_id, u64 dirid, u64 sequence,
const struct fscrypt_str *name);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index fa25004ab04e..4b22cfe9a98c 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1012,6 +1012,7 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
struct btrfs_fs_info *fs_info = sctx->fs_info;
int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
stripe->bg->length);
+ unsigned long repaired;
int mirror;
int i;
@@ -1078,16 +1079,15 @@ out:
* Submit the repaired sectors. For zoned case, we cannot do repair
* in-place, but queue the bg to be relocated.
*/
- if (btrfs_is_zoned(fs_info)) {
- if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+ bitmap_andnot(&repaired, &stripe->init_error_bitmap, &stripe->error_bitmap,
+ stripe->nr_sectors);
+ if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) {
+ if (btrfs_is_zoned(fs_info)) {
btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start);
- } else if (!sctx->readonly) {
- unsigned long repaired;
-
- bitmap_andnot(&repaired, &stripe->init_error_bitmap,
- &stripe->error_bitmap, stripe->nr_sectors);
- scrub_write_sectors(sctx, stripe, repaired, false);
- wait_scrub_stripe_io(stripe);
+ } else {
+ scrub_write_sectors(sctx, stripe, repaired, false);
+ wait_scrub_stripe_io(stripe);
+ }
}
scrub_stripe_report_errors(sctx, stripe);
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 253cce7ffecf..47b5d301038e 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -847,6 +847,11 @@ static int test_case_7(struct btrfs_fs_info *fs_info)
goto out;
}
+ if (em->block_start != SZ_32K + SZ_4K) {
+ test_err("em->block_start is %llu, expected 36K", em->block_start);
+ goto out;
+ }
+
free_extent_map(em);
read_lock(&em_tree->lock);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 46e8426adf4f..85f359e0e0a7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -745,14 +745,6 @@ again:
h->reloc_reserved = reloc_reserved;
}
- /*
- * Now that we have found a transaction to be a part of, convert the
- * qgroup reservation from prealloc to pertrans. A different transaction
- * can't race in and free our pertrans out from under us.
- */
- if (qgroup_reserved)
- btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
-
got_it:
if (!current->journal_info)
current->journal_info = h;
@@ -786,8 +778,15 @@ got_it:
* not just freed.
*/
btrfs_end_transaction(h);
- return ERR_PTR(ret);
+ goto reserve_fail;
}
+ /*
+ * Now that we have found a transaction to be a part of, convert the
+ * qgroup reservation from prealloc to pertrans. A different transaction
+ * can't race in and free our pertrans out from under us.
+ */
+ if (qgroup_reserved)
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
return h;
@@ -1495,6 +1494,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
+ btrfs_qgroup_free_meta_all_pertrans(root);
spin_unlock(&fs_info->fs_roots_radix_lock);
btrfs_free_log(trans, root);
@@ -1519,7 +1519,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
if (ret2)
return ret2;
spin_lock(&fs_info->fs_roots_radix_lock);
- btrfs_qgroup_free_meta_all_pertrans(root);
}
}
spin_unlock(&fs_info->fs_roots_radix_lock);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index c8fbcae4e88e..32604e9b31c3 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1797,6 +1797,11 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf)
return BTRFS_TREE_BLOCK_INVALID_LEVEL;
}
+ if (unlikely(!btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN))) {
+ generic_err(leaf, 0, "invalid flag for leaf, WRITTEN not set");
+ return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET;
+ }
+
/*
* Extent buffers from a relocation tree have a owner field that
* corresponds to the subvolume tree they are based on. So just from an
@@ -1858,6 +1863,7 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf)
for (slot = 0; slot < nritems; slot++) {
u32 item_end_expected;
u64 item_data_end;
+ enum btrfs_tree_block_status ret;
btrfs_item_key_to_cpu(leaf, &key, slot);
@@ -1913,21 +1919,10 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf)
return BTRFS_TREE_BLOCK_INVALID_OFFSETS;
}
- /*
- * We only want to do this if WRITTEN is set, otherwise the leaf
- * may be in some intermediate state and won't appear valid.
- */
- if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN)) {
- enum btrfs_tree_block_status ret;
-
- /*
- * Check if the item size and content meet other
- * criteria
- */
- ret = check_leaf_item(leaf, &key, slot, &prev_key);
- if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN))
- return ret;
- }
+ /* Check if the item size and content meet other criteria. */
+ ret = check_leaf_item(leaf, &key, slot, &prev_key);
+ if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN))
+ return ret;
prev_key.objectid = key.objectid;
prev_key.type = key.type;
@@ -1957,6 +1952,11 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node)
int level = btrfs_header_level(node);
u64 bytenr;
+ if (unlikely(!btrfs_header_flag(node, BTRFS_HEADER_FLAG_WRITTEN))) {
+ generic_err(node, 0, "invalid flag for node, WRITTEN not set");
+ return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET;
+ }
+
if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) {
generic_err(node, 0,
"invalid level for node, have %d expect [1, %d]",
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 5c809b50b2d0..01669cfa6578 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -53,6 +53,7 @@ enum btrfs_tree_block_status {
BTRFS_TREE_BLOCK_INVALID_BLOCKPTR,
BTRFS_TREE_BLOCK_INVALID_ITEM,
BTRFS_TREE_BLOCK_INVALID_OWNER,
+ BTRFS_TREE_BLOCK_WRITTEN_NOT_SET,
};
/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f15591f3e54f..ef6bd2f4251b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3455,6 +3455,7 @@ again:
* alignment and size).
*/
ret = -EUCLEAN;
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
}
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 1d685357e67f..e667dbcd20e8 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -9,6 +9,7 @@
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/uio.h>
+#include <linux/bio.h>
#include <linux/falloc.h>
#include <linux/sched/mm.h>
#include <trace/events/fscache.h>
@@ -493,7 +494,7 @@ out_no_object:
* boundary as appropriate.
*/
static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
- loff_t i_size)
+ unsigned long long i_size)
{
return cachefiles_do_prepare_read(&subreq->rreq->cache_resources,
subreq->start, &subreq->len, i_size,
@@ -622,6 +623,77 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
return ret;
}
+static void cachefiles_prepare_write_subreq(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct netfs_cache_resources *cres = &wreq->cache_resources;
+
+ _enter("W=%x[%x] %llx", wreq->debug_id, subreq->debug_index, subreq->start);
+
+ subreq->max_len = ULONG_MAX;
+ subreq->max_nr_segs = BIO_MAX_VECS;
+
+ if (!cachefiles_cres_file(cres)) {
+ if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))
+ return netfs_prepare_write_failed(subreq);
+ if (!cachefiles_cres_file(cres))
+ return netfs_prepare_write_failed(subreq);
+ }
+}
+
+static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct netfs_cache_resources *cres = &wreq->cache_resources;
+ struct cachefiles_object *object = cachefiles_cres_object(cres);
+ struct cachefiles_cache *cache = object->volume->cache;
+ const struct cred *saved_cred;
+ size_t off, pre, post, len = subreq->len;
+ loff_t start = subreq->start;
+ int ret;
+
+ _enter("W=%x[%x] %llx-%llx",
+ wreq->debug_id, subreq->debug_index, start, start + len - 1);
+
+ /* We need to start on the cache granularity boundary */
+ off = start & (CACHEFILES_DIO_BLOCK_SIZE - 1);
+ if (off) {
+ pre = CACHEFILES_DIO_BLOCK_SIZE - off;
+ if (pre >= len) {
+ netfs_write_subrequest_terminated(subreq, len, false);
+ return;
+ }
+ subreq->transferred += pre;
+ start += pre;
+ len -= pre;
+ iov_iter_advance(&subreq->io_iter, pre);
+ }
+
+ /* We also need to end on the cache granularity boundary */
+ post = len & (CACHEFILES_DIO_BLOCK_SIZE - 1);
+ if (post) {
+ len -= post;
+ if (len == 0) {
+ netfs_write_subrequest_terminated(subreq, post, false);
+ return;
+ }
+ iov_iter_truncate(&subreq->io_iter, len);
+ }
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres),
+ &start, &len, len, true);
+ cachefiles_end_secure(cache, saved_cred);
+ if (ret < 0) {
+ netfs_write_subrequest_terminated(subreq, ret, false);
+ return;
+ }
+
+ cachefiles_write(&subreq->rreq->cache_resources,
+ subreq->start, &subreq->io_iter,
+ netfs_write_subrequest_terminated, subreq);
+}
+
/*
* Clean up an operation.
*/
@@ -638,8 +710,10 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {
.end_operation = cachefiles_end_operation,
.read = cachefiles_read,
.write = cachefiles_write,
+ .issue_write = cachefiles_issue_write,
.prepare_read = cachefiles_prepare_read,
.prepare_write = cachefiles_prepare_write,
+ .prepare_write_subreq = cachefiles_prepare_write_subreq,
.prepare_ondemand_read = cachefiles_prepare_ondemand_read,
.query_occupancy = cachefiles_query_occupancy,
};
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1340d77124ae..8c16bc5250ef 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -193,7 +193,7 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
* block, but do not exceed the file size, unless the original
* request already exceeds it.
*/
- new_end = min(round_up(end, lo->stripe_unit), rreq->i_size);
+ new_end = umin(round_up(end, lo->stripe_unit), rreq->i_size);
if (new_end > end && new_end <= rreq->start + max_len)
rreq->len = new_end - rreq->start;
@@ -498,11 +498,6 @@ const struct netfs_request_ops ceph_netfs_ops = {
};
#ifdef CONFIG_CEPH_FSCACHE
-static void ceph_set_page_fscache(struct page *page)
-{
- set_page_fscache(page);
-}
-
static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async)
{
struct inode *inode = priv;
@@ -517,13 +512,9 @@ static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, b
struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode),
- ceph_fscache_write_terminated, inode, caching);
+ ceph_fscache_write_terminated, inode, true, caching);
}
#else
-static inline void ceph_set_page_fscache(struct page *page)
-{
-}
-
static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
{
}
@@ -715,8 +706,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
len = wlen;
set_page_writeback(page);
- if (caching)
- ceph_set_page_fscache(page);
ceph_fscache_write_to_cache(inode, page_off, len, caching);
if (IS_ENCRYPTED(inode)) {
@@ -795,10 +784,10 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
ihold(inode);
if (wbc->sync_mode == WB_SYNC_NONE &&
- ceph_inode_to_fs_client(inode)->write_congested)
+ ceph_inode_to_fs_client(inode)->write_congested) {
+ redirty_page_for_writepage(wbc, page);
return AOP_WRITEPAGE_ACTIVATE;
-
- wait_on_page_fscache(page);
+ }
err = writepage_nounlock(page, wbc);
if (err == -ERESTARTSYS) {
@@ -1073,7 +1062,7 @@ get_more_pages:
unlock_page(page);
break;
}
- if (PageWriteback(page) || PageFsCache(page)) {
+ if (PageWriteback(page)) {
if (wbc->sync_mode == WB_SYNC_NONE) {
doutc(cl, "%p under writeback\n", page);
unlock_page(page);
@@ -1081,7 +1070,6 @@ get_more_pages:
}
doutc(cl, "waiting on writeback %p\n", page);
wait_on_page_writeback(page);
- wait_on_page_fscache(page);
}
if (!clear_page_dirty_for_io(page)) {
@@ -1266,8 +1254,6 @@ new_request:
}
set_page_writeback(page);
- if (caching)
- ceph_set_page_fscache(page);
len += thp_size(page);
}
ceph_fscache_write_to_cache(inode, offset, len, caching);
@@ -1511,7 +1497,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
if (r < 0)
return r;
- folio_wait_fscache(folio);
+ folio_wait_private_2(folio); /* [DEPRECATED] */
WARN_ON_ONCE(!folio_test_locked(folio));
*pagep = &folio->page;
return 0;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 55051ad09c19..c4941ba245ac 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4783,13 +4783,13 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
ceph_vinop(inode));
- spin_lock(&mdsc->cap_unlink_delay_lock);
+ spin_lock(&mdsc->cap_delay_lock);
ci->i_ceph_flags |= CEPH_I_FLUSH;
if (!list_empty(&ci->i_cap_delay_list))
list_del_init(&ci->i_cap_delay_list);
list_add_tail(&ci->i_cap_delay_list,
&mdsc->cap_unlink_delay_list);
- spin_unlock(&mdsc->cap_unlink_delay_lock);
+ spin_unlock(&mdsc->cap_delay_lock);
/*
* Fire the work immediately, because the MDS maybe
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 7b2e77517f23..99561fddcb38 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -577,6 +577,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
/* Set parameters for the netfs library */
netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false);
+ /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
+ __set_bit(NETFS_ICTX_USE_PGPRIV2, &ci->netfs.flags);
spin_lock_init(&ci->i_ceph_lock);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3ab9c268a8bb..360b686c3c67 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2504,7 +2504,7 @@ static void ceph_cap_unlink_work(struct work_struct *work)
struct ceph_client *cl = mdsc->fsc->client;
doutc(cl, "begin\n");
- spin_lock(&mdsc->cap_unlink_delay_lock);
+ spin_lock(&mdsc->cap_delay_lock);
while (!list_empty(&mdsc->cap_unlink_delay_list)) {
struct ceph_inode_info *ci;
struct inode *inode;
@@ -2516,15 +2516,15 @@ static void ceph_cap_unlink_work(struct work_struct *work)
inode = igrab(&ci->netfs.inode);
if (inode) {
- spin_unlock(&mdsc->cap_unlink_delay_lock);
+ spin_unlock(&mdsc->cap_delay_lock);
doutc(cl, "on %p %llx.%llx\n", inode,
ceph_vinop(inode));
ceph_check_caps(ci, CHECK_CAPS_FLUSH);
iput(inode);
- spin_lock(&mdsc->cap_unlink_delay_lock);
+ spin_lock(&mdsc->cap_delay_lock);
}
}
- spin_unlock(&mdsc->cap_unlink_delay_lock);
+ spin_unlock(&mdsc->cap_delay_lock);
doutc(cl, "done\n");
}
@@ -5404,7 +5404,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
INIT_LIST_HEAD(&mdsc->cap_wait_list);
spin_lock_init(&mdsc->cap_delay_lock);
INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list);
- spin_lock_init(&mdsc->cap_unlink_delay_lock);
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
mdsc->last_cap_flush_tid = 1;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 03f8ff00874f..b88e80415224 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -461,9 +461,8 @@ struct ceph_mds_client {
struct delayed_work delayed_work; /* delayed work */
unsigned long last_renew_caps; /* last time we renewed our caps */
struct list_head cap_delay_list; /* caps with delayed release */
- spinlock_t cap_delay_lock; /* protects cap_delay_list */
struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */
- spinlock_t cap_unlink_delay_lock; /* protects cap_unlink_delay_list */
+ spinlock_t cap_delay_lock; /* protects cap_delay_list and cap_unlink_delay_list */
struct list_head snap_flush_list; /* cap_snaps ready to flush */
spinlock_t snap_flush_lock;
diff --git a/fs/dcache.c b/fs/dcache.c
index 71a8e943a0fa..407095188f83 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -355,7 +355,7 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry)
flags &= ~DCACHE_ENTRY_TYPE;
WRITE_ONCE(dentry->d_flags, flags);
dentry->d_inode = NULL;
- if (dentry->d_flags & DCACHE_LRU_LIST)
+ if (flags & DCACHE_LRU_LIST)
this_cpu_inc(nr_dentry_negative);
}
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index a40da0065433..dc51df0b118d 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -14,7 +14,8 @@
#include <linux/module.h>
#include <linux/fs.h>
-#include <linux/mount.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/kobject.h>
@@ -23,7 +24,6 @@
#include <linux/fsnotify.h>
#include <linux/string.h>
#include <linux/seq_file.h>
-#include <linux/parser.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/security.h>
@@ -77,7 +77,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
return inode;
}
-struct debugfs_mount_opts {
+struct debugfs_fs_info {
kuid_t uid;
kgid_t gid;
umode_t mode;
@@ -89,68 +89,51 @@ enum {
Opt_uid,
Opt_gid,
Opt_mode,
- Opt_err
};
-static const match_table_t tokens = {
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_mode, "mode=%o"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec debugfs_param_specs[] = {
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_u32oct ("mode", Opt_mode),
+ fsparam_u32 ("uid", Opt_uid),
+ {}
};
-struct debugfs_fs_info {
- struct debugfs_mount_opts mount_opts;
-};
-
-static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
+static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- substring_t args[MAX_OPT_ARGS];
- int option;
- int token;
+ struct debugfs_fs_info *opts = fc->s_fs_info;
+ struct fs_parse_result result;
kuid_t uid;
kgid_t gid;
- char *p;
-
- opts->opts = 0;
- opts->mode = DEBUGFS_DEFAULT_MODE;
-
- while ((p = strsep(&data, ",")) != NULL) {
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(uid))
- return -EINVAL;
- opts->uid = uid;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(gid))
- return -EINVAL;
- opts->gid = gid;
- break;
- case Opt_mode:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->mode = option & S_IALLUGO;
- break;
- /*
- * We might like to report bad mount options here;
- * but traditionally debugfs has ignored all mount options
- */
- }
-
- opts->opts |= BIT(token);
+ int opt;
+
+ opt = fs_parse(fc, debugfs_param_specs, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ return invalf(fc, "Unknown uid");
+ opts->uid = uid;
+ break;
+ case Opt_gid:
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ return invalf(fc, "Unknown gid");
+ opts->gid = gid;
+ break;
+ case Opt_mode:
+ opts->mode = result.uint_32 & S_IALLUGO;
+ break;
+ /*
+ * We might like to report bad mount options here;
+ * but traditionally debugfs has ignored all mount options
+ */
}
+ opts->opts |= BIT(opt);
+
return 0;
}
@@ -158,23 +141,22 @@ static void _debugfs_apply_options(struct super_block *sb, bool remount)
{
struct debugfs_fs_info *fsi = sb->s_fs_info;
struct inode *inode = d_inode(sb->s_root);
- struct debugfs_mount_opts *opts = &fsi->mount_opts;
/*
* On remount, only reset mode/uid/gid if they were provided as mount
* options.
*/
- if (!remount || opts->opts & BIT(Opt_mode)) {
+ if (!remount || fsi->opts & BIT(Opt_mode)) {
inode->i_mode &= ~S_IALLUGO;
- inode->i_mode |= opts->mode;
+ inode->i_mode |= fsi->mode;
}
- if (!remount || opts->opts & BIT(Opt_uid))
- inode->i_uid = opts->uid;
+ if (!remount || fsi->opts & BIT(Opt_uid))
+ inode->i_uid = fsi->uid;
- if (!remount || opts->opts & BIT(Opt_gid))
- inode->i_gid = opts->gid;
+ if (!remount || fsi->opts & BIT(Opt_gid))
+ inode->i_gid = fsi->gid;
}
static void debugfs_apply_options(struct super_block *sb)
@@ -187,35 +169,33 @@ static void debugfs_apply_options_remount(struct super_block *sb)
_debugfs_apply_options(sb, true);
}
-static int debugfs_remount(struct super_block *sb, int *flags, char *data)
+static int debugfs_reconfigure(struct fs_context *fc)
{
- int err;
- struct debugfs_fs_info *fsi = sb->s_fs_info;
+ struct super_block *sb = fc->root->d_sb;
+ struct debugfs_fs_info *sb_opts = sb->s_fs_info;
+ struct debugfs_fs_info *new_opts = fc->s_fs_info;
sync_filesystem(sb);
- err = debugfs_parse_options(data, &fsi->mount_opts);
- if (err)
- goto fail;
+ /* structure copy of new mount options to sb */
+ *sb_opts = *new_opts;
debugfs_apply_options_remount(sb);
-fail:
- return err;
+ return 0;
}
static int debugfs_show_options(struct seq_file *m, struct dentry *root)
{
struct debugfs_fs_info *fsi = root->d_sb->s_fs_info;
- struct debugfs_mount_opts *opts = &fsi->mount_opts;
- if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
+ if (!uid_eq(fsi->uid, GLOBAL_ROOT_UID))
seq_printf(m, ",uid=%u",
- from_kuid_munged(&init_user_ns, opts->uid));
- if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
+ from_kuid_munged(&init_user_ns, fsi->uid));
+ if (!gid_eq(fsi->gid, GLOBAL_ROOT_GID))
seq_printf(m, ",gid=%u",
- from_kgid_munged(&init_user_ns, opts->gid));
- if (opts->mode != DEBUGFS_DEFAULT_MODE)
- seq_printf(m, ",mode=%o", opts->mode);
+ from_kgid_munged(&init_user_ns, fsi->gid));
+ if (fsi->mode != DEBUGFS_DEFAULT_MODE)
+ seq_printf(m, ",mode=%o", fsi->mode);
return 0;
}
@@ -229,7 +209,6 @@ static void debugfs_free_inode(struct inode *inode)
static const struct super_operations debugfs_super_operations = {
.statfs = simple_statfs,
- .remount_fs = debugfs_remount,
.show_options = debugfs_show_options,
.free_inode = debugfs_free_inode,
};
@@ -263,26 +242,14 @@ static const struct dentry_operations debugfs_dops = {
.d_automount = debugfs_automount,
};
-static int debug_fill_super(struct super_block *sb, void *data, int silent)
+static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
static const struct tree_descr debug_files[] = {{""}};
- struct debugfs_fs_info *fsi;
int err;
- fsi = kzalloc(sizeof(struct debugfs_fs_info), GFP_KERNEL);
- sb->s_fs_info = fsi;
- if (!fsi) {
- err = -ENOMEM;
- goto fail;
- }
-
- err = debugfs_parse_options(data, &fsi->mount_opts);
+ err = simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
if (err)
- goto fail;
-
- err = simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
- if (err)
- goto fail;
+ return err;
sb->s_op = &debugfs_super_operations;
sb->s_d_op = &debugfs_dops;
@@ -290,27 +257,48 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
debugfs_apply_options(sb);
return 0;
-
-fail:
- kfree(fsi);
- sb->s_fs_info = NULL;
- return err;
}
-static struct dentry *debug_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name,
- void *data)
+static int debugfs_get_tree(struct fs_context *fc)
{
if (!(debugfs_allow & DEBUGFS_ALLOW_API))
- return ERR_PTR(-EPERM);
+ return -EPERM;
+
+ return get_tree_single(fc, debugfs_fill_super);
+}
+
+static void debugfs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
- return mount_single(fs_type, flags, data, debug_fill_super);
+static const struct fs_context_operations debugfs_context_ops = {
+ .free = debugfs_free_fc,
+ .parse_param = debugfs_parse_param,
+ .get_tree = debugfs_get_tree,
+ .reconfigure = debugfs_reconfigure,
+};
+
+static int debugfs_init_fs_context(struct fs_context *fc)
+{
+ struct debugfs_fs_info *fsi;
+
+ fsi = kzalloc(sizeof(struct debugfs_fs_info), GFP_KERNEL);
+ if (!fsi)
+ return -ENOMEM;
+
+ fsi->mode = DEBUGFS_DEFAULT_MODE;
+
+ fc->s_fs_info = fsi;
+ fc->ops = &debugfs_context_ops;
+ return 0;
}
static struct file_system_type debug_fs_type = {
.owner = THIS_MODULE,
.name = "debugfs",
- .mount = debug_mount,
+ .init_fs_context = debugfs_init_fs_context,
+ .parameters = debugfs_param_specs,
.kill_sb = kill_litter_super,
};
MODULE_ALIAS_FS("debugfs");
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 62c97ff9e852..b0aafe640fa4 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1217,7 +1217,6 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
*/
inode_dio_begin(inode);
- retval = 0;
sdio.blkbits = blkbits;
sdio.blkfactor = i_blkbits - blkbits;
sdio.block_in_file = offset >> blkbits;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 3fe41964c0d8..7f9f68c00ef6 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -300,9 +300,11 @@ write_tag_66_packet(char *signature, u8 cipher_code,
* | Key Identifier Size | 1 or 2 bytes |
* | Key Identifier | arbitrary |
* | File Encryption Key Size | 1 or 2 bytes |
+ * | Cipher Code | 1 byte |
* | File Encryption Key | arbitrary |
+ * | Checksum | 2 bytes |
*/
- data_len = (5 + ECRYPTFS_SIG_SIZE_HEX + crypt_stat->key_size);
+ data_len = (8 + ECRYPTFS_SIG_SIZE_HEX + crypt_stat->key_size);
*packet = kmalloc(data_len, GFP_KERNEL);
message = *packet;
if (!message) {
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 8aff1a724805..62da538d91cb 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -151,7 +151,7 @@ static int erofs_fscache_read_io_async(struct fscache_cookie *cookie,
if (WARN_ON(len == 0))
source = NETFS_INVALID_READ;
if (source != NETFS_READ_FROM_CACHE) {
- erofs_err(NULL, "prepare_read failed (source %d)", source);
+ erofs_err(NULL, "prepare_ondemand_read failed (source %d)", source);
return -EIO;
}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 39c67119f43b..d28ccfc0352b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -84,13 +84,6 @@ struct erofs_dev_context {
bool flatdev;
};
-struct erofs_fs_context {
- struct erofs_mount_opts opt;
- struct erofs_dev_context *devs;
- char *fsid;
- char *domain_id;
-};
-
/* all filesystem-wide lz4 configurations */
struct erofs_sb_lz4_info {
/* # of pages needed for EROFS lz4 rolling decompression */
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index c0eb139adb07..30b49b2eee53 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -370,18 +370,18 @@ out:
return ret;
}
-static void erofs_default_options(struct erofs_fs_context *ctx)
+static void erofs_default_options(struct erofs_sb_info *sbi)
{
#ifdef CONFIG_EROFS_FS_ZIP
- ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
- ctx->opt.max_sync_decompress_pages = 3;
- ctx->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO;
+ sbi->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+ sbi->opt.max_sync_decompress_pages = 3;
+ sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO;
#endif
#ifdef CONFIG_EROFS_FS_XATTR
- set_opt(&ctx->opt, XATTR_USER);
+ set_opt(&sbi->opt, XATTR_USER);
#endif
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- set_opt(&ctx->opt, POSIX_ACL);
+ set_opt(&sbi->opt, POSIX_ACL);
#endif
}
@@ -426,16 +426,16 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
{
#ifdef CONFIG_FS_DAX
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = fc->s_fs_info;
switch (mode) {
case EROFS_MOUNT_DAX_ALWAYS:
- set_opt(&ctx->opt, DAX_ALWAYS);
- clear_opt(&ctx->opt, DAX_NEVER);
+ set_opt(&sbi->opt, DAX_ALWAYS);
+ clear_opt(&sbi->opt, DAX_NEVER);
return true;
case EROFS_MOUNT_DAX_NEVER:
- set_opt(&ctx->opt, DAX_NEVER);
- clear_opt(&ctx->opt, DAX_ALWAYS);
+ set_opt(&sbi->opt, DAX_NEVER);
+ clear_opt(&sbi->opt, DAX_ALWAYS);
return true;
default:
DBG_BUGON(1);
@@ -450,7 +450,7 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
static int erofs_fc_parse_param(struct fs_context *fc,
struct fs_parameter *param)
{
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = fc->s_fs_info;
struct fs_parse_result result;
struct erofs_device_info *dif;
int opt, ret;
@@ -463,9 +463,9 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_user_xattr:
#ifdef CONFIG_EROFS_FS_XATTR
if (result.boolean)
- set_opt(&ctx->opt, XATTR_USER);
+ set_opt(&sbi->opt, XATTR_USER);
else
- clear_opt(&ctx->opt, XATTR_USER);
+ clear_opt(&sbi->opt, XATTR_USER);
#else
errorfc(fc, "{,no}user_xattr options not supported");
#endif
@@ -473,16 +473,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_acl:
#ifdef CONFIG_EROFS_FS_POSIX_ACL
if (result.boolean)
- set_opt(&ctx->opt, POSIX_ACL);
+ set_opt(&sbi->opt, POSIX_ACL);
else
- clear_opt(&ctx->opt, POSIX_ACL);
+ clear_opt(&sbi->opt, POSIX_ACL);
#else
errorfc(fc, "{,no}acl options not supported");
#endif
break;
case Opt_cache_strategy:
#ifdef CONFIG_EROFS_FS_ZIP
- ctx->opt.cache_strategy = result.uint_32;
+ sbi->opt.cache_strategy = result.uint_32;
#else
errorfc(fc, "compression not supported, cache_strategy ignored");
#endif
@@ -504,27 +504,27 @@ static int erofs_fc_parse_param(struct fs_context *fc,
kfree(dif);
return -ENOMEM;
}
- down_write(&ctx->devs->rwsem);
- ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL);
- up_write(&ctx->devs->rwsem);
+ down_write(&sbi->devs->rwsem);
+ ret = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL);
+ up_write(&sbi->devs->rwsem);
if (ret < 0) {
kfree(dif->path);
kfree(dif);
return ret;
}
- ++ctx->devs->extra_devices;
+ ++sbi->devs->extra_devices;
break;
#ifdef CONFIG_EROFS_FS_ONDEMAND
case Opt_fsid:
- kfree(ctx->fsid);
- ctx->fsid = kstrdup(param->string, GFP_KERNEL);
- if (!ctx->fsid)
+ kfree(sbi->fsid);
+ sbi->fsid = kstrdup(param->string, GFP_KERNEL);
+ if (!sbi->fsid)
return -ENOMEM;
break;
case Opt_domain_id:
- kfree(ctx->domain_id);
- ctx->domain_id = kstrdup(param->string, GFP_KERNEL);
- if (!ctx->domain_id)
+ kfree(sbi->domain_id);
+ sbi->domain_id = kstrdup(param->string, GFP_KERNEL);
+ if (!sbi->domain_id)
return -ENOMEM;
break;
#else
@@ -581,8 +581,7 @@ static const struct export_operations erofs_export_ops = {
static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode;
- struct erofs_sb_info *sbi;
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
int err;
sb->s_magic = EROFS_SUPER_MAGIC;
@@ -590,19 +589,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_op = &erofs_sops;
- sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
-
- sb->s_fs_info = sbi;
- sbi->opt = ctx->opt;
- sbi->devs = ctx->devs;
- ctx->devs = NULL;
- sbi->fsid = ctx->fsid;
- ctx->fsid = NULL;
- sbi->domain_id = ctx->domain_id;
- ctx->domain_id = NULL;
-
sbi->blkszbits = PAGE_SHIFT;
if (erofs_is_fscache_mode(sb)) {
sb->s_blocksize = PAGE_SIZE;
@@ -706,9 +692,9 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
static int erofs_fc_get_tree(struct fs_context *fc)
{
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = fc->s_fs_info;
- if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->fsid)
+ if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid)
return get_tree_nodev(fc, erofs_fc_fill_super);
return get_tree_bdev(fc, erofs_fc_fill_super);
@@ -718,19 +704,19 @@ static int erofs_fc_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *new_sbi = fc->s_fs_info;
DBG_BUGON(!sb_rdonly(sb));
- if (ctx->fsid || ctx->domain_id)
+ if (new_sbi->fsid || new_sbi->domain_id)
erofs_info(sb, "ignoring reconfiguration for fsid|domain_id.");
- if (test_opt(&ctx->opt, POSIX_ACL))
+ if (test_opt(&new_sbi->opt, POSIX_ACL))
fc->sb_flags |= SB_POSIXACL;
else
fc->sb_flags &= ~SB_POSIXACL;
- sbi->opt = ctx->opt;
+ sbi->opt = new_sbi->opt;
fc->sb_flags |= SB_RDONLY;
return 0;
@@ -761,12 +747,15 @@ static void erofs_free_dev_context(struct erofs_dev_context *devs)
static void erofs_fc_free(struct fs_context *fc)
{
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = fc->s_fs_info;
- erofs_free_dev_context(ctx->devs);
- kfree(ctx->fsid);
- kfree(ctx->domain_id);
- kfree(ctx);
+ if (!sbi)
+ return;
+
+ erofs_free_dev_context(sbi->devs);
+ kfree(sbi->fsid);
+ kfree(sbi->domain_id);
+ kfree(sbi);
}
static const struct fs_context_operations erofs_context_ops = {
@@ -778,38 +767,35 @@ static const struct fs_context_operations erofs_context_ops = {
static int erofs_init_fs_context(struct fs_context *fc)
{
- struct erofs_fs_context *ctx;
+ struct erofs_sb_info *sbi;
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ if (!sbi)
return -ENOMEM;
- ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
- if (!ctx->devs) {
- kfree(ctx);
+
+ sbi->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
+ if (!sbi->devs) {
+ kfree(sbi);
return -ENOMEM;
}
- fc->fs_private = ctx;
+ fc->s_fs_info = sbi;
- idr_init(&ctx->devs->tree);
- init_rwsem(&ctx->devs->rwsem);
- erofs_default_options(ctx);
+ idr_init(&sbi->devs->tree);
+ init_rwsem(&sbi->devs->rwsem);
+ erofs_default_options(sbi);
fc->ops = &erofs_context_ops;
return 0;
}
static void erofs_kill_sb(struct super_block *sb)
{
- struct erofs_sb_info *sbi;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
- if (erofs_is_fscache_mode(sb))
+ if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid)
kill_anon_super(sb);
else
kill_block_super(sb);
- sbi = EROFS_SB(sb);
- if (!sbi)
- return;
-
erofs_free_dev_context(sbi->devs);
fs_put_dax(sbi->dax_dev, NULL);
erofs_fscache_unregister_fs(sb);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 882b89edc52a..f53ca4f7fced 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -980,6 +980,34 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep
}
/*
+ * The ffd.file pointer may be in the process of being torn down due to
+ * being closed, but we may not have finished eventpoll_release() yet.
+ *
+ * Normally, even with the atomic_long_inc_not_zero, the file may have
+ * been free'd and then gotten re-allocated to something else (since
+ * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
+ *
+ * But for epoll, users hold the ep->mtx mutex, and as such any file in
+ * the process of being free'd will block in eventpoll_release_file()
+ * and thus the underlying file allocation will not be free'd, and the
+ * file re-use cannot happen.
+ *
+ * For the same reason we can avoid a rcu_read_lock() around the
+ * operation - 'ffd.file' cannot go away even if the refcount has
+ * reached zero (but we must still not call out to ->poll() functions
+ * etc).
+ */
+static struct file *epi_fget(const struct epitem *epi)
+{
+ struct file *file;
+
+ file = epi->ffd.file;
+ if (!atomic_long_inc_not_zero(&file->f_count))
+ file = NULL;
+ return file;
+}
+
+/*
* Differs from ep_eventpoll_poll() in that internal callers already have
* the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
* is correctly annotated.
@@ -987,14 +1015,22 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
int depth)
{
- struct file *file = epi->ffd.file;
+ struct file *file = epi_fget(epi);
__poll_t res;
+ /*
+ * We could return EPOLLERR | EPOLLHUP or something, but let's
+ * treat this more as "file doesn't exist, poll didn't happen".
+ */
+ if (!file)
+ return 0;
+
pt->_key = epi->event.events;
if (!is_file_epoll(file))
res = vfs_poll(file, pt);
else
res = __ep_eventpoll_poll(file, pt, depth);
+ fput(file);
return res & epi->event.events;
}
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 077944d3c2c0..84572e11cc05 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -420,6 +420,7 @@ static void exfat_set_entry_type(struct exfat_dentry *ep, unsigned int type)
static void exfat_init_stream_entry(struct exfat_dentry *ep,
unsigned int start_clu, unsigned long long size)
{
+ memset(ep, 0, sizeof(*ep));
exfat_set_entry_type(ep, TYPE_STREAM);
if (size == 0)
ep->dentry.stream.flags = ALLOC_FAT_CHAIN;
@@ -457,6 +458,7 @@ void exfat_init_dir_entry(struct exfat_entry_set_cache *es,
struct exfat_dentry *ep;
ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
+ memset(ep, 0, sizeof(*ep));
exfat_set_entry_type(ep, type);
exfat_set_entry_time(sbi, ts,
&ep->dentry.file.create_tz,
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index cc00f1a7a1e1..9adfc38ca7da 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -51,7 +51,7 @@ static int exfat_cont_expand(struct inode *inode, loff_t size)
clu.flags = ei->flags;
ret = exfat_alloc_cluster(inode, new_num_clusters - num_clusters,
- &clu, IS_DIRSYNC(inode));
+ &clu, inode_needs_sync(inode));
if (ret)
return ret;
@@ -77,12 +77,11 @@ out:
ei->i_size_aligned = round_up(size, sb->s_blocksize);
ei->i_size_ondisk = ei->i_size_aligned;
inode->i_blocks = round_up(size, sbi->cluster_size) >> 9;
+ mark_inode_dirty(inode);
- if (IS_DIRSYNC(inode))
+ if (IS_SYNC(inode))
return write_inode_now(inode, 1);
- mark_inode_dirty(inode);
-
return 0;
free_clu:
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 54d6ff22585c..28c51b0cc4db 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -885,8 +885,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
return ret;
}
- filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC |
- FMODE_DIO_PARALLEL_WRITE;
+ filp->f_mode |= FMODE_NOWAIT;
return dquot_file_open(inode, filp);
}
@@ -938,7 +937,6 @@ const struct file_operations ext4_file_operations = {
.compat_ioctl = ext4_compat_ioctl,
#endif
.mmap = ext4_file_mmap,
- .mmap_supported_flags = MAP_SYNC,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
@@ -946,6 +944,8 @@ const struct file_operations ext4_file_operations = {
.splice_read = ext4_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
+ .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
+ FOP_DIO_PARALLEL_WRITE,
};
const struct inode_operations ext4_file_inode_operations = {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 044135796f2b..3fce1b80c419 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1723,10 +1723,6 @@ static const struct constant_table ext4_param_dax[] = {
{}
};
-/* String parameter that allows empty argument */
-#define fsparam_string_empty(NAME, OPT) \
- __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
-
/*
* Mount option specification
* We don't use fsparam_flag_no because of the way we set the
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 1761ad125f97..2b65e09822d4 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -569,7 +569,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
if (err)
return err;
- filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+ filp->f_mode |= FMODE_NOWAIT;
filp->f_mode |= FMODE_CAN_ODIRECT;
return dquot_file_open(inode, filp);
@@ -5045,4 +5045,5 @@ const struct file_operations f2fs_file_operations = {
.splice_read = f2fs_file_splice_read,
.splice_write = iter_file_splice_write,
.fadvise = f2fs_file_fadvise,
+ .fop_flags = FOP_BUFFER_RASYNC,
};
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 54cc85d3338e..300e5d9ad913 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -327,6 +327,22 @@ static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
return 0;
}
+/* Is the file descriptor a dup of the file? */
+static long f_dupfd_query(int fd, struct file *filp)
+{
+ CLASS(fd_raw, f)(fd);
+
+ /*
+ * We can do the 'fdput()' immediately, as the only thing that
+ * matters is the pointer value which isn't changed by the fdput.
+ *
+ * Technically we didn't need a ref at all, and 'fdget()' was
+ * overkill, but given our lockless file pointer lookup, the
+ * alternatives are complicated.
+ */
+ return f.file == filp;
+}
+
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
@@ -342,6 +358,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_DUPFD_CLOEXEC:
err = f_dupfd(argi, filp, O_CLOEXEC);
break;
+ case F_DUPFD_QUERY:
+ err = f_dupfd_query(argi, filp);
+ break;
case F_GETFD:
err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
break;
@@ -446,6 +465,7 @@ static int check_fcntl_cmd(unsigned cmd)
switch (cmd) {
case F_DUPFD:
case F_DUPFD_CLOEXEC:
+ case F_DUPFD_QUERY:
case F_GETFD:
case F_SETFD:
case F_GETFL:
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 57a12614addf..8a7f86c2139a 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -36,7 +36,7 @@ static long do_sys_name_to_handle(const struct path *path,
if (f_handle.handle_bytes > MAX_HANDLE_SZ)
return -EINVAL;
- handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+ handle = kzalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
GFP_KERNEL);
if (!handle)
return -ENOMEM;
@@ -71,7 +71,7 @@ static long do_sys_name_to_handle(const struct path *path,
/* copy the mount id */
if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) ||
copy_to_user(ufh, handle,
- sizeof(struct file_handle) + handle_bytes))
+ struct_size(handle, f_handle, handle_bytes)))
retval = -EFAULT;
kfree(handle);
return retval;
@@ -192,7 +192,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
retval = -EINVAL;
goto out_err;
}
- handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+ handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
GFP_KERNEL);
if (!handle) {
retval = -ENOMEM;
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 42e03b6b1cc7..fabe60778658 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -17,7 +17,7 @@
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/vfs.h>
-#include <linux/mount.h>
+#include <linux/fs_context.h>
#include "vxfs.h"
#include "vxfs_extern.h"
@@ -91,10 +91,10 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
return 0;
}
-static int vxfs_remount(struct super_block *sb, int *flags, char *data)
+static int vxfs_reconfigure(struct fs_context *fc)
{
- sync_filesystem(sb);
- *flags |= SB_RDONLY;
+ sync_filesystem(fc->root->d_sb);
+ fc->sb_flags |= SB_RDONLY;
return 0;
}
@@ -120,24 +120,24 @@ static const struct super_operations vxfs_super_ops = {
.evict_inode = vxfs_evict_inode,
.put_super = vxfs_put_super,
.statfs = vxfs_statfs,
- .remount_fs = vxfs_remount,
};
-static int vxfs_try_sb_magic(struct super_block *sbp, int silent,
+static int vxfs_try_sb_magic(struct super_block *sbp, struct fs_context *fc,
unsigned blk, __fs32 magic)
{
struct buffer_head *bp;
struct vxfs_sb *rsbp;
struct vxfs_sb_info *infp = VXFS_SBI(sbp);
+ int silent = fc->sb_flags & SB_SILENT;
int rc = -ENOMEM;
bp = sb_bread(sbp, blk);
do {
if (!bp || !buffer_mapped(bp)) {
if (!silent) {
- printk(KERN_WARNING
- "vxfs: unable to read disk superblock at %u\n",
- blk);
+ warnf(fc,
+ "vxfs: unable to read disk superblock at %u",
+ blk);
}
break;
}
@@ -146,9 +146,9 @@ static int vxfs_try_sb_magic(struct super_block *sbp, int silent,
rsbp = (struct vxfs_sb *)bp->b_data;
if (rsbp->vs_magic != magic) {
if (!silent)
- printk(KERN_NOTICE
- "vxfs: WRONG superblock magic %08x at %u\n",
- rsbp->vs_magic, blk);
+ infof(fc,
+ "vxfs: WRONG superblock magic %08x at %u",
+ rsbp->vs_magic, blk);
break;
}
@@ -169,8 +169,7 @@ static int vxfs_try_sb_magic(struct super_block *sbp, int silent,
/**
* vxfs_fill_super - read superblock into memory and initialize filesystem
* @sbp: VFS superblock (to fill)
- * @dp: fs private mount data
- * @silent: do not complain loudly when sth is wrong
+ * @fc: filesytem context
*
* Description:
* We are called on the first mount of a filesystem to read the
@@ -182,26 +181,27 @@ static int vxfs_try_sb_magic(struct super_block *sbp, int silent,
* Locking:
* We are under @sbp->s_lock.
*/
-static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
+static int vxfs_fill_super(struct super_block *sbp, struct fs_context *fc)
{
struct vxfs_sb_info *infp;
struct vxfs_sb *rsbp;
u_long bsize;
struct inode *root;
int ret = -EINVAL;
+ int silent = fc->sb_flags & SB_SILENT;
u32 j;
sbp->s_flags |= SB_RDONLY;
infp = kzalloc(sizeof(*infp), GFP_KERNEL);
if (!infp) {
- printk(KERN_WARNING "vxfs: unable to allocate incore superblock\n");
+ warnf(fc, "vxfs: unable to allocate incore superblock");
return -ENOMEM;
}
bsize = sb_min_blocksize(sbp, BLOCK_SIZE);
if (!bsize) {
- printk(KERN_WARNING "vxfs: unable to set blocksize\n");
+ warnf(fc, "vxfs: unable to set blocksize");
goto out;
}
@@ -210,24 +210,24 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
sbp->s_time_min = 0;
sbp->s_time_max = U32_MAX;
- if (!vxfs_try_sb_magic(sbp, silent, 1,
+ if (!vxfs_try_sb_magic(sbp, fc, 1,
(__force __fs32)cpu_to_le32(VXFS_SUPER_MAGIC))) {
/* Unixware, x86 */
infp->byte_order = VXFS_BO_LE;
- } else if (!vxfs_try_sb_magic(sbp, silent, 8,
+ } else if (!vxfs_try_sb_magic(sbp, fc, 8,
(__force __fs32)cpu_to_be32(VXFS_SUPER_MAGIC))) {
/* HP-UX, parisc */
infp->byte_order = VXFS_BO_BE;
} else {
if (!silent)
- printk(KERN_NOTICE "vxfs: can't find superblock.\n");
+ infof(fc, "vxfs: can't find superblock.");
goto out;
}
rsbp = infp->vsi_raw;
j = fs32_to_cpu(infp, rsbp->vs_version);
if ((j < 2 || j > 4) && !silent) {
- printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n", j);
+ infof(fc, "vxfs: unsupported VxFS version (%d)", j);
goto out;
}
@@ -244,17 +244,17 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
j = fs32_to_cpu(infp, rsbp->vs_bsize);
if (!sb_set_blocksize(sbp, j)) {
- printk(KERN_WARNING "vxfs: unable to set final block size\n");
+ warnf(fc, "vxfs: unable to set final block size");
goto out;
}
if (vxfs_read_olt(sbp, bsize)) {
- printk(KERN_WARNING "vxfs: unable to read olt\n");
+ warnf(fc, "vxfs: unable to read olt");
goto out;
}
if (vxfs_read_fshead(sbp)) {
- printk(KERN_WARNING "vxfs: unable to read fshead\n");
+ warnf(fc, "vxfs: unable to read fshead");
goto out;
}
@@ -265,7 +265,7 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
}
sbp->s_root = d_make_root(root);
if (!sbp->s_root) {
- printk(KERN_WARNING "vxfs: unable to get root dentry.\n");
+ warnf(fc, "vxfs: unable to get root dentry.");
goto out_free_ilist;
}
@@ -284,18 +284,29 @@ out:
/*
* The usual module blurb.
*/
-static struct dentry *vxfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int vxfs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
+ return get_tree_bdev(fc, vxfs_fill_super);
+}
+
+static const struct fs_context_operations vxfs_context_ops = {
+ .get_tree = vxfs_get_tree,
+ .reconfigure = vxfs_reconfigure,
+};
+
+static int vxfs_init_fs_context(struct fs_context *fc)
+{
+ fc->ops = &vxfs_context_ops;
+
+ return 0;
}
static struct file_system_type vxfs_fs_type = {
.owner = THIS_MODULE,
.name = "vxfs",
- .mount = vxfs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = vxfs_init_fs_context,
};
MODULE_ALIAS_FS("vxfs"); /* makes mount -t vxfs autoload the module */
MODULE_ALIAS("vxfs");
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e4f17c53ddfc..92a5b8283528 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -166,8 +166,7 @@ static void wb_wakeup_delayed(struct bdi_writeback *wb)
spin_unlock_irq(&wb->work_lock);
}
-static void finish_writeback_work(struct bdi_writeback *wb,
- struct wb_writeback_work *work)
+static void finish_writeback_work(struct wb_writeback_work *work)
{
struct wb_completion *done = work->done;
@@ -196,7 +195,7 @@ static void wb_queue_work(struct bdi_writeback *wb,
list_add_tail(&work->list, &wb->work_list);
mod_delayed_work(bdi_wq, &wb->dwork, 0);
} else
- finish_writeback_work(wb, work);
+ finish_writeback_work(work);
spin_unlock_irq(&wb->work_lock);
}
@@ -1561,7 +1560,8 @@ static void inode_sleep_on_writeback(struct inode *inode)
* thread's back can have unexpected consequences.
*/
static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
- struct writeback_control *wbc)
+ struct writeback_control *wbc,
+ unsigned long dirtied_before)
{
if (inode->i_state & I_FREEING)
return;
@@ -1594,7 +1594,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* We didn't write back all the pages. nfs_writepages()
* sometimes bales out without doing anything.
*/
- if (wbc->nr_to_write <= 0) {
+ if (wbc->nr_to_write <= 0 &&
+ !inode_dirtied_after(inode, dirtied_before)) {
/* Slice used up. Queue for next turn. */
requeue_io(inode, wb);
} else {
@@ -1862,6 +1863,11 @@ static long writeback_sb_inodes(struct super_block *sb,
unsigned long start_time = jiffies;
long write_chunk;
long total_wrote = 0; /* count both pages and inodes */
+ unsigned long dirtied_before = jiffies;
+
+ if (work->for_kupdate)
+ dirtied_before = jiffies -
+ msecs_to_jiffies(dirty_expire_interval * 10);
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
@@ -1967,7 +1973,7 @@ static long writeback_sb_inodes(struct super_block *sb,
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY_ALL))
total_wrote++;
- requeue_inode(inode, tmp_wb, &wbc);
+ requeue_inode(inode, tmp_wb, &wbc, dirtied_before);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
@@ -2069,6 +2075,7 @@ static long wb_writeback(struct bdi_writeback *wb,
struct inode *inode;
long progress;
struct blk_plug plug;
+ bool queued = false;
blk_start_plug(&plug);
for (;;) {
@@ -2098,21 +2105,24 @@ static long wb_writeback(struct bdi_writeback *wb,
spin_lock(&wb->list_lock);
- /*
- * Kupdate and background works are special and we want to
- * include all inodes that need writing. Livelock avoidance is
- * handled by these works yielding to any other work so we are
- * safe.
- */
- if (work->for_kupdate) {
- dirtied_before = jiffies -
- msecs_to_jiffies(dirty_expire_interval * 10);
- } else if (work->for_background)
- dirtied_before = jiffies;
-
trace_writeback_start(wb, work);
- if (list_empty(&wb->b_io))
+ if (list_empty(&wb->b_io)) {
+ /*
+ * Kupdate and background works are special and we want
+ * to include all inodes that need writing. Livelock
+ * avoidance is handled by these works yielding to any
+ * other work so we are safe.
+ */
+ if (work->for_kupdate) {
+ dirtied_before = jiffies -
+ msecs_to_jiffies(dirty_expire_interval *
+ 10);
+ } else if (work->for_background)
+ dirtied_before = jiffies;
+
queue_io(wb, work, dirtied_before);
+ queued = true;
+ }
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
@@ -2127,7 +2137,7 @@ static long wb_writeback(struct bdi_writeback *wb,
* mean the overall work is done. So we keep looping as long
* as made some progress on cleaning pages or inodes.
*/
- if (progress) {
+ if (progress || !queued) {
spin_unlock(&wb->list_lock);
continue;
}
@@ -2262,7 +2272,7 @@ static long wb_do_writeback(struct bdi_writeback *wb)
while ((work = get_next_work_item(wb)) != NULL) {
trace_writeback_exec(wb, work);
wrote += wb_writeback(wb, work);
- finish_writeback_work(wb, work);
+ finish_writeback_work(work);
}
/*
@@ -2322,8 +2332,7 @@ void wb_workfn(struct work_struct *work)
}
/*
- * Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero,
- * write back the whole world.
+ * Start writeback of all dirty pages on this bdi.
*/
static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
enum wb_reason reason)
@@ -2726,7 +2735,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
*/
void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
- return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
+ writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
EXPORT_SYMBOL(writeback_inodes_sb);
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b6cad106c37e..0b2da7b7e2ad 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -310,6 +310,10 @@ struct cuse_init_args {
/**
* cuse_process_init_reply - finish initializing CUSE channel
*
+ * @fm: The fuse mount information containing the CUSE connection.
+ * @args: The arguments passed to the init reply.
+ * @error: The error code signifying if any error occurred during the process.
+ *
* This function creates the character device and sets up all the
* required data structures for it. Please read the comment at the
* top of this file for high level overview.
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4a6df591add6..2b0d4781f394 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1321,6 +1321,7 @@ retry:
err = fuse_do_statx(inode, file, stat);
if (err == -ENOSYS) {
fc->no_statx = 1;
+ err = 0;
goto retry;
}
} else {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a56e7bffd000..b57ce4157640 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1362,7 +1362,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
bool *exclusive)
{
struct inode *inode = file_inode(iocb->ki_filp);
- struct fuse_file *ff = iocb->ki_filp->private_data;
+ struct fuse_inode *fi = get_fuse_inode(inode);
*exclusive = fuse_dio_wr_exclusive_lock(iocb, from);
if (*exclusive) {
@@ -1377,7 +1377,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
* have raced, so check it again.
*/
if (fuse_io_past_eof(iocb, from) ||
- fuse_file_uncached_io_start(inode, ff, NULL) != 0) {
+ fuse_inode_uncached_io_start(fi, NULL) != 0) {
inode_unlock_shared(inode);
inode_lock(inode);
*exclusive = true;
@@ -1388,13 +1388,13 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
{
struct inode *inode = file_inode(iocb->ki_filp);
- struct fuse_file *ff = iocb->ki_filp->private_data;
+ struct fuse_inode *fi = get_fuse_inode(inode);
if (exclusive) {
inode_unlock(inode);
} else {
/* Allow opens in caching mode after last parallel dio end */
- fuse_file_uncached_io_end(inode, ff);
+ fuse_inode_uncached_io_end(fi);
inode_unlock_shared(inode);
}
}
@@ -2574,8 +2574,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
* First mmap of direct_io file enters caching inode io mode.
* Also waits for parallel dio writers to go into serial mode
* (exclusive instead of shared lock).
+ * After first mmap, the inode stays in caching io mode until
+ * the direct_io file release.
*/
- rc = fuse_file_cached_io_start(inode, ff);
+ rc = fuse_file_cached_io_open(inode, ff);
if (rc)
return rc;
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index b24084b60864..f23919610313 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1394,9 +1394,10 @@ int fuse_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa);
/* iomode.c */
-int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff);
-int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb);
-void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff);
+int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff);
+int fuse_inode_uncached_io_start(struct fuse_inode *fi,
+ struct fuse_backing *fb);
+void fuse_inode_uncached_io_end(struct fuse_inode *fi);
int fuse_file_io_open(struct file *file, struct inode *inode);
void fuse_file_io_release(struct fuse_file *ff, struct inode *inode);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3a5d88878335..99e44ea7d875 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -175,6 +175,7 @@ static void fuse_evict_inode(struct inode *inode)
}
}
if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) {
+ WARN_ON(fi->iocachectr != 0);
WARN_ON(!list_empty(&fi->write_files));
WARN_ON(!list_empty(&fi->queued_writes));
}
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
index c653ddcf0578..c99e285f3183 100644
--- a/fs/fuse/iomode.c
+++ b/fs/fuse/iomode.c
@@ -21,12 +21,13 @@ static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi)
}
/*
- * Start cached io mode.
+ * Called on cached file open() and on first mmap() of direct_io file.
+ * Takes cached_io inode mode reference to be dropped on file release.
*
* Blocks new parallel dio writes and waits for the in-progress parallel dio
* writes to complete.
*/
-int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff)
+int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff)
{
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -67,10 +68,9 @@ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff)
return 0;
}
-static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff)
+static void fuse_file_cached_io_release(struct fuse_file *ff,
+ struct fuse_inode *fi)
{
- struct fuse_inode *fi = get_fuse_inode(inode);
-
spin_lock(&fi->lock);
WARN_ON(fi->iocachectr <= 0);
WARN_ON(ff->iomode != IOM_CACHED);
@@ -82,16 +82,15 @@ static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff)
}
/* Start strictly uncached io mode where cache access is not allowed */
-int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb)
+int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb)
{
- struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_backing *oldfb;
int err = 0;
spin_lock(&fi->lock);
/* deny conflicting backing files on same fuse inode */
oldfb = fuse_inode_backing(fi);
- if (oldfb && oldfb != fb) {
+ if (fb && oldfb && oldfb != fb) {
err = -EBUSY;
goto unlock;
}
@@ -99,12 +98,10 @@ int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struc
err = -ETXTBSY;
goto unlock;
}
- WARN_ON(ff->iomode != IOM_NONE);
fi->iocachectr--;
- ff->iomode = IOM_UNCACHED;
/* fuse inode holds a single refcount of backing file */
- if (!oldfb) {
+ if (fb && !oldfb) {
oldfb = fuse_inode_backing_set(fi, fb);
WARN_ON_ONCE(oldfb != NULL);
} else {
@@ -115,15 +112,29 @@ unlock:
return err;
}
-void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff)
+/* Takes uncached_io inode mode reference to be dropped on file release */
+static int fuse_file_uncached_io_open(struct inode *inode,
+ struct fuse_file *ff,
+ struct fuse_backing *fb)
{
struct fuse_inode *fi = get_fuse_inode(inode);
+ int err;
+
+ err = fuse_inode_uncached_io_start(fi, fb);
+ if (err)
+ return err;
+
+ WARN_ON(ff->iomode != IOM_NONE);
+ ff->iomode = IOM_UNCACHED;
+ return 0;
+}
+
+void fuse_inode_uncached_io_end(struct fuse_inode *fi)
+{
struct fuse_backing *oldfb = NULL;
spin_lock(&fi->lock);
WARN_ON(fi->iocachectr >= 0);
- WARN_ON(ff->iomode != IOM_UNCACHED);
- ff->iomode = IOM_NONE;
fi->iocachectr++;
if (!fi->iocachectr) {
wake_up(&fi->direct_io_waitq);
@@ -134,6 +145,15 @@ void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff)
fuse_backing_put(oldfb);
}
+/* Drop uncached_io reference from passthrough open */
+static void fuse_file_uncached_io_release(struct fuse_file *ff,
+ struct fuse_inode *fi)
+{
+ WARN_ON(ff->iomode != IOM_UNCACHED);
+ ff->iomode = IOM_NONE;
+ fuse_inode_uncached_io_end(fi);
+}
+
/*
* Open flags that are allowed in combination with FOPEN_PASSTHROUGH.
* A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write
@@ -163,7 +183,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file)
return PTR_ERR(fb);
/* First passthrough file open denies caching inode io mode */
- err = fuse_file_uncached_io_start(inode, ff, fb);
+ err = fuse_file_uncached_io_open(inode, ff, fb);
if (!err)
return 0;
@@ -216,7 +236,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode)
if (ff->open_flags & FOPEN_PASSTHROUGH)
err = fuse_file_passthrough_open(inode, file);
else
- err = fuse_file_cached_io_start(inode, ff);
+ err = fuse_file_cached_io_open(inode, ff);
if (err)
goto fail;
@@ -236,8 +256,10 @@ fail:
/* No more pending io and no new io possible to inode via open/mmapped file */
void fuse_file_io_release(struct fuse_file *ff, struct inode *inode)
{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
/*
- * Last parallel dio close allows caching inode io mode.
+ * Last passthrough file close allows caching inode io mode.
* Last caching file close exits caching inode io mode.
*/
switch (ff->iomode) {
@@ -245,10 +267,10 @@ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode)
/* Nothing to do */
break;
case IOM_UNCACHED:
- fuse_file_uncached_io_end(inode, ff);
+ fuse_file_uncached_io_release(ff, fi);
break;
case IOM_CACHED:
- fuse_file_cached_io_end(inode, ff);
+ fuse_file_cached_io_release(ff, fi);
break;
}
}
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index 1567f0323858..9666d13884ce 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -225,7 +225,7 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
goto out;
res = -EINVAL;
- if (map->flags)
+ if (map->flags || map->padding)
goto out;
file = fget(map->fd);
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 322af827a232..bb3e941b9503 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -170,7 +170,7 @@ static ssize_t tag_show(struct kobject *kobj,
{
struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj);
- return sysfs_emit(buf, fs->tag);
+ return sysfs_emit(buf, "%s\n", fs->tag);
}
static struct kobj_attribute virtio_fs_tag_attr = __ATTR_RO(tag);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6502c7e776d1..34ac73cc36b1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -40,7 +40,7 @@
#include <linux/sched/mm.h>
static const struct address_space_operations hugetlbfs_aops;
-const struct file_operations hugetlbfs_file_operations;
+static const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
@@ -1301,13 +1301,14 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-const struct file_operations hugetlbfs_file_operations = {
+static const struct file_operations hugetlbfs_file_operations = {
.read_iter = hugetlbfs_read_iter,
.mmap = hugetlbfs_file_mmap,
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
.llseek = default_llseek,
.fallocate = hugetlbfs_fallocate,
+ .fop_flags = FOP_HUGE_PAGES,
};
static const struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 1d5abfdf0f22..fb0628e680c4 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -769,7 +769,7 @@ static int ioctl_getfsuuid(struct file *file, void __user *argp)
struct fsuuid2 u = { .len = sb->s_uuid_len, };
if (!sb->s_uuid_len)
- return -ENOIOCTLCMD;
+ return -ENOTTY;
memcpy(&u.uuid[0], &sb->s_uuid, sb->s_uuid_len);
@@ -781,7 +781,7 @@ static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp)
struct super_block *sb = file_inode(file)->i_sb;
if (!strlen(sb->s_sysfs_name))
- return -ENOIOCTLCMD;
+ return -ENOTTY;
struct fs_sysfs_path u = {};
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 4e8e41c8b3c0..41c8f0c68ef5 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -824,12 +824,11 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
out_unlock:
__iomap_put_folio(iter, pos, 0, folio);
- iomap_write_failed(iter->inode, pos, len);
return status;
}
-static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
+static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
size_t copied, struct folio *folio)
{
flush_dcache_folio(folio);
@@ -846,14 +845,14 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
* redo the whole thing.
*/
if (unlikely(copied < len && !folio_test_uptodate(folio)))
- return 0;
+ return false;
iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
filemap_dirty_folio(inode->i_mapping, folio);
- return copied;
+ return true;
}
-static size_t iomap_write_end_inline(const struct iomap_iter *iter,
+static void iomap_write_end_inline(const struct iomap_iter *iter,
struct folio *folio, loff_t pos, size_t copied)
{
const struct iomap *iomap = &iter->iomap;
@@ -868,42 +867,32 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter,
kunmap_local(addr);
mark_inode_dirty(iter->inode);
- return copied;
}
-/* Returns the number of bytes copied. May be 0. Cannot be an errno. */
-static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
+/*
+ * Returns true if all copied bytes have been written to the pagecache,
+ * otherwise return false.
+ */
+static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
size_t copied, struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- loff_t old_size = iter->inode->i_size;
- size_t ret;
if (srcmap->type == IOMAP_INLINE) {
- ret = iomap_write_end_inline(iter, folio, pos, copied);
- } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
- ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
- copied, &folio->page, NULL);
- } else {
- ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
+ iomap_write_end_inline(iter, folio, pos, copied);
+ return true;
}
- /*
- * Update the in-memory inode size after copying the data into the page
- * cache. It's up to the file system to write the updated size to disk,
- * preferably after I/O completion so that no stale data is exposed.
- */
- if (pos + ret > old_size) {
- i_size_write(iter->inode, pos + ret);
- iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
+ if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
+ size_t bh_written;
+
+ bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
+ len, copied, &folio->page, NULL);
+ WARN_ON_ONCE(bh_written != copied && bh_written != 0);
+ return bh_written == copied;
}
- __iomap_put_folio(iter, pos, ret, folio);
- if (old_size < pos)
- pagecache_isize_extended(iter->inode, old_size, pos);
- if (ret < len)
- iomap_write_failed(iter->inode, pos + ret, len - ret);
- return ret;
+ return __iomap_write_end(iter->inode, pos, len, copied, folio);
}
static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
@@ -911,16 +900,18 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
loff_t length = iomap_length(iter);
size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
loff_t pos = iter->pos;
- ssize_t written = 0;
+ ssize_t total_written = 0;
long status = 0;
struct address_space *mapping = iter->inode->i_mapping;
unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
do {
struct folio *folio;
+ loff_t old_size;
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
+ size_t written; /* Bytes have been written */
bytes = iov_iter_count(i);
retry:
@@ -950,8 +941,10 @@ retry:
}
status = iomap_write_begin(iter, pos, bytes, &folio);
- if (unlikely(status))
+ if (unlikely(status)) {
+ iomap_write_failed(iter->inode, pos, bytes);
break;
+ }
if (iter->iomap.flags & IOMAP_F_STALE)
break;
@@ -963,19 +956,37 @@ retry:
flush_dcache_folio(folio);
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
- status = iomap_write_end(iter, pos, bytes, copied, folio);
+ written = iomap_write_end(iter, pos, bytes, copied, folio) ?
+ copied : 0;
+
+ /*
+ * Update the in-memory inode size after copying the data into
+ * the page cache. It's up to the file system to write the
+ * updated size to disk, preferably after I/O completion so that
+ * no stale data is exposed. Only once that's done can we
+ * unlock and release the folio.
+ */
+ old_size = iter->inode->i_size;
+ if (pos + written > old_size) {
+ i_size_write(iter->inode, pos + written);
+ iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
+ }
+ __iomap_put_folio(iter, pos, written, folio);
- if (unlikely(copied != status))
- iov_iter_revert(i, copied - status);
+ if (old_size < pos)
+ pagecache_isize_extended(iter->inode, old_size, pos);
cond_resched();
- if (unlikely(status == 0)) {
+ if (unlikely(written == 0)) {
/*
* A short copy made iomap_write_end() reject the
* thing entirely. Might be memory poisoning
* halfway through, might be a race with munmap,
* might be severe memory pressure.
*/
+ iomap_write_failed(iter->inode, pos, bytes);
+ iov_iter_revert(i, copied);
+
if (chunk > PAGE_SIZE)
chunk /= 2;
if (copied) {
@@ -983,17 +994,17 @@ retry:
goto retry;
}
} else {
- pos += status;
- written += status;
- length -= status;
+ pos += written;
+ total_written += written;
+ length -= written;
}
} while (iov_iter_count(i) && length);
if (status == -EAGAIN) {
- iov_iter_revert(i, written);
+ iov_iter_revert(i, total_written);
return -EAGAIN;
}
- return written ? written : status;
+ return total_written ? total_written : status;
}
ssize_t
@@ -1322,6 +1333,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
int status;
size_t offset;
size_t bytes = min_t(u64, SIZE_MAX, length);
+ bool ret;
status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
@@ -1333,8 +1345,9 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
if (bytes > folio_size(folio) - offset)
bytes = folio_size(folio) - offset;
- bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
- if (WARN_ON_ONCE(bytes == 0))
+ ret = iomap_write_end(iter, pos, bytes, bytes, folio);
+ __iomap_put_folio(iter, pos, bytes, folio);
+ if (WARN_ON_ONCE(!ret))
return -EIO;
cond_resched();
@@ -1383,6 +1396,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
int status;
size_t offset;
size_t bytes = min_t(u64, SIZE_MAX, length);
+ bool ret;
status = iomap_write_begin(iter, pos, bytes, &folio);
if (status)
@@ -1397,8 +1411,9 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
folio_zero_range(folio, offset, bytes);
folio_mark_accessed(folio);
- bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
- if (WARN_ON_ONCE(bytes == 0))
+ ret = iomap_write_end(iter, pos, bytes, bytes, folio);
+ __iomap_put_folio(iter, pos, bytes, folio);
+ if (WARN_ON_ONCE(!ret))
return -EIO;
pos += bytes;
@@ -1958,18 +1973,13 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
return error;
}
-static int iomap_do_writepage(struct folio *folio,
- struct writeback_control *wbc, void *data)
-{
- return iomap_writepage_map(data, wbc, folio);
-}
-
int
iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
struct iomap_writepage_ctx *wpc,
const struct iomap_writeback_ops *ops)
{
- int ret;
+ struct folio *folio = NULL;
+ int error;
/*
* Writeback from reclaim context should never happen except in the case
@@ -1980,8 +1990,9 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
return -EIO;
wpc->ops = ops;
- ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
- return iomap_submit_ioend(wpc, ret);
+ while ((folio = writeback_iter(mapping, wbc, folio, &error)))
+ error = iomap_writepage_map(wpc, wbc, folio);
+ return iomap_submit_ioend(wpc, error);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 00224f3a8d6e..defb4162c3d5 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -1110,6 +1110,9 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
return rc;
request = PAD(sizeof(struct jffs2_raw_xattr) + strlen(xname) + 1 + size);
+ if (request > c->sector_size - c->cleanmarker_size)
+ return -ERANGE;
+
rc = jffs2_reserve_space(c, request, &length,
ALLOC_NORMAL, JFFS2_SUMMARY_XATTR_SIZE);
if (rc) {
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e9df2f87072c..8502ef68459b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -636,11 +636,18 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
* each file a separate locking class. Let's differentiate on
* whether the file has mmap or not for now.
*
- * Both paths of the branch look the same. They're supposed to
+ * For similar reasons, writable and readonly files are given different
+ * lockdep key, because the writable file /sys/power/resume may call vfs
+ * lookup helpers for arbitrary paths and readonly files can be read by
+ * overlayfs from vfs helpers when sysfs is a lower layer of overalyfs.
+ *
+ * All three cases look the same. They're supposed to
* look that way and give @of->mutex different static lockdep keys.
*/
if (has_mmap)
mutex_init(&of->mutex);
+ else if (file->f_mode & FMODE_WRITE)
+ mutex_init(&of->mutex);
else
mutex_init(&of->mutex);
diff --git a/fs/libfs.c b/fs/libfs.c
index 3a6f2cb364f8..b635ee5adbcc 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -295,6 +295,18 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
return 0;
}
+static int simple_offset_replace(struct offset_ctx *octx, struct dentry *dentry,
+ long offset)
+{
+ int ret;
+
+ ret = mtree_store(&octx->mt, offset, dentry, GFP_KERNEL);
+ if (ret)
+ return ret;
+ offset_set(dentry, offset);
+ return 0;
+}
+
/**
* simple_offset_remove - Remove an entry to a directory's offset map
* @octx: directory offset ctx to be updated
@@ -346,12 +358,45 @@ int simple_offset_empty(struct dentry *dentry)
}
/**
+ * simple_offset_rename - handle directory offsets for rename
+ * @old_dir: parent directory of source entry
+ * @old_dentry: dentry of source entry
+ * @new_dir: parent_directory of destination entry
+ * @new_dentry: dentry of destination
+ *
+ * Caller provides appropriate serialization.
+ *
+ * User space expects the directory offset value of the replaced
+ * (new) directory entry to be unchanged after a rename.
+ *
+ * Returns zero on success, a negative errno value on failure.
+ */
+int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
+ struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
+ long new_offset = dentry2offset(new_dentry);
+
+ simple_offset_remove(old_ctx, old_dentry);
+
+ if (new_offset) {
+ offset_set(new_dentry, 0);
+ return simple_offset_replace(new_ctx, old_dentry, new_offset);
+ }
+ return simple_offset_add(new_ctx, old_dentry);
+}
+
+/**
* simple_offset_rename_exchange - exchange rename with directory offsets
* @old_dir: parent of dentry being moved
* @old_dentry: dentry being moved
* @new_dir: destination parent
* @new_dentry: destination dentry
*
+ * This API preserves the directory offset values. Caller provides
+ * appropriate serialization.
+ *
* Returns zero on success. Otherwise a negative errno is returned and the
* rename is rolled back.
*/
@@ -369,11 +414,11 @@ int simple_offset_rename_exchange(struct inode *old_dir,
simple_offset_remove(old_ctx, old_dentry);
simple_offset_remove(new_ctx, new_dentry);
- ret = simple_offset_add(new_ctx, old_dentry);
+ ret = simple_offset_replace(new_ctx, old_dentry, new_index);
if (ret)
goto out_restore;
- ret = simple_offset_add(old_ctx, new_dentry);
+ ret = simple_offset_replace(old_ctx, new_dentry, old_index);
if (ret) {
simple_offset_remove(new_ctx, old_dentry);
goto out_restore;
@@ -388,10 +433,8 @@ int simple_offset_rename_exchange(struct inode *old_dir,
return 0;
out_restore:
- offset_set(old_dentry, old_index);
- mtree_store(&old_ctx->mt, old_index, old_dentry, GFP_KERNEL);
- offset_set(new_dentry, new_index);
- mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL);
+ (void)simple_offset_replace(old_ctx, old_dentry, old_index);
+ (void)simple_offset_replace(new_ctx, new_dentry, new_index);
return ret;
}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 7cbd2b9f4d11..7f9a2d8aa420 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -20,11 +20,11 @@
#include <linux/mpage.h>
#include <linux/vfs.h>
#include <linux/writeback.h>
+#include <linux/fs_context.h>
static int minix_write_inode(struct inode *inode,
struct writeback_control *wbc);
static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
-static int minix_remount (struct super_block * sb, int * flags, char * data);
static void minix_evict_inode(struct inode *inode)
{
@@ -111,19 +111,19 @@ static const struct super_operations minix_sops = {
.evict_inode = minix_evict_inode,
.put_super = minix_put_super,
.statfs = minix_statfs,
- .remount_fs = minix_remount,
};
-static int minix_remount (struct super_block * sb, int * flags, char * data)
+static int minix_reconfigure(struct fs_context *fc)
{
- struct minix_sb_info * sbi = minix_sb(sb);
struct minix_super_block * ms;
+ struct super_block *sb = fc->root->d_sb;
+ struct minix_sb_info * sbi = sb->s_fs_info;
sync_filesystem(sb);
ms = sbi->s_ms;
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb))
return 0;
- if (*flags & SB_RDONLY) {
+ if (fc->sb_flags & SB_RDONLY) {
if (ms->s_state & MINIX_VALID_FS ||
!(sbi->s_mount_state & MINIX_VALID_FS))
return 0;
@@ -170,7 +170,7 @@ static bool minix_check_superblock(struct super_block *sb)
return true;
}
-static int minix_fill_super(struct super_block *s, void *data, int silent)
+static int minix_fill_super(struct super_block *s, struct fs_context *fc)
{
struct buffer_head *bh;
struct buffer_head **map;
@@ -180,6 +180,7 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
struct inode *root_inode;
struct minix_sb_info *sbi;
int ret = -EINVAL;
+ int silent = fc->sb_flags & SB_SILENT;
sbi = kzalloc(sizeof(struct minix_sb_info), GFP_KERNEL);
if (!sbi)
@@ -371,6 +372,23 @@ out:
return ret;
}
+static int minix_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, minix_fill_super);
+}
+
+static const struct fs_context_operations minix_context_ops = {
+ .get_tree = minix_get_tree,
+ .reconfigure = minix_reconfigure,
+};
+
+static int minix_init_fs_context(struct fs_context *fc)
+{
+ fc->ops = &minix_context_ops;
+
+ return 0;
+}
+
static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
@@ -680,18 +698,12 @@ void minix_truncate(struct inode * inode)
V2_minix_truncate(inode);
}
-static struct dentry *minix_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super);
-}
-
static struct file_system_type minix_fs_type = {
- .owner = THIS_MODULE,
- .name = "minix",
- .mount = minix_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "minix",
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = minix_init_fs_context,
};
MODULE_ALIAS_FS("minix");
diff --git a/fs/namei.c b/fs/namei.c
index c5b2a25be7d0..cb5dde0e309f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2422,6 +2422,14 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
if (!f.file)
return ERR_PTR(-EBADF);
+ if (flags & LOOKUP_LINKAT_EMPTY) {
+ if (f.file->f_cred != current_cred() &&
+ !ns_capable(f.file->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
+ fdput(f);
+ return ERR_PTR(-ENOENT);
+ }
+ }
+
dentry = f.file->f_path.dentry;
if (*s && unlikely(!d_can_lookup(dentry))) {
@@ -4641,14 +4649,13 @@ int do_linkat(int olddfd, struct filename *old, int newdfd,
goto out_putnames;
}
/*
- * To use null names we require CAP_DAC_READ_SEARCH
+ * To use null names we require CAP_DAC_READ_SEARCH or
+ * that the open-time creds of the dfd matches current.
* This ensures that not everyone will be able to create
- * handlink using the passed filedescriptor.
+ * a hardlink using the passed file descriptor.
*/
- if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) {
- error = -ENOENT;
- goto out_putnames;
- }
+ if (flags & AT_EMPTY_PATH)
+ how |= LOOKUP_LINKAT_EMPTY;
if (flags & AT_SYMLINK_FOLLOW)
how |= LOOKUP_FOLLOW;
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index d4d1d799819e..8e6781e0b10b 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -11,7 +11,8 @@ netfs-y := \
main.o \
misc.o \
objects.o \
- output.o
+ write_collect.o \
+ write_issue.o
netfs-$(CONFIG_NETFS_STATS) += stats.o
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 3298c29b5548..a6bb03bea920 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -10,8 +10,11 @@
#include "internal.h"
/*
- * Unlock the folios in a read operation. We need to set PG_fscache on any
+ * Unlock the folios in a read operation. We need to set PG_writeback on any
* folios we're going to write back before we unlock them.
+ *
+ * Note that if the deprecated NETFS_RREQ_USE_PGPRIV2 is set then we use
+ * PG_private_2 and do a direct write to the cache from here instead.
*/
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
@@ -48,14 +51,14 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
xas_for_each(&xas, folio, last_page) {
loff_t pg_end;
bool pg_failed = false;
- bool folio_started;
+ bool wback_to_cache = false;
+ bool folio_started = false;
if (xas_retry(&xas, folio))
continue;
pg_end = folio_pos(folio) + folio_size(folio) - 1;
- folio_started = false;
for (;;) {
loff_t sreq_end;
@@ -63,10 +66,16 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
pg_failed = true;
break;
}
- if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
- trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
- folio_start_fscache(folio);
- folio_started = true;
+ if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
+ if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE,
+ &subreq->flags)) {
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
+ folio_start_private_2(folio);
+ folio_started = true;
+ }
+ } else {
+ wback_to_cache |=
+ test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
}
pg_failed |= subreq_failed;
sreq_end = subreq->start + subreq->len - 1;
@@ -98,6 +107,11 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
kfree(finfo);
}
folio_mark_uptodate(folio);
+ if (wback_to_cache && !WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
+ folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
+ filemap_dirty_folio(folio->mapping, folio);
+ }
}
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
@@ -116,7 +130,9 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
}
static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
- loff_t *_start, size_t *_len, loff_t i_size)
+ unsigned long long *_start,
+ unsigned long long *_len,
+ unsigned long long i_size)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
@@ -266,7 +282,7 @@ int netfs_read_folio(struct file *file, struct folio *folio)
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
goto discard;
- netfs_stat(&netfs_n_rh_readpage);
+ netfs_stat(&netfs_n_rh_read_folio);
trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
/* Set up the output buffer */
@@ -450,7 +466,7 @@ retry:
if (!netfs_is_cache_enabled(ctx) &&
netfs_skip_folio_read(folio, pos, len, false)) {
netfs_stat(&netfs_n_rh_write_zskip);
- goto have_folio_no_wait;
+ goto have_folio;
}
rreq = netfs_alloc_request(mapping, file,
@@ -491,10 +507,6 @@ retry:
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio:
- ret = folio_wait_fscache_killable(folio);
- if (ret < 0)
- goto error;
-have_folio_no_wait:
*_folio = folio;
_leave(" = 0");
return 0;
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 9a0d32e4b422..1121601536d1 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Network filesystem high-level write support.
+/* Network filesystem high-level buffered write support.
*
* Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -26,25 +26,15 @@ enum netfs_how_to_modify {
NETFS_FLUSH_CONTENT, /* Flush incompatible content. */
};
-static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);
-
static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
- if (netfs_group && !folio_get_private(folio))
- folio_attach_private(folio, netfs_get_group(netfs_group));
-}
+ void *priv = folio_get_private(folio);
-#if IS_ENABLED(CONFIG_FSCACHE)
-static void netfs_folio_start_fscache(bool caching, struct folio *folio)
-{
- if (caching)
- folio_start_fscache(folio);
-}
-#else
-static void netfs_folio_start_fscache(bool caching, struct folio *folio)
-{
+ if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
+ folio_attach_private(folio, netfs_get_group(netfs_group));
+ else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
+ folio_detach_private(folio);
}
-#endif
/*
* Decide how we should modify a folio. We might be attempting to do
@@ -63,11 +53,12 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
bool maybe_trouble)
{
struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct netfs_group *group = netfs_folio_group(folio);
loff_t pos = folio_file_pos(folio);
_enter("");
- if (netfs_folio_group(folio) != netfs_group)
+ if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE)
return NETFS_FLUSH_CONTENT;
if (folio_test_uptodate(folio))
@@ -81,16 +72,12 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
if (file->f_mode & FMODE_READ)
goto no_write_streaming;
- if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
- goto no_write_streaming;
if (netfs_is_cache_enabled(ctx)) {
/* We don't want to get a streaming write on a file that loses
* caching service temporarily because the backing store got
* culled.
*/
- if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
- set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags);
goto no_write_streaming;
}
@@ -130,6 +117,37 @@ static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
mapping_gfp_mask(mapping));
}
+/*
+ * Update i_size and estimate the update to i_blocks to reflect the additional
+ * data written into the pagecache until we can find out from the server what
+ * the values actually are.
+ */
+static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
+ loff_t i_size, loff_t pos, size_t copied)
+{
+ blkcnt_t add;
+ size_t gap;
+
+ if (ctx->ops->update_i_size) {
+ ctx->ops->update_i_size(inode, pos);
+ return;
+ }
+
+ i_size_write(inode, pos);
+#if IS_ENABLED(CONFIG_FSCACHE)
+ fscache_update_cookie(ctx->cache, NULL, &pos);
+#endif
+
+ gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
+ if (copied > gap) {
+ add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);
+
+ inode->i_blocks = min_t(blkcnt_t,
+ DIV_ROUND_UP(pos, SECTOR_SIZE),
+ inode->i_blocks + add);
+ }
+}
+
/**
* netfs_perform_write - Copy data into the pagecache.
* @iocb: The operation parameters
@@ -160,11 +178,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
};
struct netfs_io_request *wreq = NULL;
struct netfs_folio *finfo;
- struct folio *folio;
+ struct folio *folio, *writethrough = NULL;
enum netfs_how_to_modify howto;
enum netfs_folio_trace trace;
unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
- ssize_t written = 0, ret;
+ ssize_t written = 0, ret, ret2;
loff_t i_size, pos = iocb->ki_pos, from, to;
size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
bool maybe_trouble = false;
@@ -172,15 +190,14 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
) {
- if (pos < i_size_read(inode)) {
- ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
- if (ret < 0) {
- goto out;
- }
- }
-
wbc_attach_fdatawrite_inode(&wbc, mapping->host);
+ ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
+ if (ret < 0) {
+ wbc_detach_inode(&wbc);
+ goto out;
+ }
+
wreq = netfs_begin_writethrough(iocb, iter->count);
if (IS_ERR(wreq)) {
wbc_detach_inode(&wbc);
@@ -190,7 +207,9 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
}
if (!is_sync_kiocb(iocb))
wreq->iocb = iocb;
- wreq->cleanup = netfs_cleanup_buffered_write;
+ netfs_stat(&netfs_n_wh_writethrough);
+ } else {
+ netfs_stat(&netfs_n_wh_buffered_write);
}
do {
@@ -231,6 +250,16 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
offset = pos & (flen - 1);
part = min_t(size_t, flen - offset, part);
+ /* Wait for writeback to complete. The writeback engine owns
+ * the info in folio->private and may change it until it
+ * removes the WB mark.
+ */
+ if (folio_get_private(folio) &&
+ folio_wait_writeback_killable(folio)) {
+ ret = written ? -EINTR : -ERESTARTSYS;
+ goto error_folio_unlock;
+ }
+
if (signal_pending(current)) {
ret = written ? -EINTR : -ERESTARTSYS;
goto error_folio_unlock;
@@ -305,6 +334,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
maybe_trouble = true;
iov_iter_revert(iter, copied);
copied = 0;
+ folio_unlock(folio);
goto retry;
}
netfs_set_group(folio, netfs_group);
@@ -352,41 +382,22 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
trace_netfs_folio(folio, trace);
/* Update the inode size if we moved the EOF marker */
- i_size = i_size_read(inode);
pos += copied;
- if (pos > i_size) {
- if (ctx->ops->update_i_size) {
- ctx->ops->update_i_size(inode, pos);
- } else {
- i_size_write(inode, pos);
-#if IS_ENABLED(CONFIG_FSCACHE)
- fscache_update_cookie(ctx->cache, NULL, &pos);
-#endif
- }
- }
+ i_size = i_size_read(inode);
+ if (pos > i_size)
+ netfs_update_i_size(ctx, inode, i_size, pos, copied);
written += copied;
if (likely(!wreq)) {
folio_mark_dirty(folio);
+ folio_unlock(folio);
} else {
- if (folio_test_dirty(folio))
- /* Sigh. mmap. */
- folio_clear_dirty_for_io(folio);
- /* We make multiple writes to the folio... */
- if (!folio_test_writeback(folio)) {
- folio_wait_fscache(folio);
- folio_start_writeback(folio);
- folio_start_fscache(folio);
- if (wreq->iter.count == 0)
- trace_netfs_folio(folio, netfs_folio_trace_wthru);
- else
- trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
- }
- netfs_advance_writethrough(wreq, copied,
- offset + copied == flen);
+ netfs_advance_writethrough(wreq, &wbc, folio, copied,
+ offset + copied == flen,
+ &writethrough);
+ /* Folio unlocked */
}
retry:
- folio_unlock(folio);
folio_put(folio);
folio = NULL;
@@ -394,11 +405,16 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
} while (iov_iter_count(iter));
out:
+ if (likely(written) && ctx->ops->post_modify)
+ ctx->ops->post_modify(inode);
+
if (unlikely(wreq)) {
- ret = netfs_end_writethrough(wreq, iocb);
+ ret2 = netfs_end_writethrough(wreq, &wbc, writethrough);
wbc_detach_inode(&wbc);
- if (ret == -EIOCBQUEUED)
- return ret;
+ if (ret2 == -EIOCBQUEUED)
+ return ret2;
+ if (ret == 0)
+ ret = ret2;
}
iocb->ki_pos += written;
@@ -504,9 +520,11 @@ EXPORT_SYMBOL(netfs_file_write_iter);
*/
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
+ struct netfs_group *group;
struct folio *folio = page_folio(vmf->page);
struct file *file = vmf->vma->vm_file;
struct inode *inode = file_inode(file);
+ struct netfs_inode *ictx = netfs_inode(inode);
vm_fault_t ret = VM_FAULT_RETRY;
int err;
@@ -514,11 +532,13 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
sb_start_pagefault(inode->i_sb);
- if (folio_wait_writeback_killable(folio))
+ if (folio_lock_killable(folio) < 0)
goto out;
- if (folio_lock_killable(folio) < 0)
+ if (folio_wait_writeback_killable(folio)) {
+ ret = VM_FAULT_LOCKED;
goto out;
+ }
/* Can we see a streaming write here? */
if (WARN_ON(!folio_test_uptodate(folio))) {
@@ -526,7 +546,8 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
goto out;
}
- if (netfs_folio_group(folio) != netfs_group) {
+ group = netfs_folio_group(folio);
+ if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) {
folio_unlock(folio);
err = filemap_fdatawait_range(inode->i_mapping,
folio_pos(folio),
@@ -550,708 +571,11 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
netfs_set_group(folio, netfs_group);
file_update_time(file);
+ if (ictx->ops->post_modify)
+ ictx->ops->post_modify(inode);
ret = VM_FAULT_LOCKED;
out:
sb_end_pagefault(inode->i_sb);
return ret;
}
EXPORT_SYMBOL(netfs_page_mkwrite);
-
-/*
- * Kill all the pages in the given range
- */
-static void netfs_kill_pages(struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("%llx-%llx", start, start + len - 1);
-
- do {
- _debug("kill %lx (to %lx)", index, last);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = folio_next_index(folio);
-
- trace_netfs_folio(folio, netfs_folio_trace_kill);
- folio_clear_uptodate(folio);
- if (folio_test_fscache(folio))
- folio_end_fscache(folio);
- folio_end_writeback(folio);
- folio_lock(folio);
- generic_error_remove_folio(mapping, folio);
- folio_unlock(folio);
- folio_put(folio);
-
- } while (index = next, index <= last);
-
- _leave("");
-}
-
-/*
- * Redirty all the pages in a given range.
- */
-static void netfs_redirty_pages(struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("%llx-%llx", start, start + len - 1);
-
- do {
- _debug("redirty %llx @%llx", len, start);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = folio_next_index(folio);
- trace_netfs_folio(folio, netfs_folio_trace_redirty);
- filemap_dirty_folio(mapping, folio);
- if (folio_test_fscache(folio))
- folio_end_fscache(folio);
- folio_end_writeback(folio);
- folio_put(folio);
- } while (index = next, index <= last);
-
- balance_dirty_pages_ratelimited(mapping);
-
- _leave("");
-}
-
-/*
- * Completion of write to server
- */
-static void netfs_pages_written_back(struct netfs_io_request *wreq)
-{
- struct address_space *mapping = wreq->mapping;
- struct netfs_folio *finfo;
- struct netfs_group *group = NULL;
- struct folio *folio;
- pgoff_t last;
- int gcount = 0;
-
- XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);
-
- _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
-
- rcu_read_lock();
-
- last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
- xas_for_each(&xas, folio, last) {
- WARN(!folio_test_writeback(folio),
- "bad %zx @%llx page %lx %lx\n",
- wreq->len, wreq->start, folio->index, last);
-
- if ((finfo = netfs_folio_info(folio))) {
- /* Streaming writes cannot be redirtied whilst under
- * writeback, so discard the streaming record.
- */
- folio_detach_private(folio);
- group = finfo->netfs_group;
- gcount++;
- trace_netfs_folio(folio, netfs_folio_trace_clear_s);
- kfree(finfo);
- } else if ((group = netfs_folio_group(folio))) {
- /* Need to detach the group pointer if the page didn't
- * get redirtied. If it has been redirtied, then it
- * must be within the same group.
- */
- if (folio_test_dirty(folio)) {
- trace_netfs_folio(folio, netfs_folio_trace_redirtied);
- goto end_wb;
- }
- if (folio_trylock(folio)) {
- if (!folio_test_dirty(folio)) {
- folio_detach_private(folio);
- gcount++;
- trace_netfs_folio(folio, netfs_folio_trace_clear_g);
- } else {
- trace_netfs_folio(folio, netfs_folio_trace_redirtied);
- }
- folio_unlock(folio);
- goto end_wb;
- }
-
- xas_pause(&xas);
- rcu_read_unlock();
- folio_lock(folio);
- if (!folio_test_dirty(folio)) {
- folio_detach_private(folio);
- gcount++;
- trace_netfs_folio(folio, netfs_folio_trace_clear_g);
- } else {
- trace_netfs_folio(folio, netfs_folio_trace_redirtied);
- }
- folio_unlock(folio);
- rcu_read_lock();
- } else {
- trace_netfs_folio(folio, netfs_folio_trace_clear);
- }
- end_wb:
- if (folio_test_fscache(folio))
- folio_end_fscache(folio);
- xas_advance(&xas, folio_next_index(folio) - 1);
- folio_end_writeback(folio);
- }
-
- rcu_read_unlock();
- netfs_put_group_many(group, gcount);
- _leave("");
-}
-
-/*
- * Deal with the disposition of the folios that are under writeback to close
- * out the operation.
- */
-static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
-{
- struct address_space *mapping = wreq->mapping;
-
- _enter("");
-
- switch (wreq->error) {
- case 0:
- netfs_pages_written_back(wreq);
- break;
-
- default:
- pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
- fallthrough;
- case -EACCES:
- case -EPERM:
- case -ENOKEY:
- case -EKEYEXPIRED:
- case -EKEYREJECTED:
- case -EKEYREVOKED:
- case -ENETRESET:
- case -EDQUOT:
- case -ENOSPC:
- netfs_redirty_pages(mapping, wreq->start, wreq->len);
- break;
-
- case -EROFS:
- case -EIO:
- case -EREMOTEIO:
- case -EFBIG:
- case -ENOENT:
- case -ENOMEDIUM:
- case -ENXIO:
- netfs_kill_pages(mapping, wreq->start, wreq->len);
- break;
- }
-
- if (wreq->error)
- mapping_set_error(mapping, wreq->error);
- if (wreq->netfs_ops->done)
- wreq->netfs_ops->done(wreq);
-}
-
-/*
- * Extend the region to be written back to include subsequent contiguously
- * dirty pages if possible, but don't sleep while doing so.
- *
- * If this page holds new content, then we can include filler zeros in the
- * writeback.
- */
-static void netfs_extend_writeback(struct address_space *mapping,
- struct netfs_group *group,
- struct xa_state *xas,
- long *_count,
- loff_t start,
- loff_t max_len,
- bool caching,
- size_t *_len,
- size_t *_top)
-{
- struct netfs_folio *finfo;
- struct folio_batch fbatch;
- struct folio *folio;
- unsigned int i;
- pgoff_t index = (start + *_len) / PAGE_SIZE;
- size_t len;
- void *priv;
- bool stop = true;
-
- folio_batch_init(&fbatch);
-
- do {
- /* Firstly, we gather up a batch of contiguous dirty pages
- * under the RCU read lock - but we can't clear the dirty flags
- * there if any of those pages are mapped.
- */
- rcu_read_lock();
-
- xas_for_each(xas, folio, ULONG_MAX) {
- stop = true;
- if (xas_retry(xas, folio))
- continue;
- if (xa_is_value(folio))
- break;
- if (folio->index != index) {
- xas_reset(xas);
- break;
- }
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(xas);
- continue;
- }
-
- /* Has the folio moved or been split? */
- if (unlikely(folio != xas_reload(xas))) {
- folio_put(folio);
- xas_reset(xas);
- break;
- }
-
- if (!folio_trylock(folio)) {
- folio_put(folio);
- xas_reset(xas);
- break;
- }
- if (!folio_test_dirty(folio) ||
- folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- xas_reset(xas);
- break;
- }
-
- stop = false;
- len = folio_size(folio);
- priv = folio_get_private(folio);
- if ((const struct netfs_group *)priv != group) {
- stop = true;
- finfo = netfs_folio_info(folio);
- if (finfo->netfs_group != group ||
- finfo->dirty_offset > 0) {
- folio_unlock(folio);
- folio_put(folio);
- xas_reset(xas);
- break;
- }
- len = finfo->dirty_len;
- }
-
- *_top += folio_size(folio);
- index += folio_nr_pages(folio);
- *_count -= folio_nr_pages(folio);
- *_len += len;
- if (*_len >= max_len || *_count <= 0)
- stop = true;
-
- if (!folio_batch_add(&fbatch, folio))
- break;
- if (stop)
- break;
- }
-
- xas_pause(xas);
- rcu_read_unlock();
-
- /* Now, if we obtained any folios, we can shift them to being
- * writable and mark them for caching.
- */
- if (!folio_batch_count(&fbatch))
- break;
-
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- folio = fbatch.folios[i];
- trace_netfs_folio(folio, netfs_folio_trace_store_plus);
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- folio_start_writeback(folio);
- netfs_folio_start_fscache(caching, folio);
- folio_unlock(folio);
- }
-
- folio_batch_release(&fbatch);
- cond_resched();
- } while (!stop);
-}
-
-/*
- * Synchronously write back the locked page and any subsequent non-locked dirty
- * pages.
- */
-static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
- struct writeback_control *wbc,
- struct netfs_group *group,
- struct xa_state *xas,
- struct folio *folio,
- unsigned long long start,
- unsigned long long end)
-{
- struct netfs_io_request *wreq;
- struct netfs_folio *finfo;
- struct netfs_inode *ctx = netfs_inode(mapping->host);
- unsigned long long i_size = i_size_read(&ctx->inode);
- size_t len, max_len;
- bool caching = netfs_is_cache_enabled(ctx);
- long count = wbc->nr_to_write;
- int ret;
-
- _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching);
-
- wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
- NETFS_WRITEBACK);
- if (IS_ERR(wreq)) {
- folio_unlock(folio);
- return PTR_ERR(wreq);
- }
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- folio_start_writeback(folio);
- netfs_folio_start_fscache(caching, folio);
-
- count -= folio_nr_pages(folio);
-
- /* Find all consecutive lockable dirty pages that have contiguous
- * written regions, stopping when we find a page that is not
- * immediately lockable, is not dirty or is missing, or we reach the
- * end of the range.
- */
- trace_netfs_folio(folio, netfs_folio_trace_store);
-
- len = wreq->len;
- finfo = netfs_folio_info(folio);
- if (finfo) {
- start += finfo->dirty_offset;
- if (finfo->dirty_offset + finfo->dirty_len != len) {
- len = finfo->dirty_len;
- goto cant_expand;
- }
- len = finfo->dirty_len;
- }
-
- if (start < i_size) {
- /* Trim the write to the EOF; the extra data is ignored. Also
- * put an upper limit on the size of a single storedata op.
- */
- max_len = 65536 * 4096;
- max_len = min_t(unsigned long long, max_len, end - start + 1);
- max_len = min_t(unsigned long long, max_len, i_size - start);
-
- if (len < max_len)
- netfs_extend_writeback(mapping, group, xas, &count, start,
- max_len, caching, &len, &wreq->upper_len);
- }
-
-cant_expand:
- len = min_t(unsigned long long, len, i_size - start);
-
- /* We now have a contiguous set of dirty pages, each with writeback
- * set; the first page is still locked at this point, but all the rest
- * have been unlocked.
- */
- folio_unlock(folio);
- wreq->start = start;
- wreq->len = len;
-
- if (start < i_size) {
- _debug("write back %zx @%llx [%llx]", len, start, i_size);
-
- /* Speculatively write to the cache. We have to fix this up
- * later if the store fails.
- */
- wreq->cleanup = netfs_cleanup_buffered_write;
-
- iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
- wreq->upper_len);
- __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
- ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
- if (ret == 0 || ret == -EIOCBQUEUED)
- wbc->nr_to_write -= len / PAGE_SIZE;
- } else {
- _debug("write discard %zx @%llx [%llx]", len, start, i_size);
-
- /* The dirty region was entirely beyond the EOF. */
- fscache_clear_page_bits(mapping, start, len, caching);
- netfs_pages_written_back(wreq);
- ret = 0;
- }
-
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
- _leave(" = 1");
- return 1;
-}
-
-/*
- * Write a region of pages back to the server
- */
-static ssize_t netfs_writepages_begin(struct address_space *mapping,
- struct writeback_control *wbc,
- struct netfs_group *group,
- struct xa_state *xas,
- unsigned long long *_start,
- unsigned long long end)
-{
- const struct netfs_folio *finfo;
- struct folio *folio;
- unsigned long long start = *_start;
- ssize_t ret;
- void *priv;
- int skips = 0;
-
- _enter("%llx,%llx,", start, end);
-
-search_again:
- /* Find the first dirty page in the group. */
- rcu_read_lock();
-
- for (;;) {
- folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
- if (xas_retry(xas, folio) || xa_is_value(folio))
- continue;
- if (!folio)
- break;
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(xas);
- continue;
- }
-
- if (unlikely(folio != xas_reload(xas))) {
- folio_put(folio);
- xas_reset(xas);
- continue;
- }
-
- /* Skip any dirty folio that's not in the group of interest. */
- priv = folio_get_private(folio);
- if ((const struct netfs_group *)priv != group) {
- finfo = netfs_folio_info(folio);
- if (finfo->netfs_group != group) {
- folio_put(folio);
- continue;
- }
- }
-
- xas_pause(xas);
- break;
- }
- rcu_read_unlock();
- if (!folio)
- return 0;
-
- start = folio_pos(folio); /* May regress with THPs */
-
- _debug("wback %lx", folio->index);
-
- /* At this point we hold neither the i_pages lock nor the page lock:
- * the page may be truncated or invalidated (changing page->mapping to
- * NULL), or even swizzled back from swapper_space to tmpfs file
- * mapping
- */
-lock_again:
- if (wbc->sync_mode != WB_SYNC_NONE) {
- ret = folio_lock_killable(folio);
- if (ret < 0)
- return ret;
- } else {
- if (!folio_trylock(folio))
- goto search_again;
- }
-
- if (folio->mapping != mapping ||
- !folio_test_dirty(folio)) {
- start += folio_size(folio);
- folio_unlock(folio);
- goto search_again;
- }
-
- if (folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- if (wbc->sync_mode != WB_SYNC_NONE) {
- folio_wait_writeback(folio);
-#ifdef CONFIG_FSCACHE
- folio_wait_fscache(folio);
-#endif
- goto lock_again;
- }
-
- start += folio_size(folio);
- if (wbc->sync_mode == WB_SYNC_NONE) {
- if (skips >= 5 || need_resched()) {
- ret = 0;
- goto out;
- }
- skips++;
- }
- goto search_again;
- }
-
- ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
- folio, start, end);
-out:
- if (ret > 0)
- *_start = start + ret;
- _leave(" = %zd [%llx]", ret, *_start);
- return ret;
-}
-
-/*
- * Write a region of pages back to the server
- */
-static int netfs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- struct netfs_group *group,
- unsigned long long *_start,
- unsigned long long end)
-{
- ssize_t ret;
-
- XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
-
- do {
- ret = netfs_writepages_begin(mapping, wbc, group, &xas,
- _start, end);
- if (ret > 0 && wbc->nr_to_write > 0)
- cond_resched();
- } while (ret > 0 && wbc->nr_to_write > 0);
-
- return ret > 0 ? 0 : ret;
-}
-
-/*
- * write some of the pending data back to the server
- */
-int netfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct netfs_group *group = NULL;
- loff_t start, end;
- int ret;
-
- _enter("");
-
- /* We have to be careful as we can end up racing with setattr()
- * truncating the pagecache since the caller doesn't take a lock here
- * to prevent it.
- */
-
- if (wbc->range_cyclic && mapping->writeback_index) {
- start = mapping->writeback_index * PAGE_SIZE;
- ret = netfs_writepages_region(mapping, wbc, group,
- &start, LLONG_MAX);
- if (ret < 0)
- goto out;
-
- if (wbc->nr_to_write <= 0) {
- mapping->writeback_index = start / PAGE_SIZE;
- goto out;
- }
-
- start = 0;
- end = mapping->writeback_index * PAGE_SIZE;
- mapping->writeback_index = 0;
- ret = netfs_writepages_region(mapping, wbc, group, &start, end);
- if (ret == 0)
- mapping->writeback_index = start / PAGE_SIZE;
- } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- start = 0;
- ret = netfs_writepages_region(mapping, wbc, group,
- &start, LLONG_MAX);
- if (wbc->nr_to_write > 0 && ret == 0)
- mapping->writeback_index = start / PAGE_SIZE;
- } else {
- start = wbc->range_start;
- ret = netfs_writepages_region(mapping, wbc, group,
- &start, wbc->range_end);
- }
-
-out:
- _leave(" = %d", ret);
- return ret;
-}
-EXPORT_SYMBOL(netfs_writepages);
-
-/*
- * Deal with the disposition of a laundered folio.
- */
-static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
-{
- if (wreq->error) {
- pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
- mapping_set_error(wreq->mapping, wreq->error);
- }
-}
-
-/**
- * netfs_launder_folio - Clean up a dirty folio that's being invalidated
- * @folio: The folio to clean
- *
- * This is called to write back a folio that's being invalidated when an inode
- * is getting torn down. Ideally, writepages would be used instead.
- */
-int netfs_launder_folio(struct folio *folio)
-{
- struct netfs_io_request *wreq;
- struct address_space *mapping = folio->mapping;
- struct netfs_folio *finfo = netfs_folio_info(folio);
- struct netfs_group *group = netfs_folio_group(folio);
- struct bio_vec bvec;
- unsigned long long i_size = i_size_read(mapping->host);
- unsigned long long start = folio_pos(folio);
- size_t offset = 0, len;
- int ret = 0;
-
- if (finfo) {
- offset = finfo->dirty_offset;
- start += offset;
- len = finfo->dirty_len;
- } else {
- len = folio_size(folio);
- }
- len = min_t(unsigned long long, len, i_size - start);
-
- wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
- if (IS_ERR(wreq)) {
- ret = PTR_ERR(wreq);
- goto out;
- }
-
- if (!folio_clear_dirty_for_io(folio))
- goto out_put;
-
- trace_netfs_folio(folio, netfs_folio_trace_launder);
-
- _debug("launder %llx-%llx", start, start + len - 1);
-
- /* Speculatively write to the cache. We have to fix this up later if
- * the store fails.
- */
- wreq->cleanup = netfs_cleanup_launder_folio;
-
- bvec_set_folio(&bvec, folio, len, offset);
- iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
- __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
- ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);
-
-out_put:
- folio_detach_private(folio);
- netfs_put_group(group);
- kfree(finfo);
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
-out:
- folio_wait_fscache(folio);
- _leave(" = %d", ret);
- return ret;
-}
-EXPORT_SYMBOL(netfs_launder_folio);
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index bee047e20f5d..608ba6416919 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -34,6 +34,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
unsigned long long start = iocb->ki_pos;
unsigned long long end = start + iov_iter_count(iter);
ssize_t ret, n;
+ size_t len = iov_iter_count(iter);
bool async = !is_sync_kiocb(iocb);
_enter("");
@@ -46,13 +47,17 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
_debug("uw %llx-%llx", start, end);
- wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
- start, end - start,
- iocb->ki_flags & IOCB_DIRECT ?
- NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
+ wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start,
+ iocb->ki_flags & IOCB_DIRECT ?
+ NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
if (IS_ERR(wreq))
return PTR_ERR(wreq);
+ wreq->io_streams[0].avail = true;
+ trace_netfs_write(wreq, (iocb->ki_flags & IOCB_DIRECT ?
+ netfs_write_trace_dio_write :
+ netfs_write_trace_unbuffered_write));
+
{
/* If this is an async op and we're not using a bounce buffer,
* we have to save the source buffer as the iterator is only
@@ -63,7 +68,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
* request.
*/
if (async || user_backed_iter(iter)) {
- n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0);
+ n = netfs_extract_user_iter(iter, len, &wreq->iter, 0);
if (n < 0) {
ret = n;
goto out;
@@ -71,7 +76,6 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec;
wreq->direct_bv_count = n;
wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
- wreq->len = iov_iter_count(&wreq->iter);
} else {
wreq->iter = *iter;
}
@@ -79,6 +83,8 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
wreq->io_iter = wreq->iter;
}
+ __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);
+
/* Copy the data into the bounce buffer and encrypt it. */
// TODO
@@ -87,10 +93,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
if (async)
wreq->iocb = iocb;
wreq->cleanup = netfs_cleanup_dio_write;
- ret = netfs_begin_write(wreq, is_sync_kiocb(iocb),
- iocb->ki_flags & IOCB_DIRECT ?
- netfs_write_trace_dio_write :
- netfs_write_trace_unbuffered_write);
+ ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), iov_iter_count(&wreq->io_iter));
if (ret < 0) {
_debug("begin = %zd", ret);
goto out;
@@ -100,9 +103,8 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
TASK_UNINTERRUPTIBLE);
-
+ smp_rmb(); /* Read error/transferred after RIP flag */
ret = wreq->error;
- _debug("waited = %zd", ret);
if (ret == 0) {
ret = wreq->transferred;
iocb->ki_pos += ret;
@@ -132,18 +134,20 @@ out:
ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
struct netfs_inode *ictx = netfs_inode(inode);
- unsigned long long end;
ssize_t ret;
+ loff_t pos = iocb->ki_pos;
+ unsigned long long end = pos + iov_iter_count(from) - 1;
- _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+ _enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));
if (!iov_iter_count(from))
return 0;
trace_netfs_write_iter(iocb, from);
- netfs_stat(&netfs_n_rh_dio_write);
+ netfs_stat(&netfs_n_wh_dio_write);
ret = netfs_start_io_direct(inode);
if (ret < 0)
@@ -157,7 +161,25 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = file_update_time(file);
if (ret < 0)
goto out;
- ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* We could block if there are any pages in the range. */
+ ret = -EAGAIN;
+ if (filemap_range_has_page(mapping, pos, end))
+ if (filemap_invalidate_inode(inode, true, pos, end))
+ goto out;
+ } else {
+ ret = filemap_write_and_wait_range(mapping, pos, end);
+ if (ret < 0)
+ goto out;
+ }
+
+ /*
+ * After a write we want buffered reads to be sure to go to disk to get
+ * the new data. We invalidate clean cached page from the region we're
+ * about to write. We do this *before* the write so that we can return
+ * without clobbering -EIOCBQUEUED from ->direct_IO().
+ */
+ ret = filemap_invalidate_inode(inode, true, pos, end);
if (ret < 0)
goto out;
end = iocb->ki_pos + iov_iter_count(from);
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index 43a651ed8264..38637e5c9b57 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -166,6 +166,7 @@ struct fscache_write_request {
loff_t start;
size_t len;
bool set_bits;
+ bool using_pgpriv2;
netfs_io_terminated_t term_func;
void *term_func_priv;
};
@@ -182,7 +183,7 @@ void __fscache_clear_page_bits(struct address_space *mapping,
rcu_read_lock();
xas_for_each(&xas, page, last) {
- end_page_fscache(page);
+ folio_end_private_2(page_folio(page));
}
rcu_read_unlock();
}
@@ -197,8 +198,9 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error,
{
struct fscache_write_request *wreq = priv;
- fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len,
- wreq->set_bits);
+ if (wreq->using_pgpriv2)
+ fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len,
+ wreq->set_bits);
if (wreq->term_func)
wreq->term_func(wreq->term_func_priv, transferred_or_error,
@@ -212,7 +214,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
loff_t start, size_t len, loff_t i_size,
netfs_io_terminated_t term_func,
void *term_func_priv,
- bool cond)
+ bool using_pgpriv2, bool cond)
{
struct fscache_write_request *wreq;
struct netfs_cache_resources *cres;
@@ -230,6 +232,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
wreq->mapping = mapping;
wreq->start = start;
wreq->len = len;
+ wreq->using_pgpriv2 = using_pgpriv2;
wreq->set_bits = cond;
wreq->term_func = term_func;
wreq->term_func_priv = term_func_priv;
@@ -257,7 +260,8 @@ abandon_end:
abandon_free:
kfree(wreq);
abandon:
- fscache_clear_page_bits(mapping, start, len, cond);
+ if (using_pgpriv2)
+ fscache_clear_page_bits(mapping, start, len, cond);
if (term_func)
term_func(term_func_priv, ret, false);
}
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index ec7045d24400..95e281a8af78 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -37,6 +37,8 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
extern unsigned int netfs_debug;
extern struct list_head netfs_io_requests;
extern spinlock_t netfs_proc_lock;
+extern mempool_t netfs_request_pool;
+extern mempool_t netfs_subrequest_pool;
#ifdef CONFIG_PROC_FS
static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq)
@@ -91,22 +93,12 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
}
/*
- * output.c
- */
-int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
- enum netfs_write_trace what);
-struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
-int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end);
-int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb);
-
-/*
* stats.c
*/
#ifdef CONFIG_NETFS_STATS
extern atomic_t netfs_n_rh_dio_read;
-extern atomic_t netfs_n_rh_dio_write;
extern atomic_t netfs_n_rh_readahead;
-extern atomic_t netfs_n_rh_readpage;
+extern atomic_t netfs_n_rh_read_folio;
extern atomic_t netfs_n_rh_rreq;
extern atomic_t netfs_n_rh_sreq;
extern atomic_t netfs_n_rh_download;
@@ -123,6 +115,10 @@ extern atomic_t netfs_n_rh_write_begin;
extern atomic_t netfs_n_rh_write_done;
extern atomic_t netfs_n_rh_write_failed;
extern atomic_t netfs_n_rh_write_zskip;
+extern atomic_t netfs_n_wh_buffered_write;
+extern atomic_t netfs_n_wh_writethrough;
+extern atomic_t netfs_n_wh_dio_write;
+extern atomic_t netfs_n_wh_writepages;
extern atomic_t netfs_n_wh_wstream_conflict;
extern atomic_t netfs_n_wh_upload;
extern atomic_t netfs_n_wh_upload_done;
@@ -149,6 +145,33 @@ static inline void netfs_stat_d(atomic_t *stat)
#endif
/*
+ * write_collect.c
+ */
+int netfs_folio_written_back(struct folio *folio);
+void netfs_write_collection_worker(struct work_struct *work);
+void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async);
+
+/*
+ * write_issue.c
+ */
+struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
+ struct file *file,
+ loff_t start,
+ enum netfs_io_origin origin);
+void netfs_reissue_write(struct netfs_io_stream *stream,
+ struct netfs_io_subrequest *subreq);
+int netfs_advance_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream,
+ loff_t start, size_t len, bool to_eof);
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
+int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *folio, size_t copied, bool to_page_end,
+ struct folio **writethrough_cache);
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *writethrough_cache);
+int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len);
+
+/*
* Miscellaneous functions.
*/
static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
@@ -168,7 +191,7 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
*/
static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
{
- if (netfs_group)
+ if (netfs_group && netfs_group != NETFS_FOLIO_COPY_TO_CACHE)
refcount_inc(&netfs_group->ref);
return netfs_group;
}
@@ -178,7 +201,9 @@ static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_grou
*/
static inline void netfs_put_group(struct netfs_group *netfs_group)
{
- if (netfs_group && refcount_dec_and_test(&netfs_group->ref))
+ if (netfs_group &&
+ netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
+ refcount_dec_and_test(&netfs_group->ref))
netfs_group->free(netfs_group);
}
@@ -187,7 +212,9 @@ static inline void netfs_put_group(struct netfs_group *netfs_group)
*/
static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
{
- if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref))
+ if (netfs_group &&
+ netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
+ refcount_sub_and_test(nr, &netfs_group->ref))
netfs_group->free(netfs_group);
}
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 4261ad6c55b6..c93851b98368 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -99,145 +99,6 @@ static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async)
}
/*
- * Deal with the completion of writing the data to the cache. We have to clear
- * the PG_fscache bits on the folios involved and release the caller's ref.
- *
- * May be called in softirq mode and we inherit a ref from the caller.
- */
-static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
- bool was_async)
-{
- struct netfs_io_subrequest *subreq;
- struct folio *folio;
- pgoff_t unlocked = 0;
- bool have_unlocked = false;
-
- rcu_read_lock();
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
-
- xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
- if (xas_retry(&xas, folio))
- continue;
-
- /* We might have multiple writes from the same huge
- * folio, but we mustn't unlock a folio more than once.
- */
- if (have_unlocked && folio->index <= unlocked)
- continue;
- unlocked = folio_next_index(folio) - 1;
- trace_netfs_folio(folio, netfs_folio_trace_end_copy);
- folio_end_fscache(folio);
- have_unlocked = true;
- }
- }
-
- rcu_read_unlock();
- netfs_rreq_completed(rreq, was_async);
-}
-
-static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_io_subrequest *subreq = priv;
- struct netfs_io_request *rreq = subreq->rreq;
-
- if (IS_ERR_VALUE(transferred_or_error)) {
- netfs_stat(&netfs_n_rh_write_failed);
- trace_netfs_failure(rreq, subreq, transferred_or_error,
- netfs_fail_copy_to_cache);
- } else {
- netfs_stat(&netfs_n_rh_write_done);
- }
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_write_term);
-
- /* If we decrement nr_copy_ops to 0, the ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_copy_ops))
- netfs_rreq_unmark_after_write(rreq, was_async);
-
- netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
-}
-
-/*
- * Perform any outstanding writes to the cache. We inherit a ref from the
- * caller.
- */
-static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
-{
- struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct netfs_io_subrequest *subreq, *next, *p;
- struct iov_iter iter;
- int ret;
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_copy);
-
- /* We don't want terminating writes trying to wake us up whilst we're
- * still going through the list.
- */
- atomic_inc(&rreq->nr_copy_ops);
-
- list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) {
- if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
- list_del_init(&subreq->rreq_link);
- netfs_put_subrequest(subreq, false,
- netfs_sreq_trace_put_no_copy);
- }
- }
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- /* Amalgamate adjacent writes */
- while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
- next = list_next_entry(subreq, rreq_link);
- if (next->start != subreq->start + subreq->len)
- break;
- subreq->len += next->len;
- list_del_init(&next->rreq_link);
- netfs_put_subrequest(next, false,
- netfs_sreq_trace_put_merged);
- }
-
- ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
- subreq->len, rreq->i_size, true);
- if (ret < 0) {
- trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
- trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
- continue;
- }
-
- iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages,
- subreq->start, subreq->len);
-
- atomic_inc(&rreq->nr_copy_ops);
- netfs_stat(&netfs_n_rh_write);
- netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache);
- trace_netfs_sreq(subreq, netfs_sreq_trace_write);
- cres->ops->write(cres, subreq->start, &iter,
- netfs_rreq_copy_terminated, subreq);
- }
-
- /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_copy_ops))
- netfs_rreq_unmark_after_write(rreq, false);
-}
-
-static void netfs_rreq_write_to_cache_work(struct work_struct *work)
-{
- struct netfs_io_request *rreq =
- container_of(work, struct netfs_io_request, work);
-
- netfs_rreq_do_write_to_cache(rreq);
-}
-
-static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq)
-{
- rreq->work.func = netfs_rreq_write_to_cache_work;
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
-}
-
-/*
* Handle a short read.
*/
static void netfs_rreq_short_read(struct netfs_io_request *rreq,
@@ -352,8 +213,13 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
unsigned int i;
size_t transferred = 0;
- for (i = 0; i < rreq->direct_bv_count; i++)
+ for (i = 0; i < rreq->direct_bv_count; i++) {
flush_dcache_page(rreq->direct_bv[i].bv_page);
+ // TODO: cifs marks pages in the destination buffer
+ // dirty under some circumstances after a read. Do we
+ // need to do that too?
+ set_page_dirty(rreq->direct_bv[i].bv_page);
+ }
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
if (subreq->error || subreq->transferred == 0)
@@ -409,9 +275,6 @@ again:
clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
- if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags))
- return netfs_rreq_write_to_cache(rreq);
-
netfs_rreq_completed(rreq, was_async);
}
@@ -618,7 +481,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
set:
if (subreq->len > rreq->len)
- pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n",
+ pr_warn("R=%08x[%u] SREQ>RREQ %zx > %llx\n",
rreq->debug_id, subreq->debug_index,
subreq->len, rreq->len);
@@ -643,8 +506,7 @@ out:
* Slice off a piece of a read request and submit an I/O request for it.
*/
static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
- struct iov_iter *io_iter,
- unsigned int *_debug_index)
+ struct iov_iter *io_iter)
{
struct netfs_io_subrequest *subreq;
enum netfs_io_source source;
@@ -653,11 +515,10 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
if (!subreq)
return false;
- subreq->debug_index = (*_debug_index)++;
subreq->start = rreq->start + rreq->submitted;
subreq->len = io_iter->count;
- _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
+ _debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
/* Call out to the cache to find out what it can do with the remaining
@@ -707,7 +568,6 @@ subreq_failed:
int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
{
struct iov_iter io_iter;
- unsigned int debug_index = 0;
int ret;
_enter("R=%x %llx-%llx",
@@ -733,12 +593,12 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
atomic_set(&rreq->nr_outstanding, 1);
io_iter = rreq->io_iter;
do {
- _debug("submit %llx + %zx >= %llx",
+ _debug("submit %llx + %llx >= %llx",
rreq->start, rreq->submitted, rreq->i_size);
if (rreq->origin == NETFS_DIO_READ &&
rreq->start + rreq->submitted >= rreq->i_size)
break;
- if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index))
+ if (!netfs_rreq_submit_slice(rreq, &io_iter))
break;
if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 5e77618a7940..5f0f438e5d21 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -7,6 +7,7 @@
#include <linux/module.h>
#include <linux/export.h>
+#include <linux/mempool.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "internal.h"
@@ -23,6 +24,11 @@ unsigned netfs_debug;
module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
+static struct kmem_cache *netfs_request_slab;
+static struct kmem_cache *netfs_subrequest_slab;
+mempool_t netfs_request_pool;
+mempool_t netfs_subrequest_pool;
+
#ifdef CONFIG_PROC_FS
LIST_HEAD(netfs_io_requests);
DEFINE_SPINLOCK(netfs_proc_lock);
@@ -31,9 +37,9 @@ static const char *netfs_origins[nr__netfs_io_origin] = {
[NETFS_READAHEAD] = "RA",
[NETFS_READPAGE] = "RP",
[NETFS_READ_FOR_WRITE] = "RW",
+ [NETFS_COPY_TO_CACHE] = "CC",
[NETFS_WRITEBACK] = "WB",
[NETFS_WRITETHROUGH] = "WT",
- [NETFS_LAUNDER_WRITE] = "LW",
[NETFS_UNBUFFERED_WRITE] = "UW",
[NETFS_DIO_READ] = "DR",
[NETFS_DIO_WRITE] = "DW",
@@ -56,7 +62,7 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v)
rreq = list_entry(v, struct netfs_io_request, proc_link);
seq_printf(m,
- "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx",
+ "%08x %s %3d %2lx %4d %3d @%04llx %llx/%llx",
rreq->debug_id,
netfs_origins[rreq->origin],
refcount_read(&rreq->ref),
@@ -98,25 +104,54 @@ static int __init netfs_init(void)
{
int ret = -ENOMEM;
+ netfs_request_slab = kmem_cache_create("netfs_request",
+ sizeof(struct netfs_io_request), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
+ NULL);
+ if (!netfs_request_slab)
+ goto error_req;
+
+ if (mempool_init_slab_pool(&netfs_request_pool, 100, netfs_request_slab) < 0)
+ goto error_reqpool;
+
+ netfs_subrequest_slab = kmem_cache_create("netfs_subrequest",
+ sizeof(struct netfs_io_subrequest), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
+ NULL);
+ if (!netfs_subrequest_slab)
+ goto error_subreq;
+
+ if (mempool_init_slab_pool(&netfs_subrequest_pool, 100, netfs_subrequest_slab) < 0)
+ goto error_subreqpool;
+
if (!proc_mkdir("fs/netfs", NULL))
- goto error;
+ goto error_proc;
if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,
&netfs_requests_seq_ops))
- goto error_proc;
+ goto error_procfile;
#ifdef CONFIG_FSCACHE_STATS
if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
netfs_stats_show))
- goto error_proc;
+ goto error_procfile;
#endif
ret = fscache_init();
if (ret < 0)
- goto error_proc;
+ goto error_fscache;
return 0;
-error_proc:
+error_fscache:
+error_procfile:
remove_proc_entry("fs/netfs", NULL);
-error:
+error_proc:
+ mempool_exit(&netfs_subrequest_pool);
+error_subreqpool:
+ kmem_cache_destroy(netfs_subrequest_slab);
+error_subreq:
+ mempool_exit(&netfs_request_pool);
+error_reqpool:
+ kmem_cache_destroy(netfs_request_slab);
+error_req:
return ret;
}
fs_initcall(netfs_init);
@@ -125,5 +160,9 @@ static void __exit netfs_exit(void)
{
fscache_exit();
remove_proc_entry("fs/netfs", NULL);
+ mempool_exit(&netfs_subrequest_pool);
+ kmem_cache_destroy(netfs_subrequest_slab);
+ mempool_exit(&netfs_request_pool);
+ kmem_cache_destroy(netfs_request_slab);
}
module_exit(netfs_exit);
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 90051ced8e2a..bc1fc54fb724 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -177,13 +177,11 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback);
*/
void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
- struct netfs_folio *finfo = NULL;
+ struct netfs_folio *finfo;
size_t flen = folio_size(folio);
_enter("{%lx},%zx,%zx", folio->index, offset, length);
- folio_wait_fscache(folio);
-
if (!folio_test_private(folio))
return;
@@ -248,12 +246,6 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp)
if (folio_test_private(folio))
return false;
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
-
fscache_note_page_release(netfs_i_cookie(ctx));
return true;
}
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index 610ceb5bd86c..c90d482b1650 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -6,6 +6,8 @@
*/
#include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/delay.h>
#include "internal.h"
/*
@@ -20,17 +22,22 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
struct inode *inode = file ? file_inode(file) : mapping->host;
struct netfs_inode *ctx = netfs_inode(inode);
struct netfs_io_request *rreq;
+ mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool;
+ struct kmem_cache *cache = mempool->pool_data;
bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE ||
origin == NETFS_DIO_READ ||
origin == NETFS_DIO_WRITE);
bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx);
int ret;
- rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
- GFP_KERNEL);
- if (!rreq)
- return ERR_PTR(-ENOMEM);
+ for (;;) {
+ rreq = mempool_alloc(mempool, GFP_KERNEL);
+ if (rreq)
+ break;
+ msleep(10);
+ }
+ memset(rreq, 0, kmem_cache_size(cache));
rreq->start = start;
rreq->len = len;
rreq->upper_len = len;
@@ -40,19 +47,27 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
rreq->inode = inode;
rreq->i_size = i_size_read(inode);
rreq->debug_id = atomic_inc_return(&debug_ids);
+ rreq->wsize = INT_MAX;
+ spin_lock_init(&rreq->lock);
+ INIT_LIST_HEAD(&rreq->io_streams[0].subrequests);
+ INIT_LIST_HEAD(&rreq->io_streams[1].subrequests);
INIT_LIST_HEAD(&rreq->subrequests);
INIT_WORK(&rreq->work, NULL);
refcount_set(&rreq->ref, 1);
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
- if (cached)
+ if (cached) {
__set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+ if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags))
+ /* Filesystem uses deprecated PG_private_2 marking. */
+ __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
+ }
if (file && file->f_flags & O_NONBLOCK)
__set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
if (rreq->netfs_ops->init_request) {
ret = rreq->netfs_ops->init_request(rreq, file);
if (ret < 0) {
- kfree(rreq);
+ mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool);
return ERR_PTR(ret);
}
}
@@ -74,6 +89,8 @@ void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace
void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
{
struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream;
+ int s;
while (!list_empty(&rreq->subrequests)) {
subreq = list_first_entry(&rreq->subrequests,
@@ -82,6 +99,25 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
netfs_put_subrequest(subreq, was_async,
netfs_sreq_trace_put_clear);
}
+
+ for (s = 0; s < ARRAY_SIZE(rreq->io_streams); s++) {
+ stream = &rreq->io_streams[s];
+ while (!list_empty(&stream->subrequests)) {
+ subreq = list_first_entry(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ list_del(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, was_async,
+ netfs_sreq_trace_put_clear);
+ }
+ }
+}
+
+static void netfs_free_request_rcu(struct rcu_head *rcu)
+{
+ struct netfs_io_request *rreq = container_of(rcu, struct netfs_io_request, rcu);
+
+ mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool);
+ netfs_stat_d(&netfs_n_rh_rreq);
}
static void netfs_free_request(struct work_struct *work)
@@ -106,8 +142,7 @@ static void netfs_free_request(struct work_struct *work)
}
kvfree(rreq->direct_bv);
}
- kfree_rcu(rreq, rcu);
- netfs_stat_d(&netfs_n_rh_rreq);
+ call_rcu(&rreq->rcu, netfs_free_request_rcu);
}
void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
@@ -139,19 +174,25 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
-
- subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?:
- sizeof(struct netfs_io_subrequest),
- GFP_KERNEL);
- if (subreq) {
- INIT_WORK(&subreq->work, NULL);
- INIT_LIST_HEAD(&subreq->rreq_link);
- refcount_set(&subreq->ref, 2);
- subreq->rreq = rreq;
- netfs_get_request(rreq, netfs_rreq_trace_get_subreq);
- netfs_stat(&netfs_n_rh_sreq);
+ mempool_t *mempool = rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool;
+ struct kmem_cache *cache = mempool->pool_data;
+
+ for (;;) {
+ subreq = mempool_alloc(rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool,
+ GFP_KERNEL);
+ if (subreq)
+ break;
+ msleep(10);
}
+ memset(subreq, 0, kmem_cache_size(cache));
+ INIT_WORK(&subreq->work, NULL);
+ INIT_LIST_HEAD(&subreq->rreq_link);
+ refcount_set(&subreq->ref, 2);
+ subreq->rreq = rreq;
+ subreq->debug_index = atomic_inc_return(&rreq->subreq_counter);
+ netfs_get_request(rreq, netfs_rreq_trace_get_subreq);
+ netfs_stat(&netfs_n_rh_sreq);
return subreq;
}
@@ -173,7 +214,7 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
trace_netfs_sreq(subreq, netfs_sreq_trace_free);
if (rreq->netfs_ops->free_subrequest)
rreq->netfs_ops->free_subrequest(subreq);
- kfree(subreq);
+ mempool_free(subreq, rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool);
netfs_stat_d(&netfs_n_rh_sreq);
netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
}
diff --git a/fs/netfs/output.c b/fs/netfs/output.c
deleted file mode 100644
index 625eb68f3e5a..000000000000
--- a/fs/netfs/output.c
+++ /dev/null
@@ -1,478 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* Network filesystem high-level write support.
- *
- * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/writeback.h>
-#include <linux/pagevec.h>
-#include "internal.h"
-
-/**
- * netfs_create_write_request - Create a write operation.
- * @wreq: The write request this is storing from.
- * @dest: The destination type
- * @start: Start of the region this write will modify
- * @len: Length of the modification
- * @worker: The worker function to handle the write(s)
- *
- * Allocate a write operation, set it up and add it to the list on a write
- * request.
- */
-struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq,
- enum netfs_io_source dest,
- loff_t start, size_t len,
- work_func_t worker)
-{
- struct netfs_io_subrequest *subreq;
-
- subreq = netfs_alloc_subrequest(wreq);
- if (subreq) {
- INIT_WORK(&subreq->work, worker);
- subreq->source = dest;
- subreq->start = start;
- subreq->len = len;
- subreq->debug_index = wreq->subreq_counter++;
-
- switch (subreq->source) {
- case NETFS_UPLOAD_TO_SERVER:
- netfs_stat(&netfs_n_wh_upload);
- break;
- case NETFS_WRITE_TO_CACHE:
- netfs_stat(&netfs_n_wh_write);
- break;
- default:
- BUG();
- }
-
- subreq->io_iter = wreq->io_iter;
- iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start);
- iov_iter_truncate(&subreq->io_iter, subreq->len);
-
- trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
- refcount_read(&subreq->ref),
- netfs_sreq_trace_new);
- atomic_inc(&wreq->nr_outstanding);
- list_add_tail(&subreq->rreq_link, &wreq->subrequests);
- trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
- }
-
- return subreq;
-}
-EXPORT_SYMBOL(netfs_create_write_request);
-
-/*
- * Process a completed write request once all the component operations have
- * been completed.
- */
-static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async)
-{
- struct netfs_io_subrequest *subreq;
- struct netfs_inode *ctx = netfs_inode(wreq->inode);
- size_t transferred = 0;
-
- _enter("R=%x[]", wreq->debug_id);
-
- trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
-
- list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
- if (subreq->error || subreq->transferred == 0)
- break;
- transferred += subreq->transferred;
- if (subreq->transferred < subreq->len)
- break;
- }
- wreq->transferred = transferred;
-
- list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
- if (!subreq->error)
- continue;
- switch (subreq->source) {
- case NETFS_UPLOAD_TO_SERVER:
- /* Depending on the type of failure, this may prevent
- * writeback completion unless we're in disconnected
- * mode.
- */
- if (!wreq->error)
- wreq->error = subreq->error;
- break;
-
- case NETFS_WRITE_TO_CACHE:
- /* Failure doesn't prevent writeback completion unless
- * we're in disconnected mode.
- */
- if (subreq->error != -ENOBUFS)
- ctx->ops->invalidate_cache(wreq);
- break;
-
- default:
- WARN_ON_ONCE(1);
- if (!wreq->error)
- wreq->error = -EIO;
- return;
- }
- }
-
- wreq->cleanup(wreq);
-
- if (wreq->origin == NETFS_DIO_WRITE &&
- wreq->mapping->nrpages) {
- pgoff_t first = wreq->start >> PAGE_SHIFT;
- pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
- invalidate_inode_pages2_range(wreq->mapping, first, last);
- }
-
- if (wreq->origin == NETFS_DIO_WRITE)
- inode_dio_end(wreq->inode);
-
- _debug("finished");
- trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
- clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
- wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
-
- if (wreq->iocb) {
- wreq->iocb->ki_pos += transferred;
- if (wreq->iocb->ki_complete)
- wreq->iocb->ki_complete(
- wreq->iocb, wreq->error ? wreq->error : transferred);
- }
-
- netfs_clear_subrequests(wreq, was_async);
- netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete);
-}
-
-/*
- * Deal with the completion of writing the data to the cache.
- */
-void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_io_subrequest *subreq = _op;
- struct netfs_io_request *wreq = subreq->rreq;
- unsigned int u;
-
- _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
-
- switch (subreq->source) {
- case NETFS_UPLOAD_TO_SERVER:
- netfs_stat(&netfs_n_wh_upload_done);
- break;
- case NETFS_WRITE_TO_CACHE:
- netfs_stat(&netfs_n_wh_write_done);
- break;
- case NETFS_INVALID_WRITE:
- break;
- default:
- BUG();
- }
-
- if (IS_ERR_VALUE(transferred_or_error)) {
- subreq->error = transferred_or_error;
- trace_netfs_failure(wreq, subreq, transferred_or_error,
- netfs_fail_write);
- goto failed;
- }
-
- if (WARN(transferred_or_error > subreq->len - subreq->transferred,
- "Subreq excess write: R%x[%x] %zd > %zu - %zu",
- wreq->debug_id, subreq->debug_index,
- transferred_or_error, subreq->len, subreq->transferred))
- transferred_or_error = subreq->len - subreq->transferred;
-
- subreq->error = 0;
- subreq->transferred += transferred_or_error;
-
- if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
- pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n",
- wreq->debug_id, subreq->debug_index,
- iov_iter_count(&subreq->io_iter), subreq->len,
- subreq->transferred, subreq->io_iter.iter_type);
-
- if (subreq->transferred < subreq->len)
- goto incomplete;
-
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
-out:
- trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
-
- /* If we decrement nr_outstanding to 0, the ref belongs to us. */
- u = atomic_dec_return(&wreq->nr_outstanding);
- if (u == 0)
- netfs_write_terminated(wreq, was_async);
- else if (u == 1)
- wake_up_var(&wreq->nr_outstanding);
-
- netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
- return;
-
-incomplete:
- if (transferred_or_error == 0) {
- if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
- subreq->error = -ENODATA;
- goto failed;
- }
- } else {
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
- }
-
- __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
- goto out;
-
-failed:
- switch (subreq->source) {
- case NETFS_WRITE_TO_CACHE:
- netfs_stat(&netfs_n_wh_write_failed);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
- break;
- case NETFS_UPLOAD_TO_SERVER:
- netfs_stat(&netfs_n_wh_upload_failed);
- set_bit(NETFS_RREQ_FAILED, &wreq->flags);
- wreq->error = subreq->error;
- break;
- default:
- break;
- }
- goto out;
-}
-EXPORT_SYMBOL(netfs_write_subrequest_terminated);
-
-static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq)
-{
- struct netfs_io_request *wreq = subreq->rreq;
- struct netfs_cache_resources *cres = &wreq->cache_resources;
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
-
- cres->ops->write(cres, subreq->start, &subreq->io_iter,
- netfs_write_subrequest_terminated, subreq);
-}
-
-static void netfs_write_to_cache_op_worker(struct work_struct *work)
-{
- struct netfs_io_subrequest *subreq =
- container_of(work, struct netfs_io_subrequest, work);
-
- netfs_write_to_cache_op(subreq);
-}
-
-/**
- * netfs_queue_write_request - Queue a write request for attention
- * @subreq: The write request to be queued
- *
- * Queue the specified write request for processing by a worker thread. We
- * pass the caller's ref on the request to the worker thread.
- */
-void netfs_queue_write_request(struct netfs_io_subrequest *subreq)
-{
- if (!queue_work(system_unbound_wq, &subreq->work))
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip);
-}
-EXPORT_SYMBOL(netfs_queue_write_request);
-
-/*
- * Set up a op for writing to the cache.
- */
-static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq)
-{
- struct netfs_cache_resources *cres = &wreq->cache_resources;
- struct netfs_io_subrequest *subreq;
- struct netfs_inode *ctx = netfs_inode(wreq->inode);
- struct fscache_cookie *cookie = netfs_i_cookie(ctx);
- loff_t start = wreq->start;
- size_t len = wreq->len;
- int ret;
-
- if (!fscache_cookie_enabled(cookie)) {
- clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags);
- return;
- }
-
- _debug("write to cache");
- ret = fscache_begin_write_operation(cres, cookie);
- if (ret < 0)
- return;
-
- ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len,
- i_size_read(wreq->inode), true);
- if (ret < 0)
- return;
-
- subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len,
- netfs_write_to_cache_op_worker);
- if (!subreq)
- return;
-
- netfs_write_to_cache_op(subreq);
-}
-
-/*
- * Begin the process of writing out a chunk of data.
- *
- * We are given a write request that holds a series of dirty regions and
- * (partially) covers a sequence of folios, all of which are present. The
- * pages must have been marked as writeback as appropriate.
- *
- * We need to perform the following steps:
- *
- * (1) If encrypting, create an output buffer and encrypt each block of the
- * data into it, otherwise the output buffer will point to the original
- * folios.
- *
- * (2) If the data is to be cached, set up a write op for the entire output
- * buffer to the cache, if the cache wants to accept it.
- *
- * (3) If the data is to be uploaded (ie. not merely cached):
- *
- * (a) If the data is to be compressed, create a compression buffer and
- * compress the data into it.
- *
- * (b) For each destination we want to upload to, set up write ops to write
- * to that destination. We may need multiple writes if the data is not
- * contiguous or the span exceeds wsize for a server.
- */
-int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
- enum netfs_write_trace what)
-{
- struct netfs_inode *ctx = netfs_inode(wreq->inode);
-
- _enter("R=%x %llx-%llx f=%lx",
- wreq->debug_id, wreq->start, wreq->start + wreq->len - 1,
- wreq->flags);
-
- trace_netfs_write(wreq, what);
- if (wreq->len == 0 || wreq->iter.count == 0) {
- pr_err("Zero-sized write [R=%x]\n", wreq->debug_id);
- return -EIO;
- }
-
- if (wreq->origin == NETFS_DIO_WRITE)
- inode_dio_begin(wreq->inode);
-
- wreq->io_iter = wreq->iter;
-
- /* ->outstanding > 0 carries a ref */
- netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
- atomic_set(&wreq->nr_outstanding, 1);
-
- /* Start the encryption/compression going. We can do that in the
- * background whilst we generate a list of write ops that we want to
- * perform.
- */
- // TODO: Encrypt or compress the region as appropriate
-
- /* We need to write all of the region to the cache */
- if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
- netfs_set_up_write_to_cache(wreq);
-
- /* However, we don't necessarily write all of the region to the server.
- * Caching of reads is being managed this way also.
- */
- if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
- ctx->ops->create_write_requests(wreq, wreq->start, wreq->len);
-
- if (atomic_dec_and_test(&wreq->nr_outstanding))
- netfs_write_terminated(wreq, false);
-
- if (!may_wait)
- return -EIOCBQUEUED;
-
- wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
- return wreq->error;
-}
-
-/*
- * Begin a write operation for writing through the pagecache.
- */
-struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
-{
- struct netfs_io_request *wreq;
- struct file *file = iocb->ki_filp;
-
- wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len,
- NETFS_WRITETHROUGH);
- if (IS_ERR(wreq))
- return wreq;
-
- trace_netfs_write(wreq, netfs_write_trace_writethrough);
-
- __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
- iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0);
- wreq->io_iter = wreq->iter;
-
- /* ->outstanding > 0 carries a ref */
- netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
- atomic_set(&wreq->nr_outstanding, 1);
- return wreq;
-}
-
-static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final)
-{
- struct netfs_inode *ictx = netfs_inode(wreq->inode);
- unsigned long long start;
- size_t len;
-
- if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
- return;
-
- start = wreq->start + wreq->submitted;
- len = wreq->iter.count - wreq->submitted;
- if (!final) {
- len /= wreq->wsize; /* Round to number of maximum packets */
- len *= wreq->wsize;
- }
-
- ictx->ops->create_write_requests(wreq, start, len);
- wreq->submitted += len;
-}
-
-/*
- * Advance the state of the write operation used when writing through the
- * pagecache. Data has been copied into the pagecache that we need to append
- * to the request. If we've added more than wsize then we need to create a new
- * subrequest.
- */
-int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end)
-{
- _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u",
- wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end);
-
- wreq->iter.count += copied;
- wreq->io_iter.count += copied;
- if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize)
- netfs_submit_writethrough(wreq, false);
-
- return wreq->error;
-}
-
-/*
- * End a write operation used when writing through the pagecache.
- */
-int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb)
-{
- int ret = -EIOCBQUEUED;
-
- _enter("ic=%zu sb=%zu ws=%u",
- wreq->iter.count, wreq->submitted, wreq->wsize);
-
- if (wreq->submitted < wreq->io_iter.count)
- netfs_submit_writethrough(wreq, true);
-
- if (atomic_dec_and_test(&wreq->nr_outstanding))
- netfs_write_terminated(wreq, false);
-
- if (is_sync_kiocb(iocb)) {
- wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
- ret = wreq->error;
- }
-
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
- return ret;
-}
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index deeba9f9dcf5..0892768eea32 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -10,9 +10,8 @@
#include "internal.h"
atomic_t netfs_n_rh_dio_read;
-atomic_t netfs_n_rh_dio_write;
atomic_t netfs_n_rh_readahead;
-atomic_t netfs_n_rh_readpage;
+atomic_t netfs_n_rh_read_folio;
atomic_t netfs_n_rh_rreq;
atomic_t netfs_n_rh_sreq;
atomic_t netfs_n_rh_download;
@@ -29,6 +28,10 @@ atomic_t netfs_n_rh_write_begin;
atomic_t netfs_n_rh_write_done;
atomic_t netfs_n_rh_write_failed;
atomic_t netfs_n_rh_write_zskip;
+atomic_t netfs_n_wh_buffered_write;
+atomic_t netfs_n_wh_writethrough;
+atomic_t netfs_n_wh_dio_write;
+atomic_t netfs_n_wh_writepages;
atomic_t netfs_n_wh_wstream_conflict;
atomic_t netfs_n_wh_upload;
atomic_t netfs_n_wh_upload_done;
@@ -39,13 +42,17 @@ atomic_t netfs_n_wh_write_failed;
int netfs_stats_show(struct seq_file *m, void *v)
{
- seq_printf(m, "Netfs : DR=%u DW=%u RA=%u RP=%u WB=%u WBZ=%u\n",
+ seq_printf(m, "Netfs : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n",
atomic_read(&netfs_n_rh_dio_read),
- atomic_read(&netfs_n_rh_dio_write),
atomic_read(&netfs_n_rh_readahead),
- atomic_read(&netfs_n_rh_readpage),
+ atomic_read(&netfs_n_rh_read_folio),
atomic_read(&netfs_n_rh_write_begin),
atomic_read(&netfs_n_rh_write_zskip));
+ seq_printf(m, "Netfs : BW=%u WT=%u DW=%u WP=%u\n",
+ atomic_read(&netfs_n_wh_buffered_write),
+ atomic_read(&netfs_n_wh_writethrough),
+ atomic_read(&netfs_n_wh_dio_write),
+ atomic_read(&netfs_n_wh_writepages));
seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n",
atomic_read(&netfs_n_rh_zero),
atomic_read(&netfs_n_rh_short_read),
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
new file mode 100644
index 000000000000..60112e4b2c5e
--- /dev/null
+++ b/fs/netfs/write_collect.c
@@ -0,0 +1,808 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem write subrequest result collection, assessment
+ * and retrying.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+/* Notes made in the collector */
+#define HIT_PENDING 0x01 /* A front op was still pending */
+#define SOME_EMPTY 0x02 /* One of more streams are empty */
+#define ALL_EMPTY 0x04 /* All streams are empty */
+#define MAYBE_DISCONTIG 0x08 /* A front op may be discontiguous (rounded to PAGE_SIZE) */
+#define NEED_REASSESS 0x10 /* Need to loop round and reassess */
+#define REASSESS_DISCONTIG 0x20 /* Reassess discontiguity if contiguity advances */
+#define MADE_PROGRESS 0x40 /* Made progress cleaning up a stream or the folio set */
+#define BUFFERED 0x80 /* The pagecache needs cleaning up */
+#define NEED_RETRY 0x100 /* A front op requests retrying */
+#define SAW_FAILURE 0x200 /* One stream or hit a permanent failure */
+
+/*
+ * Successful completion of write of a folio to the server and/or cache. Note
+ * that we are not allowed to lock the folio here on pain of deadlocking with
+ * truncate.
+ */
+int netfs_folio_written_back(struct folio *folio)
+{
+ enum netfs_folio_trace why = netfs_folio_trace_clear;
+ struct netfs_folio *finfo;
+ struct netfs_group *group = NULL;
+ int gcount = 0;
+
+ if ((finfo = netfs_folio_info(folio))) {
+ /* Streaming writes cannot be redirtied whilst under writeback,
+ * so discard the streaming record.
+ */
+ folio_detach_private(folio);
+ group = finfo->netfs_group;
+ gcount++;
+ kfree(finfo);
+ why = netfs_folio_trace_clear_s;
+ goto end_wb;
+ }
+
+ if ((group = netfs_folio_group(folio))) {
+ if (group == NETFS_FOLIO_COPY_TO_CACHE) {
+ why = netfs_folio_trace_clear_cc;
+ folio_detach_private(folio);
+ goto end_wb;
+ }
+
+ /* Need to detach the group pointer if the page didn't get
+ * redirtied. If it has been redirtied, then it must be within
+ * the same group.
+ */
+ why = netfs_folio_trace_redirtied;
+ if (!folio_test_dirty(folio)) {
+ folio_detach_private(folio);
+ gcount++;
+ why = netfs_folio_trace_clear_g;
+ }
+ }
+
+end_wb:
+ trace_netfs_folio(folio, why);
+ folio_end_writeback(folio);
+ return gcount;
+}
+
+/*
+ * Get hold of a folio we have under writeback. We don't want to get the
+ * refcount on it.
+ */
+static struct folio *netfs_writeback_lookup_folio(struct netfs_io_request *wreq, loff_t pos)
+{
+ XA_STATE(xas, &wreq->mapping->i_pages, pos / PAGE_SIZE);
+ struct folio *folio;
+
+ rcu_read_lock();
+
+ for (;;) {
+ xas_reset(&xas);
+ folio = xas_load(&xas);
+ if (xas_retry(&xas, folio))
+ continue;
+
+ if (!folio || xa_is_value(folio))
+ kdebug("R=%08x: folio %lx (%llx) not present",
+ wreq->debug_id, xas.xa_index, pos / PAGE_SIZE);
+ BUG_ON(!folio || xa_is_value(folio));
+
+ if (folio == xas_reload(&xas))
+ break;
+ }
+
+ rcu_read_unlock();
+
+ if (WARN_ONCE(!folio_test_writeback(folio),
+ "R=%08x: folio %lx is not under writeback\n",
+ wreq->debug_id, folio->index)) {
+ trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);
+ }
+ return folio;
+}
+
+/*
+ * Unlock any folios we've finished with.
+ */
+static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
+ unsigned long long collected_to,
+ unsigned int *notes)
+{
+ for (;;) {
+ struct folio *folio;
+ struct netfs_folio *finfo;
+ unsigned long long fpos, fend;
+ size_t fsize, flen;
+
+ folio = netfs_writeback_lookup_folio(wreq, wreq->cleaned_to);
+
+ fpos = folio_pos(folio);
+ fsize = folio_size(folio);
+ finfo = netfs_folio_info(folio);
+ flen = finfo ? finfo->dirty_offset + finfo->dirty_len : fsize;
+
+ fend = min_t(unsigned long long, fpos + flen, wreq->i_size);
+
+ trace_netfs_collect_folio(wreq, folio, fend, collected_to);
+
+ if (fpos + fsize > wreq->contiguity) {
+ trace_netfs_collect_contig(wreq, fpos + fsize,
+ netfs_contig_trace_unlock);
+ wreq->contiguity = fpos + fsize;
+ }
+
+ /* Unlock any folio we've transferred all of. */
+ if (collected_to < fend)
+ break;
+
+ wreq->nr_group_rel += netfs_folio_written_back(folio);
+ wreq->cleaned_to = fpos + fsize;
+ *notes |= MADE_PROGRESS;
+
+ if (fpos + fsize >= collected_to)
+ break;
+ }
+}
+
+/*
+ * Perform retries on the streams that need it.
+ */
+static void netfs_retry_write_stream(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream)
+{
+ struct list_head *next;
+
+ _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr);
+
+ if (list_empty(&stream->subrequests))
+ return;
+
+ if (stream->source == NETFS_UPLOAD_TO_SERVER &&
+ wreq->netfs_ops->retry_request)
+ wreq->netfs_ops->retry_request(wreq, stream);
+
+ if (unlikely(stream->failed))
+ return;
+
+ /* If there's no renegotiation to do, just resend each failed subreq. */
+ if (!stream->prepare_write) {
+ struct netfs_io_subrequest *subreq;
+
+ list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+ if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
+ break;
+ if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
+ __set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ netfs_reissue_write(stream, subreq);
+ }
+ }
+ return;
+ }
+
+ next = stream->subrequests.next;
+
+ do {
+ struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp;
+ unsigned long long start, len;
+ size_t part;
+ bool boundary = false;
+
+ /* Go through the stream and find the next span of contiguous
+ * data that we then rejig (cifs, for example, needs the wsize
+ * renegotiating) and reissue.
+ */
+ from = list_entry(next, struct netfs_io_subrequest, rreq_link);
+ to = from;
+ start = from->start + from->transferred;
+ len = from->len - from->transferred;
+
+ if (test_bit(NETFS_SREQ_FAILED, &from->flags) ||
+ !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags))
+ return;
+
+ list_for_each_continue(next, &stream->subrequests) {
+ subreq = list_entry(next, struct netfs_io_subrequest, rreq_link);
+ if (subreq->start + subreq->transferred != start + len ||
+ test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) ||
+ !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))
+ break;
+ to = subreq;
+ len += to->len;
+ }
+
+ /* Work through the sublist. */
+ subreq = from;
+ list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
+ if (!len)
+ break;
+ /* Renegotiate max_len (wsize) */
+ trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+ __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ __set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+ stream->prepare_write(subreq);
+
+ part = min(len, subreq->max_len);
+ subreq->len = part;
+ subreq->start = start;
+ subreq->transferred = 0;
+ len -= part;
+ start += part;
+ if (len && subreq == to &&
+ __test_and_clear_bit(NETFS_SREQ_BOUNDARY, &to->flags))
+ boundary = true;
+
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ netfs_reissue_write(stream, subreq);
+ if (subreq == to)
+ break;
+ }
+
+ /* If we managed to use fewer subreqs, we can discard the
+ * excess; if we used the same number, then we're done.
+ */
+ if (!len) {
+ if (subreq == to)
+ continue;
+ list_for_each_entry_safe_from(subreq, tmp,
+ &stream->subrequests, rreq_link) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
+ list_del(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
+ if (subreq == to)
+ break;
+ }
+ continue;
+ }
+
+ /* We ran out of subrequests, so we need to allocate some more
+ * and insert them after.
+ */
+ do {
+ subreq = netfs_alloc_subrequest(wreq);
+ subreq->source = to->source;
+ subreq->start = start;
+ subreq->max_len = len;
+ subreq->max_nr_segs = INT_MAX;
+ subreq->debug_index = atomic_inc_return(&wreq->subreq_counter);
+ subreq->stream_nr = to->stream_nr;
+ __set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+
+ trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+ refcount_read(&subreq->ref),
+ netfs_sreq_trace_new);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+
+ list_add(&subreq->rreq_link, &to->rreq_link);
+ to = list_next_entry(to, rreq_link);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+
+ switch (stream->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload);
+ subreq->max_len = min(len, wreq->wsize);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ stream->prepare_write(subreq);
+
+ part = min(len, subreq->max_len);
+ subreq->len = subreq->transferred + part;
+ len -= part;
+ start += part;
+ if (!len && boundary) {
+ __set_bit(NETFS_SREQ_BOUNDARY, &to->flags);
+ boundary = false;
+ }
+
+ netfs_reissue_write(stream, subreq);
+ if (!len)
+ break;
+
+ } while (len);
+
+ } while (!list_is_head(next, &stream->subrequests));
+}
+
+/*
+ * Perform retries on the streams that need it. If we're doing content
+ * encryption and the server copy changed due to a third-party write, we may
+ * need to do an RMW cycle and also rewrite the data to the cache.
+ */
+static void netfs_retry_writes(struct netfs_io_request *wreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream;
+ int s;
+
+ /* Wait for all outstanding I/O to quiesce before performing retries as
+ * we may need to renegotiate the I/O sizes.
+ */
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (!stream->active)
+ continue;
+
+ list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+ wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+ }
+ }
+
+ // TODO: Enc: Fetch changed partial pages
+ // TODO: Enc: Reencrypt content if needed.
+ // TODO: Enc: Wind back transferred point.
+ // TODO: Enc: Mark cache pages for retry.
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->need_retry) {
+ stream->need_retry = false;
+ netfs_retry_write_stream(wreq, stream);
+ }
+ }
+}
+
+/*
+ * Collect and assess the results of various write subrequests. We may need to
+ * retry some of the results - or even do an RMW cycle for content crypto.
+ *
+ * Note that we have a number of parallel, overlapping lists of subrequests,
+ * one to the server and one to the local cache for example, which may not be
+ * the same size or starting position and may not even correspond in boundary
+ * alignment.
+ */
+static void netfs_collect_write_results(struct netfs_io_request *wreq)
+{
+ struct netfs_io_subrequest *front, *remove;
+ struct netfs_io_stream *stream;
+ unsigned long long collected_to;
+ unsigned int notes;
+ int s;
+
+ _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
+ trace_netfs_collect(wreq);
+ trace_netfs_rreq(wreq, netfs_rreq_trace_collect);
+
+reassess_streams:
+ smp_rmb();
+ collected_to = ULLONG_MAX;
+ if (wreq->origin == NETFS_WRITEBACK)
+ notes = ALL_EMPTY | BUFFERED | MAYBE_DISCONTIG;
+ else if (wreq->origin == NETFS_WRITETHROUGH)
+ notes = ALL_EMPTY | BUFFERED;
+ else
+ notes = ALL_EMPTY;
+
+ /* Remove completed subrequests from the front of the streams and
+ * advance the completion point on each stream. We stop when we hit
+ * something that's in progress. The issuer thread may be adding stuff
+ * to the tail whilst we're doing this.
+ *
+ * We must not, however, merge in discontiguities that span whole
+ * folios that aren't under writeback. This is made more complicated
+ * by the folios in the gap being of unpredictable sizes - if they even
+ * exist - but we don't want to look them up.
+ */
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ loff_t rstart, rend;
+
+ stream = &wreq->io_streams[s];
+ /* Read active flag before list pointers */
+ if (!smp_load_acquire(&stream->active))
+ continue;
+
+ front = stream->front;
+ while (front) {
+ trace_netfs_collect_sreq(wreq, front);
+ //_debug("sreq [%x] %llx %zx/%zx",
+ // front->debug_index, front->start, front->transferred, front->len);
+
+ /* Stall if there may be a discontinuity. */
+ rstart = round_down(front->start, PAGE_SIZE);
+ if (rstart > wreq->contiguity) {
+ if (wreq->contiguity > stream->collected_to) {
+ trace_netfs_collect_gap(wreq, stream,
+ wreq->contiguity, 'D');
+ stream->collected_to = wreq->contiguity;
+ }
+ notes |= REASSESS_DISCONTIG;
+ break;
+ }
+ rend = round_up(front->start + front->len, PAGE_SIZE);
+ if (rend > wreq->contiguity) {
+ trace_netfs_collect_contig(wreq, rend,
+ netfs_contig_trace_collect);
+ wreq->contiguity = rend;
+ if (notes & REASSESS_DISCONTIG)
+ notes |= NEED_REASSESS;
+ }
+ notes &= ~MAYBE_DISCONTIG;
+
+ /* Stall if the front is still undergoing I/O. */
+ if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) {
+ notes |= HIT_PENDING;
+ break;
+ }
+ smp_rmb(); /* Read counters after I-P flag. */
+
+ if (stream->failed) {
+ stream->collected_to = front->start + front->len;
+ notes |= MADE_PROGRESS | SAW_FAILURE;
+ goto cancel;
+ }
+ if (front->start + front->transferred > stream->collected_to) {
+ stream->collected_to = front->start + front->transferred;
+ stream->transferred = stream->collected_to - wreq->start;
+ notes |= MADE_PROGRESS;
+ }
+ if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
+ stream->failed = true;
+ stream->error = front->error;
+ if (stream->source == NETFS_UPLOAD_TO_SERVER)
+ mapping_set_error(wreq->mapping, front->error);
+ notes |= NEED_REASSESS | SAW_FAILURE;
+ break;
+ }
+ if (front->transferred < front->len) {
+ stream->need_retry = true;
+ notes |= NEED_RETRY | MADE_PROGRESS;
+ break;
+ }
+
+ cancel:
+ /* Remove if completely consumed. */
+ spin_lock(&wreq->lock);
+
+ remove = front;
+ list_del_init(&front->rreq_link);
+ front = list_first_entry_or_null(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ stream->front = front;
+ if (!front) {
+ unsigned long long jump_to = atomic64_read(&wreq->issued_to);
+
+ if (stream->collected_to < jump_to) {
+ trace_netfs_collect_gap(wreq, stream, jump_to, 'A');
+ stream->collected_to = jump_to;
+ }
+ }
+
+ spin_unlock(&wreq->lock);
+ netfs_put_subrequest(remove, false,
+ notes & SAW_FAILURE ?
+ netfs_sreq_trace_put_cancel :
+ netfs_sreq_trace_put_done);
+ }
+
+ if (front)
+ notes &= ~ALL_EMPTY;
+ else
+ notes |= SOME_EMPTY;
+
+ if (stream->collected_to < collected_to)
+ collected_to = stream->collected_to;
+ }
+
+ if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to)
+ wreq->collected_to = collected_to;
+
+ /* If we have an empty stream, we need to jump it forward over any gap
+ * otherwise the collection point will never advance.
+ *
+ * Note that the issuer always adds to the stream with the lowest
+ * so-far submitted start, so if we see two consecutive subreqs in one
+ * stream with nothing between then in another stream, then the second
+ * stream has a gap that can be jumped.
+ */
+ if (notes & SOME_EMPTY) {
+ unsigned long long jump_to = wreq->start + wreq->len;
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->active &&
+ stream->front &&
+ stream->front->start < jump_to)
+ jump_to = stream->front->start;
+ }
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->active &&
+ !stream->front &&
+ stream->collected_to < jump_to) {
+ trace_netfs_collect_gap(wreq, stream, jump_to, 'B');
+ stream->collected_to = jump_to;
+ }
+ }
+ }
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->active)
+ trace_netfs_collect_stream(wreq, stream);
+ }
+
+ trace_netfs_collect_state(wreq, wreq->collected_to, notes);
+
+ /* Unlock any folios that we have now finished with. */
+ if (notes & BUFFERED) {
+ unsigned long long clean_to = min(wreq->collected_to, wreq->contiguity);
+
+ if (wreq->cleaned_to < clean_to)
+ netfs_writeback_unlock_folios(wreq, clean_to, &notes);
+ } else {
+ wreq->cleaned_to = wreq->collected_to;
+ }
+
+ // TODO: Discard encryption buffers
+
+ /* If all streams are discontiguous with the last folio we cleared, we
+ * may need to skip a set of folios.
+ */
+ if ((notes & (MAYBE_DISCONTIG | ALL_EMPTY)) == MAYBE_DISCONTIG) {
+ unsigned long long jump_to = ULLONG_MAX;
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->active && stream->front &&
+ stream->front->start < jump_to)
+ jump_to = stream->front->start;
+ }
+
+ trace_netfs_collect_contig(wreq, jump_to, netfs_contig_trace_jump);
+ wreq->contiguity = jump_to;
+ wreq->cleaned_to = jump_to;
+ wreq->collected_to = jump_to;
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->collected_to < jump_to)
+ stream->collected_to = jump_to;
+ }
+ //cond_resched();
+ notes |= MADE_PROGRESS;
+ goto reassess_streams;
+ }
+
+ if (notes & NEED_RETRY)
+ goto need_retry;
+ if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_unpause);
+ clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags);
+ wake_up_bit(&wreq->flags, NETFS_RREQ_PAUSE);
+ }
+
+ if (notes & NEED_REASSESS) {
+ //cond_resched();
+ goto reassess_streams;
+ }
+ if (notes & MADE_PROGRESS) {
+ //cond_resched();
+ goto reassess_streams;
+ }
+
+out:
+ netfs_put_group_many(wreq->group, wreq->nr_group_rel);
+ wreq->nr_group_rel = 0;
+ _leave(" = %x", notes);
+ return;
+
+need_retry:
+ /* Okay... We're going to have to retry one or both streams. Note
+ * that any partially completed op will have had any wholly transferred
+ * folios removed from it.
+ */
+ _debug("retry");
+ netfs_retry_writes(wreq);
+ goto out;
+}
+
+/*
+ * Perform the collection of subrequests, folios and encryption buffers.
+ */
+void netfs_write_collection_worker(struct work_struct *work)
+{
+ struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);
+ struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ size_t transferred;
+ int s;
+
+ _enter("R=%x", wreq->debug_id);
+
+ netfs_see_request(wreq, netfs_rreq_trace_see_work);
+ if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) {
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+ return;
+ }
+
+ netfs_collect_write_results(wreq);
+
+ /* We're done when the app thread has finished posting subreqs and all
+ * the queues in all the streams are empty.
+ */
+ if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) {
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+ return;
+ }
+ smp_rmb(); /* Read ALL_QUEUED before lists. */
+
+ transferred = LONG_MAX;
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ struct netfs_io_stream *stream = &wreq->io_streams[s];
+ if (!stream->active)
+ continue;
+ if (!list_empty(&stream->subrequests)) {
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+ return;
+ }
+ if (stream->transferred < transferred)
+ transferred = stream->transferred;
+ }
+
+ /* Okay, declare that all I/O is complete. */
+ wreq->transferred = transferred;
+ trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+ if (wreq->io_streams[1].active &&
+ wreq->io_streams[1].failed) {
+ /* Cache write failure doesn't prevent writeback completion
+ * unless we're in disconnected mode.
+ */
+ ictx->ops->invalidate_cache(wreq);
+ }
+
+ if (wreq->cleanup)
+ wreq->cleanup(wreq);
+
+ if (wreq->origin == NETFS_DIO_WRITE &&
+ wreq->mapping->nrpages) {
+ /* mmap may have got underfoot and we may now have folios
+ * locally covering the region we just wrote. Attempt to
+ * discard the folios, but leave in place any modified locally.
+ * ->write_iter() is prevented from interfering by the DIO
+ * counter.
+ */
+ pgoff_t first = wreq->start >> PAGE_SHIFT;
+ pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+ invalidate_inode_pages2_range(wreq->mapping, first, last);
+ }
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_end(wreq->inode);
+
+ _debug("finished");
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
+ clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+ wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+ if (wreq->iocb) {
+ wreq->iocb->ki_pos += wreq->transferred;
+ if (wreq->iocb->ki_complete)
+ wreq->iocb->ki_complete(
+ wreq->iocb, wreq->error ? wreq->error : wreq->transferred);
+ wreq->iocb = VFS_PTR_POISON;
+ }
+
+ netfs_clear_subrequests(wreq, false);
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work_complete);
+}
+
+/*
+ * Wake the collection work item.
+ */
+void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
+{
+ if (!work_pending(&wreq->work)) {
+ netfs_get_request(wreq, netfs_rreq_trace_get_work);
+ if (!queue_work(system_unbound_wq, &wreq->work))
+ netfs_put_request(wreq, was_async, netfs_rreq_trace_put_work_nq);
+ }
+}
+
+/**
+ * netfs_write_subrequest_terminated - Note the termination of a write operation.
+ * @_op: The I/O request that has terminated.
+ * @transferred_or_error: The amount of data transferred or an error code.
+ * @was_async: The termination was asynchronous
+ *
+ * This tells the library that a contributory write I/O operation has
+ * terminated, one way or another, and that it should collect the results.
+ *
+ * The caller indicates in @transferred_or_error the outcome of the operation,
+ * supplying a positive value to indicate the number of bytes transferred or a
+ * negative error code. The library will look after reissuing I/O operations
+ * as appropriate and writing downloaded data to the cache.
+ *
+ * If @was_async is true, the caller might be running in softirq or interrupt
+ * context and we can't sleep.
+ *
+ * When this is called, ownership of the subrequest is transferred back to the
+ * library, along with a ref.
+ *
+ * Note that %_op is a void* so that the function can be passed to
+ * kiocb::term_func without the need for a casting wrapper.
+ */
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_io_subrequest *subreq = _op;
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];
+
+ _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
+
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_done);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_done);
+ break;
+ case NETFS_INVALID_WRITE:
+ break;
+ default:
+ BUG();
+ }
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ subreq->error = transferred_or_error;
+ if (subreq->error == -EAGAIN)
+ set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ else
+ set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ trace_netfs_failure(wreq, subreq, transferred_or_error, netfs_fail_write);
+
+ switch (subreq->source) {
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_failed);
+ break;
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_failed);
+ break;
+ default:
+ break;
+ }
+ trace_netfs_rreq(wreq, netfs_rreq_trace_set_pause);
+ set_bit(NETFS_RREQ_PAUSE, &wreq->flags);
+ } else {
+ if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+ "Subreq excess write: R=%x[%x] %zd > %zu - %zu",
+ wreq->debug_id, subreq->debug_index,
+ transferred_or_error, subreq->len, subreq->transferred))
+ transferred_or_error = subreq->len - subreq->transferred;
+
+ subreq->error = 0;
+ subreq->transferred += transferred_or_error;
+
+ if (subreq->transferred < subreq->len)
+ set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ }
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ wake_up_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS);
+
+ /* If we are at the head of the queue, wake up the collector,
+ * transferring a ref to it if we were the ones to do so.
+ */
+ if (list_is_first(&subreq->rreq_link, &stream->subrequests))
+ netfs_wake_write_collector(wreq, was_async);
+
+ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+}
+EXPORT_SYMBOL(netfs_write_subrequest_terminated);
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
new file mode 100644
index 000000000000..e190043bc0da
--- /dev/null
+++ b/fs/netfs/write_issue.c
@@ -0,0 +1,684 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level (buffered) writeback.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ *
+ * To support network filesystems with local caching, we manage a situation
+ * that can be envisioned like the following:
+ *
+ * +---+---+-----+-----+---+----------+
+ * Folios: | | | | | | |
+ * +---+---+-----+-----+---+----------+
+ *
+ * +------+------+ +----+----+
+ * Upload: | | |.....| | |
+ * (Stream 0) +------+------+ +----+----+
+ *
+ * +------+------+------+------+------+
+ * Cache: | | | | | |
+ * (Stream 1) +------+------+------+------+------+
+ *
+ * Where we have a sequence of folios of varying sizes that we need to overlay
+ * with multiple parallel streams of I/O requests, where the I/O requests in a
+ * stream may also be of various sizes (in cifs, for example, the sizes are
+ * negotiated with the server; in something like ceph, they may represent the
+ * sizes of storage objects).
+ *
+ * The sequence in each stream may contain gaps and noncontiguous subrequests
+ * may be glued together into single vectored write RPCs.
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include "internal.h"
+
+/*
+ * Kill all dirty folios in the event of an unrecoverable error, starting with
+ * a locked folio we've already obtained from writeback_iter().
+ */
+static void netfs_kill_dirty_pages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct folio *folio)
+{
+ int error = 0;
+
+ do {
+ enum netfs_folio_trace why = netfs_folio_trace_kill;
+ struct netfs_group *group = NULL;
+ struct netfs_folio *finfo = NULL;
+ void *priv;
+
+ priv = folio_detach_private(folio);
+ if (priv) {
+ finfo = __netfs_folio_info(priv);
+ if (finfo) {
+ /* Kill folio from streaming write. */
+ group = finfo->netfs_group;
+ why = netfs_folio_trace_kill_s;
+ } else {
+ group = priv;
+ if (group == NETFS_FOLIO_COPY_TO_CACHE) {
+ /* Kill copy-to-cache folio */
+ why = netfs_folio_trace_kill_cc;
+ group = NULL;
+ } else {
+ /* Kill folio with group */
+ why = netfs_folio_trace_kill_g;
+ }
+ }
+ }
+
+ trace_netfs_folio(folio, why);
+
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+ folio_end_writeback(folio);
+
+ netfs_put_group(group);
+ kfree(finfo);
+
+ } while ((folio = writeback_iter(mapping, wbc, folio, &error)));
+}
+
+/*
+ * Create a write request and set it up appropriately for the origin type.
+ */
+struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
+ struct file *file,
+ loff_t start,
+ enum netfs_io_origin origin)
+{
+ struct netfs_io_request *wreq;
+ struct netfs_inode *ictx;
+
+ wreq = netfs_alloc_request(mapping, file, start, 0, origin);
+ if (IS_ERR(wreq))
+ return wreq;
+
+ _enter("R=%x", wreq->debug_id);
+
+ ictx = netfs_inode(wreq->inode);
+ if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
+ fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));
+
+ wreq->contiguity = wreq->start;
+ wreq->cleaned_to = wreq->start;
+ INIT_WORK(&wreq->work, netfs_write_collection_worker);
+
+ wreq->io_streams[0].stream_nr = 0;
+ wreq->io_streams[0].source = NETFS_UPLOAD_TO_SERVER;
+ wreq->io_streams[0].prepare_write = ictx->ops->prepare_write;
+ wreq->io_streams[0].issue_write = ictx->ops->issue_write;
+ wreq->io_streams[0].collected_to = start;
+ wreq->io_streams[0].transferred = LONG_MAX;
+
+ wreq->io_streams[1].stream_nr = 1;
+ wreq->io_streams[1].source = NETFS_WRITE_TO_CACHE;
+ wreq->io_streams[1].collected_to = start;
+ wreq->io_streams[1].transferred = LONG_MAX;
+ if (fscache_resources_valid(&wreq->cache_resources)) {
+ wreq->io_streams[1].avail = true;
+ wreq->io_streams[1].prepare_write = wreq->cache_resources.ops->prepare_write_subreq;
+ wreq->io_streams[1].issue_write = wreq->cache_resources.ops->issue_write;
+ }
+
+ return wreq;
+}
+
+/**
+ * netfs_prepare_write_failed - Note write preparation failed
+ * @subreq: The subrequest to mark
+ *
+ * Mark a subrequest to note that preparation for write failed.
+ */
+void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq)
+{
+ __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prep_failed);
+}
+EXPORT_SYMBOL(netfs_prepare_write_failed);
+
+/*
+ * Prepare a write subrequest. We need to allocate a new subrequest
+ * if we don't have one.
+ */
+static void netfs_prepare_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream,
+ loff_t start)
+{
+ struct netfs_io_subrequest *subreq;
+
+ subreq = netfs_alloc_subrequest(wreq);
+ subreq->source = stream->source;
+ subreq->start = start;
+ subreq->max_len = ULONG_MAX;
+ subreq->max_nr_segs = INT_MAX;
+ subreq->stream_nr = stream->stream_nr;
+
+ _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
+
+ trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+ refcount_read(&subreq->ref),
+ netfs_sreq_trace_new);
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
+ switch (stream->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload);
+ subreq->max_len = wreq->wsize;
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ if (stream->prepare_write)
+ stream->prepare_write(subreq);
+
+ __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+
+ /* We add to the end of the list whilst the collector may be walking
+ * the list. The collector only goes nextwards and uses the lock to
+ * remove entries off of the front.
+ */
+ spin_lock(&wreq->lock);
+ list_add_tail(&subreq->rreq_link, &stream->subrequests);
+ if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
+ stream->front = subreq;
+ if (!stream->active) {
+ stream->collected_to = stream->front->start;
+ /* Write list pointers before active flag */
+ smp_store_release(&stream->active, true);
+ }
+ }
+
+ spin_unlock(&wreq->lock);
+
+ stream->construct = subreq;
+}
+
+/*
+ * Set the I/O iterator for the filesystem/cache to use and dispatch the I/O
+ * operation. The operation may be asynchronous and should call
+ * netfs_write_subrequest_terminated() when complete.
+ */
+static void netfs_do_issue_write(struct netfs_io_stream *stream,
+ struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+
+ _enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);
+
+ if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
+ return netfs_write_subrequest_terminated(subreq, subreq->error, false);
+
+ // TODO: Use encrypted buffer
+ if (test_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags)) {
+ subreq->io_iter = wreq->io_iter;
+ iov_iter_advance(&subreq->io_iter,
+ subreq->start + subreq->transferred - wreq->start);
+ iov_iter_truncate(&subreq->io_iter,
+ subreq->len - subreq->transferred);
+ } else {
+ iov_iter_xarray(&subreq->io_iter, ITER_SOURCE, &wreq->mapping->i_pages,
+ subreq->start + subreq->transferred,
+ subreq->len - subreq->transferred);
+ }
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ stream->issue_write(subreq);
+}
+
+void netfs_reissue_write(struct netfs_io_stream *stream,
+ struct netfs_io_subrequest *subreq)
+{
+ __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ netfs_do_issue_write(stream, subreq);
+}
+
+static void netfs_issue_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream)
+{
+ struct netfs_io_subrequest *subreq = stream->construct;
+
+ if (!subreq)
+ return;
+ stream->construct = NULL;
+
+ if (subreq->start + subreq->len > wreq->start + wreq->submitted)
+ wreq->len = wreq->submitted = subreq->start + subreq->len - wreq->start;
+ netfs_do_issue_write(stream, subreq);
+}
+
+/*
+ * Add data to the write subrequest, dispatching each as we fill it up or if it
+ * is discontiguous with the previous. We only fill one part at a time so that
+ * we can avoid overrunning the credits obtained (cifs) and try to parallelise
+ * content-crypto preparation with network writes.
+ */
+int netfs_advance_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream,
+ loff_t start, size_t len, bool to_eof)
+{
+ struct netfs_io_subrequest *subreq = stream->construct;
+ size_t part;
+
+ if (!stream->avail) {
+ _leave("no write");
+ return len;
+ }
+
+ _enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0);
+
+ if (subreq && start != subreq->start + subreq->len) {
+ netfs_issue_write(wreq, stream);
+ subreq = NULL;
+ }
+
+ if (!stream->construct)
+ netfs_prepare_write(wreq, stream, start);
+ subreq = stream->construct;
+
+ part = min(subreq->max_len - subreq->len, len);
+ _debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len);
+ subreq->len += part;
+ subreq->nr_segs++;
+
+ if (subreq->len >= subreq->max_len ||
+ subreq->nr_segs >= subreq->max_nr_segs ||
+ to_eof) {
+ netfs_issue_write(wreq, stream);
+ subreq = NULL;
+ }
+
+ return part;
+}
+
+/*
+ * Write some of a pending folio data back to the server.
+ */
+static int netfs_write_folio(struct netfs_io_request *wreq,
+ struct writeback_control *wbc,
+ struct folio *folio)
+{
+ struct netfs_io_stream *upload = &wreq->io_streams[0];
+ struct netfs_io_stream *cache = &wreq->io_streams[1];
+ struct netfs_io_stream *stream;
+ struct netfs_group *fgroup; /* TODO: Use this with ceph */
+ struct netfs_folio *finfo;
+ size_t fsize = folio_size(folio), flen = fsize, foff = 0;
+ loff_t fpos = folio_pos(folio), i_size;
+ bool to_eof = false, streamw = false;
+ bool debug = false;
+
+ _enter("");
+
+ /* netfs_perform_write() may shift i_size around the page or from out
+ * of the page to beyond it, but cannot move i_size into or through the
+ * page since we have it locked.
+ */
+ i_size = i_size_read(wreq->inode);
+
+ if (fpos >= i_size) {
+ /* mmap beyond eof. */
+ _debug("beyond eof");
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+ wreq->nr_group_rel += netfs_folio_written_back(folio);
+ netfs_put_group_many(wreq->group, wreq->nr_group_rel);
+ wreq->nr_group_rel = 0;
+ return 0;
+ }
+
+ if (fpos + fsize > wreq->i_size)
+ wreq->i_size = i_size;
+
+ fgroup = netfs_folio_group(folio);
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ foff = finfo->dirty_offset;
+ flen = foff + finfo->dirty_len;
+ streamw = true;
+ }
+
+ if (wreq->origin == NETFS_WRITETHROUGH) {
+ to_eof = false;
+ if (flen > i_size - fpos)
+ flen = i_size - fpos;
+ } else if (flen > i_size - fpos) {
+ flen = i_size - fpos;
+ if (!streamw)
+ folio_zero_segment(folio, flen, fsize);
+ to_eof = true;
+ } else if (flen == i_size - fpos) {
+ to_eof = true;
+ }
+ flen -= foff;
+
+ _debug("folio %zx %zx %zx", foff, flen, fsize);
+
+ /* Deal with discontinuities in the stream of dirty pages. These can
+ * arise from a number of sources:
+ *
+ * (1) Intervening non-dirty pages from random-access writes, multiple
+ * flushers writing back different parts simultaneously and manual
+ * syncing.
+ *
+ * (2) Partially-written pages from write-streaming.
+ *
+ * (3) Pages that belong to a different write-back group (eg. Ceph
+ * snapshots).
+ *
+ * (4) Actually-clean pages that were marked for write to the cache
+ * when they were read. Note that these appear as a special
+ * write-back group.
+ */
+ if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
+ netfs_issue_write(wreq, upload);
+ } else if (fgroup != wreq->group) {
+ /* We can't write this page to the server yet. */
+ kdebug("wrong group");
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
+ netfs_issue_write(wreq, upload);
+ netfs_issue_write(wreq, cache);
+ return 0;
+ }
+
+ if (foff > 0)
+ netfs_issue_write(wreq, upload);
+ if (streamw)
+ netfs_issue_write(wreq, cache);
+
+ /* Flip the page to the writeback state and unlock. If we're called
+ * from write-through, then the page has already been put into the wb
+ * state.
+ */
+ if (wreq->origin == NETFS_WRITEBACK)
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+
+ if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
+ if (!fscache_resources_valid(&wreq->cache_resources)) {
+ trace_netfs_folio(folio, netfs_folio_trace_cancel_copy);
+ netfs_issue_write(wreq, upload);
+ netfs_folio_written_back(folio);
+ return 0;
+ }
+ trace_netfs_folio(folio, netfs_folio_trace_store_copy);
+ } else if (!upload->construct) {
+ trace_netfs_folio(folio, netfs_folio_trace_store);
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_store_plus);
+ }
+
+ /* Move the submission point forward to allow for write-streaming data
+ * not starting at the front of the page. We don't do write-streaming
+ * with the cache as the cache requires DIO alignment.
+ *
+ * Also skip uploading for data that's been read and just needs copying
+ * to the cache.
+ */
+ for (int s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ stream->submit_max_len = fsize;
+ stream->submit_off = foff;
+ stream->submit_len = flen;
+ if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) ||
+ (stream->source == NETFS_UPLOAD_TO_SERVER &&
+ fgroup == NETFS_FOLIO_COPY_TO_CACHE)) {
+ stream->submit_off = UINT_MAX;
+ stream->submit_len = 0;
+ stream->submit_max_len = 0;
+ }
+ }
+
+ /* Attach the folio to one or more subrequests. For a big folio, we
+ * could end up with thousands of subrequests if the wsize is small -
+ * but we might need to wait during the creation of subrequests for
+ * network resources (eg. SMB credits).
+ */
+ for (;;) {
+ ssize_t part;
+ size_t lowest_off = ULONG_MAX;
+ int choose_s = -1;
+
+ /* Always add to the lowest-submitted stream first. */
+ for (int s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->submit_len > 0 &&
+ stream->submit_off < lowest_off) {
+ lowest_off = stream->submit_off;
+ choose_s = s;
+ }
+ }
+
+ if (choose_s < 0)
+ break;
+ stream = &wreq->io_streams[choose_s];
+
+ part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
+ stream->submit_len, to_eof);
+ atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
+ stream->submit_off += part;
+ stream->submit_max_len -= part;
+ if (part > stream->submit_len)
+ stream->submit_len = 0;
+ else
+ stream->submit_len -= part;
+ if (part > 0)
+ debug = true;
+ }
+
+ atomic64_set(&wreq->issued_to, fpos + fsize);
+
+ if (!debug)
+ kdebug("R=%x: No submit", wreq->debug_id);
+
+ if (flen < fsize)
+ for (int s = 0; s < NR_IO_STREAMS; s++)
+ netfs_issue_write(wreq, &wreq->io_streams[s]);
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * Write some of the pending data back to the server
+ */
+int netfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct netfs_inode *ictx = netfs_inode(mapping->host);
+ struct netfs_io_request *wreq = NULL;
+ struct folio *folio;
+ int error = 0;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ mutex_lock(&ictx->wb_lock);
+ else if (!mutex_trylock(&ictx->wb_lock))
+ return 0;
+
+ /* Need the first folio to be able to set up the op. */
+ folio = writeback_iter(mapping, wbc, NULL, &error);
+ if (!folio)
+ goto out;
+
+ wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK);
+ if (IS_ERR(wreq)) {
+ error = PTR_ERR(wreq);
+ goto couldnt_start;
+ }
+
+ trace_netfs_write(wreq, netfs_write_trace_writeback);
+ netfs_stat(&netfs_n_wh_writepages);
+
+ do {
+ _debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted);
+
+ /* It appears we don't have to handle cyclic writeback wrapping. */
+ WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted);
+
+ if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE &&
+ unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) {
+ set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ wreq->netfs_ops->begin_writeback(wreq);
+ }
+
+ error = netfs_write_folio(wreq, wbc, folio);
+ if (error < 0)
+ break;
+ } while ((folio = writeback_iter(mapping, wbc, folio, &error)));
+
+ for (int s = 0; s < NR_IO_STREAMS; s++)
+ netfs_issue_write(wreq, &wreq->io_streams[s]);
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+
+ mutex_unlock(&ictx->wb_lock);
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ _leave(" = %d", error);
+ return error;
+
+couldnt_start:
+ netfs_kill_dirty_pages(mapping, wbc, folio);
+out:
+ mutex_unlock(&ictx->wb_lock);
+ _leave(" = %d", error);
+ return error;
+}
+EXPORT_SYMBOL(netfs_writepages);
+
+/*
+ * Begin a write operation for writing through the pagecache.
+ */
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
+{
+ struct netfs_io_request *wreq = NULL;
+ struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp));
+
+ mutex_lock(&ictx->wb_lock);
+
+ wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp,
+ iocb->ki_pos, NETFS_WRITETHROUGH);
+ if (IS_ERR(wreq)) {
+ mutex_unlock(&ictx->wb_lock);
+ return wreq;
+ }
+
+ wreq->io_streams[0].avail = true;
+ trace_netfs_write(wreq, netfs_write_trace_writethrough);
+ return wreq;
+}
+
+/*
+ * Advance the state of the write operation used when writing through the
+ * pagecache. Data has been copied into the pagecache that we need to append
+ * to the request. If we've added more than wsize then we need to create a new
+ * subrequest.
+ */
+int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *folio, size_t copied, bool to_page_end,
+ struct folio **writethrough_cache)
+{
+ _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
+ wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end);
+
+ if (!*writethrough_cache) {
+ if (folio_test_dirty(folio))
+ /* Sigh. mmap. */
+ folio_clear_dirty_for_io(folio);
+
+ /* We can make multiple writes to the folio... */
+ folio_start_writeback(folio);
+ if (wreq->len == 0)
+ trace_netfs_folio(folio, netfs_folio_trace_wthru);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
+ *writethrough_cache = folio;
+ }
+
+ wreq->len += copied;
+ if (!to_page_end)
+ return 0;
+
+ *writethrough_cache = NULL;
+ return netfs_write_folio(wreq, wbc, folio);
+}
+
+/*
+ * End a write operation used when writing through the pagecache.
+ */
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *writethrough_cache)
+{
+ struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ int ret;
+
+ _enter("R=%x", wreq->debug_id);
+
+ if (writethrough_cache)
+ netfs_write_folio(wreq, wbc, writethrough_cache);
+
+ netfs_issue_write(wreq, &wreq->io_streams[0]);
+ netfs_issue_write(wreq, &wreq->io_streams[1]);
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+
+ mutex_unlock(&ictx->wb_lock);
+
+ ret = wreq->error;
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ return ret;
+}
+
+/*
+ * Write data to the server without going through the pagecache and without
+ * writing it to the local cache.
+ */
+int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len)
+{
+ struct netfs_io_stream *upload = &wreq->io_streams[0];
+ ssize_t part;
+ loff_t start = wreq->start;
+ int error = 0;
+
+ _enter("%zx", len);
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_begin(wreq->inode);
+
+ while (len) {
+ // TODO: Prepare content encryption
+
+ _debug("unbuffered %zx", len);
+ part = netfs_advance_write(wreq, upload, start, len, false);
+ start += part;
+ len -= part;
+ if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
+ wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE);
+ }
+ if (test_bit(NETFS_RREQ_FAILED, &wreq->flags))
+ break;
+ }
+
+ netfs_issue_write(wreq, upload);
+
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+ if (list_empty(&upload->subrequests))
+ netfs_wake_write_collector(wreq, false);
+
+ _leave(" = %d", error);
+ return error;
+}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 407c6e15afe2..6bd127e6683d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -433,7 +433,7 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset,
return;
/* Cancel any unstarted writes on this page */
nfs_wb_folio_cancel(inode, folio);
- folio_wait_fscache(folio);
+ folio_wait_private_2(folio); /* [DEPRECATED] */
trace_nfs_invalidate_folio(inode, folio);
}
@@ -500,7 +500,7 @@ static int nfs_launder_folio(struct folio *folio)
dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n",
inode->i_ino, folio_pos(folio));
- folio_wait_fscache(folio);
+ folio_wait_private_2(folio); /* [DEPRECATED] */
ret = nfs_wb_folio(inode, folio);
trace_nfs_launder_folio_done(inode, folio, ret);
return ret;
@@ -593,8 +593,8 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
/* make sure the cache has finished storing the page */
- if (folio_test_fscache(folio) &&
- folio_wait_fscache_killable(folio) < 0) {
+ if (folio_test_private_2(folio) && /* [DEPRECATED] */
+ folio_wait_private_2_killable(folio) < 0) {
ret = VM_FAULT_RETRY;
goto out;
}
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index e3cb4923316b..fbed0027996f 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -81,6 +81,8 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
{
netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false);
+ /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
+ __set_bit(NETFS_ICTX_USE_PGPRIV2, &nfsi->netfs.flags);
}
extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
@@ -101,10 +103,10 @@ extern int nfs_netfs_read_folio(struct file *file, struct folio *folio);
static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
{
- if (folio_test_fscache(folio)) {
+ if (folio_test_private_2(folio)) { /* [DEPRECATED] */
if (current_is_kswapd() || !(gfp & __GFP_FS))
return false;
- folio_wait_fscache(folio);
+ folio_wait_private_2(folio);
}
fscache_note_page_release(netfs_i_cookie(netfs_inode(folio->mapping->host)));
return true;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c709c296ea9a..acef52ecb1bb 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2429,7 +2429,12 @@ static int nfs_net_init(struct net *net)
struct nfs_net *nn = net_generic(net, nfs_net_id);
nfs_clients_init(net);
- rpc_proc_register(net, &nn->rpcstats);
+
+ if (!rpc_proc_register(net, &nn->rpcstats)) {
+ nfs_clients_exit(net);
+ return -ENOMEM;
+ }
+
return nfs_fs_proc_net_init(net);
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5de85d725fb9..2329cbb0e446 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -2120,10 +2120,10 @@ int nfs_migrate_folio(struct address_space *mapping, struct folio *dst,
if (folio_test_private(src))
return -EBUSY;
- if (folio_test_fscache(src)) {
+ if (folio_test_private_2(src)) { /* [DEPRECATED] */
if (mode == MIGRATE_ASYNC)
return -EBUSY;
- folio_wait_fscache(src);
+ folio_wait_private_2(src);
}
return migrate_folio(mapping, dst, src, mode);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 87c9547989f6..e88aca0c6e8e 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -983,15 +983,7 @@ static struct workqueue_struct *callback_wq;
static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
{
trace_nfsd_cb_queue(cb->cb_clp, cb);
- return queue_delayed_work(callback_wq, &cb->cb_work, 0);
-}
-
-static void nfsd4_queue_cb_delayed(struct nfsd4_callback *cb,
- unsigned long msecs)
-{
- trace_nfsd_cb_queue(cb->cb_clp, cb);
- queue_delayed_work(callback_wq, &cb->cb_work,
- msecs_to_jiffies(msecs));
+ return queue_work(callback_wq, &cb->cb_work);
}
static void nfsd41_cb_inflight_begin(struct nfs4_client *clp)
@@ -1490,7 +1482,7 @@ static void
nfsd4_run_cb_work(struct work_struct *work)
{
struct nfsd4_callback *cb =
- container_of(work, struct nfsd4_callback, cb_work.work);
+ container_of(work, struct nfsd4_callback, cb_work);
struct nfs4_client *clp = cb->cb_clp;
struct rpc_clnt *clnt;
int flags;
@@ -1502,16 +1494,8 @@ nfsd4_run_cb_work(struct work_struct *work)
clnt = clp->cl_cb_client;
if (!clnt) {
- if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
- nfsd41_destroy_cb(cb);
- else {
- /*
- * XXX: Ideally, we could wait for the client to
- * reconnect, but I haven't figured out how
- * to do that yet.
- */
- nfsd4_queue_cb_delayed(cb, 25);
- }
+ /* Callback channel broken, or client killed; give up: */
+ nfsd41_destroy_cb(cb);
return;
}
@@ -1544,7 +1528,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
cb->cb_msg.rpc_argp = cb;
cb->cb_msg.rpc_resp = cb;
cb->cb_ops = ops;
- INIT_DELAYED_WORK(&cb->cb_work, nfsd4_run_cb_work);
+ INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
cb->cb_status = 0;
cb->cb_need_restart = false;
cb->cb_holds_slot = false;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index fac938f563ad..a644460f3a5e 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3490,11 +3490,13 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
struct dentry *dentry, const u32 *bmval,
int ignore_crossmnt)
{
+ DECLARE_BITMAP(attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops));
struct nfsd4_fattr_args args;
struct svc_fh *tempfh = NULL;
int starting_len = xdr->buf->len;
__be32 *attrlen_p, status;
int attrlen_offset;
+ u32 attrmask[3];
int err;
struct nfsd4_compoundres *resp = rqstp->rq_resp;
u32 minorversion = resp->cstate.minorversion;
@@ -3502,10 +3504,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
.mnt = exp->ex_path.mnt,
.dentry = dentry,
};
- union {
- u32 attrmask[3];
- unsigned long mask[2];
- } u;
unsigned long bit;
bool file_modified = false;
u64 size = 0;
@@ -3517,24 +3515,24 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
args.exp = exp;
args.dentry = dentry;
args.ignore_crossmnt = (ignore_crossmnt != 0);
+ args.acl = NULL;
/*
* Make a local copy of the attribute bitmap that can be modified.
*/
- memset(&u, 0, sizeof(u));
- u.attrmask[0] = bmval[0];
- u.attrmask[1] = bmval[1];
- u.attrmask[2] = bmval[2];
+ attrmask[0] = bmval[0];
+ attrmask[1] = bmval[1];
+ attrmask[2] = bmval[2];
args.rdattr_err = 0;
if (exp->ex_fslocs.migrated) {
- status = fattr_handle_absent_fs(&u.attrmask[0], &u.attrmask[1],
- &u.attrmask[2], &args.rdattr_err);
+ status = fattr_handle_absent_fs(&attrmask[0], &attrmask[1],
+ &attrmask[2], &args.rdattr_err);
if (status)
goto out;
}
args.size = 0;
- if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
+ if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry),
&file_modified, &size);
if (status)
@@ -3553,16 +3551,16 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
if (!(args.stat.result_mask & STATX_BTIME))
/* underlying FS does not offer btime so we can't share it */
- u.attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
- if ((u.attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
+ attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
+ if ((attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
- (u.attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
+ (attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
FATTR4_WORD1_SPACE_TOTAL))) {
err = vfs_statfs(&path, &args.statfs);
if (err)
goto out_nfserr;
}
- if ((u.attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) &&
+ if ((attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) &&
!fhp) {
tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
status = nfserr_jukebox;
@@ -3576,11 +3574,10 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
} else
args.fhp = fhp;
- args.acl = NULL;
- if (u.attrmask[0] & FATTR4_WORD0_ACL) {
+ if (attrmask[0] & FATTR4_WORD0_ACL) {
err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl);
if (err == -EOPNOTSUPP)
- u.attrmask[0] &= ~FATTR4_WORD0_ACL;
+ attrmask[0] &= ~FATTR4_WORD0_ACL;
else if (err == -EINVAL) {
status = nfserr_attrnotsupp;
goto out;
@@ -3592,17 +3589,17 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
args.context = NULL;
- if ((u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) ||
- u.attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
+ if ((attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) ||
+ attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
if (exp->ex_flags & NFSEXP_SECURITY_LABEL)
err = security_inode_getsecctx(d_inode(dentry),
&args.context, &args.contextlen);
else
err = -EOPNOTSUPP;
args.contextsupport = (err == 0);
- if (u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) {
+ if (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) {
if (err == -EOPNOTSUPP)
- u.attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
else if (err)
goto out_nfserr;
}
@@ -3610,8 +3607,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
/* attrmask */
- status = nfsd4_encode_bitmap4(xdr, u.attrmask[0],
- u.attrmask[1], u.attrmask[2]);
+ status = nfsd4_encode_bitmap4(xdr, attrmask[0], attrmask[1],
+ attrmask[2]);
if (status)
goto out;
@@ -3620,7 +3617,9 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
attrlen_p = xdr_reserve_space(xdr, XDR_UNIT);
if (!attrlen_p)
goto out_resource;
- for_each_set_bit(bit, (const unsigned long *)&u.mask,
+ bitmap_from_arr32(attr_bitmap, attrmask,
+ ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops));
+ for_each_set_bit(bit, attr_bitmap,
ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) {
status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args);
if (status != nfs_ok)
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 01c6f3445646..2ed0fcf879fd 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -68,7 +68,7 @@ struct nfsd4_callback {
struct nfs4_client *cb_clp;
struct rpc_message cb_msg;
const struct nfsd4_callback_ops *cb_ops;
- struct delayed_work cb_work;
+ struct work_struct cb_work;
int cb_seq_status;
int cb_status;
bool cb_need_restart;
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index bc846b904b68..aee40db7a036 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -240,7 +240,7 @@ nilfs_filetype_table[NILFS_FT_MAX] = {
#define S_SHIFT 12
static unsigned char
-nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+nilfs_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
[S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = NILFS_FT_DIR,
[S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV,
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f1a01c191cf5..8be471ce4f19 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -60,7 +60,7 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
if (argv->v_nmembs == 0)
return 0;
- if (argv->v_size > PAGE_SIZE)
+ if ((size_t)argv->v_size > PAGE_SIZE)
return -EINVAL;
/*
diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig
index cdfdf51e55d7..7bc31d69f680 100644
--- a/fs/ntfs3/Kconfig
+++ b/fs/ntfs3/Kconfig
@@ -46,3 +46,12 @@ config NTFS3_FS_POSIX_ACL
NOTE: this is linux only feature. Windows will ignore these ACLs.
If you don't know what Access Control Lists are, say N.
+
+config NTFS_FS
+ tristate "NTFS file system support"
+ select NTFS3_FS
+ select BUFFER_HEAD
+ select NLS
+ help
+ This config option is here only for backward compatibility. NTFS
+ filesystem is now handled by the NTFS3 driver.
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index 5cf3d9decf64..263635199b60 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -616,4 +616,11 @@ const struct file_operations ntfs_dir_operations = {
.compat_ioctl = ntfs_compat_ioctl,
#endif
};
+
+const struct file_operations ntfs_legacy_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = ntfs_readdir,
+ .open = ntfs_file_open,
+};
// clang-format on
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 5418662c80d8..b73969e05052 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -1236,4 +1236,12 @@ const struct file_operations ntfs_file_operations = {
.fallocate = ntfs_fallocate,
.release = ntfs_file_release,
};
+
+const struct file_operations ntfs_legacy_file_operations = {
+ .llseek = generic_file_llseek,
+ .read_iter = ntfs_file_read_iter,
+ .splice_read = ntfs_file_splice_read,
+ .open = ntfs_file_open,
+ .release = ntfs_file_release,
+};
// clang-format on
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index eb7a8c9fba01..d273eda1cf45 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -440,7 +440,10 @@ end_enum:
* Usually a hard links to directories are disabled.
*/
inode->i_op = &ntfs_dir_inode_operations;
- inode->i_fop = &ntfs_dir_operations;
+ if (is_legacy_ntfs(inode->i_sb))
+ inode->i_fop = &ntfs_legacy_dir_operations;
+ else
+ inode->i_fop = &ntfs_dir_operations;
ni->i_valid = 0;
} else if (S_ISLNK(mode)) {
ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
@@ -450,7 +453,10 @@ end_enum:
} else if (S_ISREG(mode)) {
ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
inode->i_op = &ntfs_file_inode_operations;
- inode->i_fop = &ntfs_file_operations;
+ if (is_legacy_ntfs(inode->i_sb))
+ inode->i_fop = &ntfs_legacy_file_operations;
+ else
+ inode->i_fop = &ntfs_file_operations;
inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr :
&ntfs_aops;
if (ino != MFT_REC_MFT)
@@ -1614,7 +1620,10 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
if (S_ISDIR(mode)) {
inode->i_op = &ntfs_dir_inode_operations;
- inode->i_fop = &ntfs_dir_operations;
+ if (is_legacy_ntfs(inode->i_sb))
+ inode->i_fop = &ntfs_legacy_dir_operations;
+ else
+ inode->i_fop = &ntfs_dir_operations;
} else if (S_ISLNK(mode)) {
inode->i_op = &ntfs_link_inode_operations;
inode->i_fop = NULL;
@@ -1623,7 +1632,10 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
inode_nohighmem(inode);
} else if (S_ISREG(mode)) {
inode->i_op = &ntfs_file_inode_operations;
- inode->i_fop = &ntfs_file_operations;
+ if (is_legacy_ntfs(inode->i_sb))
+ inode->i_fop = &ntfs_legacy_file_operations;
+ else
+ inode->i_fop = &ntfs_file_operations;
inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr :
&ntfs_aops;
init_rwsem(&ni->file.run_lock);
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 79356fd29a14..5f4d288c6adf 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -493,6 +493,7 @@ struct inode *dir_search_u(struct inode *dir, const struct cpu_str *uni,
struct ntfs_fnd *fnd);
bool dir_is_empty(struct inode *dir);
extern const struct file_operations ntfs_dir_operations;
+extern const struct file_operations ntfs_legacy_dir_operations;
/* Globals from file.c */
int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path,
@@ -507,6 +508,7 @@ long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg);
extern const struct inode_operations ntfs_special_inode_operations;
extern const struct inode_operations ntfs_file_inode_operations;
extern const struct file_operations ntfs_file_operations;
+extern const struct file_operations ntfs_legacy_file_operations;
/* Globals from frecord.c */
void ni_remove_mi(struct ntfs_inode *ni, struct mft_inode *mi);
@@ -1154,4 +1156,6 @@ static inline void le64_sub_cpu(__le64 *var, u64 val)
*var = cpu_to_le64(le64_to_cpu(*var) - val);
}
+bool is_legacy_ntfs(struct super_block *sb);
+
#endif /* _LINUX_NTFS3_NTFS_FS_H */
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 9df7c20d066f..b26d95a8d327 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -408,6 +408,12 @@ static int ntfs_fs_reconfigure(struct fs_context *fc)
struct ntfs_mount_options *new_opts = fc->fs_private;
int ro_rw;
+ /* If ntfs3 is used as legacy ntfs enforce read-only mode. */
+ if (is_legacy_ntfs(sb)) {
+ fc->sb_flags |= SB_RDONLY;
+ goto out;
+ }
+
ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY);
if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) {
errorf(fc,
@@ -427,8 +433,6 @@ static int ntfs_fs_reconfigure(struct fs_context *fc)
fc,
"ntfs3: Cannot use different iocharset when remounting!");
- sync_filesystem(sb);
-
if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) &&
!new_opts->force) {
errorf(fc,
@@ -436,6 +440,8 @@ static int ntfs_fs_reconfigure(struct fs_context *fc)
return -EINVAL;
}
+out:
+ sync_filesystem(sb);
swap(sbi->options, fc->fs_private);
return 0;
@@ -1613,6 +1619,8 @@ load_root:
}
#endif
+ if (is_legacy_ntfs(sb))
+ sb->s_flags |= SB_RDONLY;
return 0;
put_inode_out:
@@ -1730,7 +1738,7 @@ static const struct fs_context_operations ntfs_context_ops = {
* This will called when mount/remount. We will first initialize
* options so that if remount we can use just that.
*/
-static int ntfs_init_fs_context(struct fs_context *fc)
+static int __ntfs_init_fs_context(struct fs_context *fc)
{
struct ntfs_mount_options *opts;
struct ntfs_sb_info *sbi;
@@ -1778,6 +1786,11 @@ free_opts:
return -ENOMEM;
}
+static int ntfs_init_fs_context(struct fs_context *fc)
+{
+ return __ntfs_init_fs_context(fc);
+}
+
static void ntfs3_kill_sb(struct super_block *sb)
{
struct ntfs_sb_info *sbi = sb->s_fs_info;
@@ -1798,6 +1811,50 @@ static struct file_system_type ntfs_fs_type = {
.kill_sb = ntfs3_kill_sb,
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
+
+#if IS_ENABLED(CONFIG_NTFS_FS)
+static int ntfs_legacy_init_fs_context(struct fs_context *fc)
+{
+ int ret;
+
+ ret = __ntfs_init_fs_context(fc);
+ /* If ntfs3 is used as legacy ntfs enforce read-only mode. */
+ fc->sb_flags |= SB_RDONLY;
+ return ret;
+}
+
+static struct file_system_type ntfs_legacy_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ntfs",
+ .init_fs_context = ntfs_legacy_init_fs_context,
+ .parameters = ntfs_fs_parameters,
+ .kill_sb = ntfs3_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+};
+MODULE_ALIAS_FS("ntfs");
+
+static inline void register_as_ntfs_legacy(void)
+{
+ int err = register_filesystem(&ntfs_legacy_fs_type);
+ if (err)
+ pr_warn("ntfs3: Failed to register legacy ntfs filesystem driver: %d\n", err);
+}
+
+static inline void unregister_as_ntfs_legacy(void)
+{
+ unregister_filesystem(&ntfs_legacy_fs_type);
+}
+bool is_legacy_ntfs(struct super_block *sb)
+{
+ return sb->s_type == &ntfs_legacy_fs_type;
+}
+#else
+static inline void register_as_ntfs_legacy(void) {}
+static inline void unregister_as_ntfs_legacy(void) {}
+bool is_legacy_ntfs(struct super_block *sb) { return false; }
+#endif
+
+
// clang-format on
static int __init init_ntfs_fs(void)
@@ -1832,6 +1889,7 @@ static int __init init_ntfs_fs(void)
goto out1;
}
+ register_as_ntfs_legacy();
err = register_filesystem(&ntfs_fs_type);
if (err)
goto out;
@@ -1849,6 +1907,7 @@ static void __exit exit_ntfs_fs(void)
rcu_barrier();
kmem_cache_destroy(ntfs_inode_cachep);
unregister_filesystem(&ntfs_fs_type);
+ unregister_as_ntfs_legacy();
ntfs3_exit_bitmap();
#ifdef CONFIG_PROC_FS
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 4a0779e3ef79..a7b527ea50d3 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -355,10 +355,10 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
return inode;
}
-static int openprom_remount(struct super_block *sb, int *flags, char *data)
+static int openpromfs_reconfigure(struct fs_context *fc)
{
- sync_filesystem(sb);
- *flags |= SB_NOATIME;
+ sync_filesystem(fc->root->d_sb);
+ fc->sb_flags |= SB_NOATIME;
return 0;
}
@@ -366,7 +366,6 @@ static const struct super_operations openprom_sops = {
.alloc_inode = openprom_alloc_inode,
.free_inode = openprom_free_inode,
.statfs = simple_statfs,
- .remount_fs = openprom_remount,
};
static int openprom_fill_super(struct super_block *s, struct fs_context *fc)
@@ -415,6 +414,7 @@ static int openpromfs_get_tree(struct fs_context *fc)
static const struct fs_context_operations openpromfs_context_ops = {
.get_tree = openpromfs_get_tree,
+ .reconfigure = openpromfs_reconfigure,
};
static int openpromfs_init_fs_context(struct fs_context *fc)
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
index 8bbe9486e3a6..395a00ed8ac7 100644
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
@@ -33,9 +33,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
new_op->upcall.req.lookup.parent_refn = parent->refn;
- strncpy(new_op->upcall.req.lookup.d_name,
- dentry->d_name.name,
- ORANGEFS_NAME_MAX - 1);
+ strscpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name);
gossip_debug(GOSSIP_DCACHE_DEBUG,
"%s:%s:%d interrupt flag [%d]\n",
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index c9dfd5c6a097..200558ec72f0 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -41,8 +41,7 @@ static int orangefs_create(struct mnt_idmap *idmap,
fill_default_sys_attrs(new_op->upcall.req.create.attributes,
ORANGEFS_TYPE_METAFILE, mode);
- strncpy(new_op->upcall.req.create.d_name,
- dentry->d_name.name, ORANGEFS_NAME_MAX - 1);
+ strscpy(new_op->upcall.req.create.d_name, dentry->d_name.name);
ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
@@ -137,8 +136,7 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
&parent->refn.khandle);
new_op->upcall.req.lookup.parent_refn = parent->refn;
- strncpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name,
- ORANGEFS_NAME_MAX - 1);
+ strscpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name);
gossip_debug(GOSSIP_NAME_DEBUG,
"%s: doing lookup on %s under %pU,%d\n",
@@ -192,8 +190,7 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry)
return -ENOMEM;
new_op->upcall.req.remove.parent_refn = parent->refn;
- strncpy(new_op->upcall.req.remove.d_name, dentry->d_name.name,
- ORANGEFS_NAME_MAX - 1);
+ strscpy(new_op->upcall.req.remove.d_name, dentry->d_name.name);
ret = service_operation(new_op, "orangefs_unlink",
get_interruptible_flag(inode));
@@ -247,10 +244,8 @@ static int orangefs_symlink(struct mnt_idmap *idmap,
ORANGEFS_TYPE_SYMLINK,
mode);
- strncpy(new_op->upcall.req.sym.entry_name,
- dentry->d_name.name,
- ORANGEFS_NAME_MAX - 1);
- strncpy(new_op->upcall.req.sym.target, symname, ORANGEFS_NAME_MAX - 1);
+ strscpy(new_op->upcall.req.sym.entry_name, dentry->d_name.name);
+ strscpy(new_op->upcall.req.sym.target, symname);
ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
@@ -324,8 +319,7 @@ static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes,
ORANGEFS_TYPE_DIRECTORY, mode);
- strncpy(new_op->upcall.req.mkdir.d_name,
- dentry->d_name.name, ORANGEFS_NAME_MAX - 1);
+ strscpy(new_op->upcall.req.mkdir.d_name, dentry->d_name.name);
ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
@@ -405,12 +399,8 @@ static int orangefs_rename(struct mnt_idmap *idmap,
new_op->upcall.req.rename.old_parent_refn = ORANGEFS_I(old_dir)->refn;
new_op->upcall.req.rename.new_parent_refn = ORANGEFS_I(new_dir)->refn;
- strncpy(new_op->upcall.req.rename.d_old_name,
- old_dentry->d_name.name,
- ORANGEFS_NAME_MAX - 1);
- strncpy(new_op->upcall.req.rename.d_new_name,
- new_dentry->d_name.name,
- ORANGEFS_NAME_MAX - 1);
+ strscpy(new_op->upcall.req.rename.d_old_name, old_dentry->d_name.name);
+ strscpy(new_op->upcall.req.rename.d_new_name, new_dentry->d_name.name);
ret = service_operation(new_op,
"orangefs_rename",
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 34849b4a3243..fb4d09c2f531 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -253,9 +253,8 @@ int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb)
new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT);
if (!new_op)
return -ENOMEM;
- strncpy(new_op->upcall.req.fs_mount.orangefs_config_server,
- orangefs_sb->devname,
- ORANGEFS_MAX_SERVER_ADDR_LEN);
+ strscpy(new_op->upcall.req.fs_mount.orangefs_config_server,
+ orangefs_sb->devname);
gossip_debug(GOSSIP_SUPER_DEBUG,
"Attempting ORANGEFS Remount via host %s\n",
@@ -400,8 +399,7 @@ static int orangefs_unmount(int id, __s32 fs_id, const char *devname)
return -ENOMEM;
op->upcall.req.fs_umount.id = id;
op->upcall.req.fs_umount.fs_id = fs_id;
- strncpy(op->upcall.req.fs_umount.orangefs_config_server,
- devname, ORANGEFS_MAX_SERVER_ADDR_LEN - 1);
+ strscpy(op->upcall.req.fs_umount.orangefs_config_server, devname);
r = service_operation(op, "orangefs_fs_umount", 0);
/* Not much to do about an error here. */
if (r)
@@ -494,9 +492,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
if (!new_op)
return ERR_PTR(-ENOMEM);
- strncpy(new_op->upcall.req.fs_mount.orangefs_config_server,
- devname,
- ORANGEFS_MAX_SERVER_ADDR_LEN - 1);
+ strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, devname);
gossip_debug(GOSSIP_SUPER_DEBUG,
"Attempting ORANGEFS Mount via host %s\n",
@@ -543,9 +539,8 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
* on successful mount, store the devname and data
* used
*/
- strncpy(ORANGEFS_SB(sb)->devname,
- devname,
- ORANGEFS_MAX_SERVER_ADDR_LEN - 1);
+ strscpy(ORANGEFS_SB(sb)->devname, devname);
+
/* mount_pending must be cleared */
ORANGEFS_SB(sb)->mount_pending = 0;
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 36dcc530ac28..4860fcc4611b 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -139,10 +139,6 @@ static int ovl_verity_mode_def(void)
return OVL_VERITY_OFF;
}
-#define fsparam_string_empty(NAME, OPT) \
- __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
-
-
const struct fs_parameter_spec ovl_parameter_spec[] = {
fsparam_string_empty("lowerdir", Opt_lowerdir),
fsparam_string("lowerdir+", Opt_lowerdir_add),
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 902b326e1e56..87dcaae32ff8 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -62,12 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
break;
dst += ret;
}
- if (ret >= 0 && boot_command_line[0]) {
- ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n",
- boot_command_line);
- if (ret > 0)
- dst += ret;
- }
+ }
+ if (cmdline_has_extra_options() && ret >= 0 && boot_command_line[0]) {
+ ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n",
+ boot_command_line);
+ if (ret > 0)
+ dst += ret;
}
out:
kfree(key);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6e72e5ad42bc..f4b1c8b42a51 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -74,7 +74,18 @@ out:
return 0;
}
-static int proc_fdinfo_access_allowed(struct inode *inode)
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, seq_show, inode);
+}
+
+/**
+ * Shared /proc/pid/fdinfo and /proc/pid/fdinfo/fd permission helper to ensure
+ * that the current task has PTRACE_MODE_READ in addition to the normal
+ * POSIX-like checks.
+ */
+static int proc_fdinfo_permission(struct mnt_idmap *idmap, struct inode *inode,
+ int mask)
{
bool allowed = false;
struct task_struct *task = get_proc_task(inode);
@@ -88,18 +99,13 @@ static int proc_fdinfo_access_allowed(struct inode *inode)
if (!allowed)
return -EACCES;
- return 0;
+ return generic_permission(idmap, inode, mask);
}
-static int seq_fdinfo_open(struct inode *inode, struct file *file)
-{
- int ret = proc_fdinfo_access_allowed(inode);
-
- if (ret)
- return ret;
-
- return single_open(file, seq_show, inode);
-}
+static const struct inode_operations proc_fdinfo_file_inode_operations = {
+ .permission = proc_fdinfo_permission,
+ .setattr = proc_setattr,
+};
static const struct file_operations proc_fdinfo_file_operations = {
.open = seq_fdinfo_open,
@@ -388,6 +394,8 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
ei = PROC_I(inode);
ei->fd = data->fd;
+ inode->i_op = &proc_fdinfo_file_inode_operations;
+
inode->i_fop = &proc_fdinfo_file_operations;
tid_fd_update_inode(task, inode, 0);
@@ -407,23 +415,13 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
proc_fdinfo_instantiate);
}
-static int proc_open_fdinfo(struct inode *inode, struct file *file)
-{
- int ret = proc_fdinfo_access_allowed(inode);
-
- if (ret)
- return ret;
-
- return 0;
-}
-
const struct inode_operations proc_fdinfo_inode_operations = {
.lookup = proc_lookupfdinfo,
+ .permission = proc_fdinfo_permission,
.setattr = proc_setattr,
};
const struct file_operations proc_fdinfo_operations = {
- .open = proc_open_fdinfo,
.read = generic_read_dir,
.iterate_shared = proc_readfdinfo,
.llseek = generic_file_llseek,
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 195b077c0fac..9223856c934b 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -67,7 +67,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
*/
ppage = pfn_to_online_page(pfn);
- if (!ppage || PageSlab(ppage) || page_has_type(ppage))
+ if (!ppage)
pcount = 0;
else
pcount = page_mapcount(ppage);
@@ -124,11 +124,8 @@ u64 stable_page_flags(struct page *page)
/*
* pseudo flags for the well known (anonymous) memory mapped pages
- *
- * Note that page->_mapcount is overloaded in SLAB, so the
- * simple test in page_mapped() is not enough.
*/
- if (!PageSlab(page) && page_mapped(page))
+ if (page_mapped(page))
u |= 1 << KPF_MMAP;
if (PageAnon(page))
u |= 1 << KPF_ANON;
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 2ba31b6d68c0..52f0b75cbce2 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -135,6 +135,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data);
* @parent: The parent directory in which to create.
* @ops: The seq_file ops with which to read the file.
* @write: The write method with which to 'modify' the file.
+ * @state_size: The size of the per-file private state to allocate.
* @data: Data for retrieval by pde_data().
*
* Create a network namespaced proc file in the @parent directory with the
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 23fbab954c20..102f48668c35 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1817,15 +1817,13 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
}
static void make_uffd_wp_pte(struct vm_area_struct *vma,
- unsigned long addr, pte_t *pte)
+ unsigned long addr, pte_t *pte, pte_t ptent)
{
- pte_t ptent = ptep_get(pte);
-
if (pte_present(ptent)) {
pte_t old_pte;
old_pte = ptep_modify_prot_start(vma, addr, pte);
- ptent = pte_mkuffd_wp(ptent);
+ ptent = pte_mkuffd_wp(old_pte);
ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_mkuffd_wp(ptent);
@@ -2175,9 +2173,12 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
/* Fast path for performing exclusive WP */
for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
- if (pte_uffd_wp(ptep_get(pte)))
+ pte_t ptent = ptep_get(pte);
+
+ if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
+ pte_swp_uffd_wp_any(ptent))
continue;
- make_uffd_wp_pte(vma, addr, pte);
+ make_uffd_wp_pte(vma, addr, pte, ptent);
if (!flush_end)
start = addr;
flush_end = addr + PAGE_SIZE;
@@ -2190,8 +2191,10 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
p->arg.return_mask == PAGE_IS_WRITTEN) {
for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
unsigned long next = addr + PAGE_SIZE;
+ pte_t ptent = ptep_get(pte);
- if (pte_uffd_wp(ptep_get(pte)))
+ if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
+ pte_swp_uffd_wp_any(ptent))
continue;
ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
p, addr, &next);
@@ -2199,7 +2202,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
break;
if (~p->arg.flags & PM_SCAN_WP_MATCHING)
continue;
- make_uffd_wp_pte(vma, addr, pte);
+ make_uffd_wp_pte(vma, addr, pte, ptent);
if (!flush_end)
start = addr;
flush_end = next;
@@ -2208,8 +2211,9 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
}
for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ pte_t ptent = ptep_get(pte);
unsigned long categories = p->cur_vma_category |
- pagemap_page_category(p, vma, addr, ptep_get(pte));
+ pagemap_page_category(p, vma, addr, ptent);
unsigned long next = addr + PAGE_SIZE;
if (!pagemap_scan_is_interesting_page(categories, p))
@@ -2224,7 +2228,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
if (~categories & PAGE_IS_WRITTEN)
continue;
- make_uffd_wp_pte(vma, addr, pte);
+ make_uffd_wp_pte(vma, addr, pte, ptent);
if (!flush_end)
start = addr;
flush_end = next;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 405913f4faff..d62fbef838b6 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -19,11 +19,11 @@
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
-#include <linux/parser.h>
#include <linux/seq_file.h>
-#include <linux/mount.h>
#include <linux/crc32.h>
#include <linux/mpage.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
#include "qnx6.h"
static const struct super_operations qnx6_sops;
@@ -31,7 +31,7 @@ static const struct super_operations qnx6_sops;
static void qnx6_put_super(struct super_block *sb);
static struct inode *qnx6_alloc_inode(struct super_block *sb);
static void qnx6_free_inode(struct inode *inode);
-static int qnx6_remount(struct super_block *sb, int *flags, char *data);
+static int qnx6_reconfigure(struct fs_context *fc);
static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf);
static int qnx6_show_options(struct seq_file *seq, struct dentry *root);
@@ -40,7 +40,6 @@ static const struct super_operations qnx6_sops = {
.free_inode = qnx6_free_inode,
.put_super = qnx6_put_super,
.statfs = qnx6_statfs,
- .remount_fs = qnx6_remount,
.show_options = qnx6_show_options,
};
@@ -54,10 +53,12 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
-static int qnx6_remount(struct super_block *sb, int *flags, char *data)
+static int qnx6_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
+
sync_filesystem(sb);
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
return 0;
}
@@ -218,39 +219,36 @@ void qnx6_superblock_debug(struct qnx6_super_block *sb, struct super_block *s)
#endif
enum {
- Opt_mmifs,
- Opt_err
+ Opt_mmifs
+};
+
+struct qnx6_context {
+ unsigned long s_mount_opts;
};
-static const match_table_t tokens = {
- {Opt_mmifs, "mmi_fs"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec qnx6_param_spec[] = {
+ fsparam_flag ("mmi_fs", Opt_mmifs),
+ {}
};
-static int qnx6_parse_options(char *options, struct super_block *sb)
+static int qnx6_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- struct qnx6_sb_info *sbi = QNX6_SB(sb);
- substring_t args[MAX_OPT_ARGS];
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_mmifs:
- set_opt(sbi->s_mount_opt, MMI_FS);
- break;
- default:
- return 0;
- }
+ struct qnx6_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, qnx6_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_mmifs:
+ ctx->s_mount_opts |= QNX6_MOUNT_MMI_FS;
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
static struct buffer_head *qnx6_check_first_superblock(struct super_block *s,
@@ -293,22 +291,25 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s,
static struct inode *qnx6_private_inode(struct super_block *s,
struct qnx6_root_node *p);
-static int qnx6_fill_super(struct super_block *s, void *data, int silent)
+static int qnx6_fill_super(struct super_block *s, struct fs_context *fc)
{
struct buffer_head *bh1 = NULL, *bh2 = NULL;
struct qnx6_super_block *sb1 = NULL, *sb2 = NULL;
struct qnx6_sb_info *sbi;
+ struct qnx6_context *ctx = fc->fs_private;
struct inode *root;
const char *errmsg;
struct qnx6_sb_info *qs;
int ret = -EINVAL;
u64 offset;
int bootblock_offset = QNX6_BOOTBLOCK_SIZE;
+ int silent = fc->sb_flags & SB_SILENT;
qs = kzalloc(sizeof(struct qnx6_sb_info), GFP_KERNEL);
if (!qs)
return -ENOMEM;
s->s_fs_info = qs;
+ qs->s_mount_opt = ctx->s_mount_opts;
/* Superblock always is 512 Byte long */
if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) {
@@ -316,12 +317,7 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
goto outnobh;
}
- /* parse the mount-options */
- if (!qnx6_parse_options((char *) data, s)) {
- pr_err("invalid mount options.\n");
- goto outnobh;
- }
- if (test_opt(s, MMI_FS)) {
+ if (qs->s_mount_opt == QNX6_MOUNT_MMI_FS) {
sb1 = qnx6_mmi_fill_super(s, silent);
if (sb1)
goto mmi_success;
@@ -632,18 +628,43 @@ static void destroy_inodecache(void)
kmem_cache_destroy(qnx6_inode_cachep);
}
-static struct dentry *qnx6_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int qnx6_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, qnx6_fill_super);
+}
+
+static void qnx6_free_fc(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, qnx6_fill_super);
+ kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations qnx6_context_ops = {
+ .parse_param = qnx6_parse_param,
+ .get_tree = qnx6_get_tree,
+ .reconfigure = qnx6_reconfigure,
+ .free = qnx6_free_fc,
+};
+
+static int qnx6_init_fs_context(struct fs_context *fc)
+{
+ struct qnx6_context *ctx;
+
+ ctx = kzalloc(sizeof(struct qnx6_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ fc->ops = &qnx6_context_ops;
+ fc->fs_private = ctx;
+
+ return 0;
}
static struct file_system_type qnx6_fs_type = {
- .owner = THIS_MODULE,
- .name = "qnx6",
- .mount = qnx6_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "qnx6",
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = qnx6_init_fs_context,
+ .parameters = qnx6_param_spec,
};
MODULE_ALIAS_FS("qnx6");
diff --git a/fs/read_write.c b/fs/read_write.c
index d4c036e82b6c..2115d1f40bd5 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1685,7 +1685,7 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
if ((iocb->ki_flags & IOCB_NOWAIT) &&
!((iocb->ki_flags & IOCB_DIRECT) ||
- (file->f_mode & FMODE_BUF_WASYNC)))
+ (file->f_op->fop_flags & FOP_BUFFER_WASYNC)))
return -EINVAL;
return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index f5fdaf3b1572..e676c8b0cf5d 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -669,18 +669,11 @@ void seq_putc(struct seq_file *m, char c)
}
EXPORT_SYMBOL(seq_putc);
-void seq_puts(struct seq_file *m, const char *s)
+void __seq_puts(struct seq_file *m, const char *s)
{
- int len = strlen(s);
-
- if (m->count + len >= m->size) {
- seq_set_overflow(m);
- return;
- }
- memcpy(m->buf + m->count, s, len);
- m->count += len;
+ seq_write(m, s, strlen(s));
}
-EXPORT_SYMBOL(seq_puts);
+EXPORT_SYMBOL(__seq_puts);
/**
* seq_put_decimal_ull_width - A helper routine for putting decimal numbers
diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig
index 2927bd174a88..2517dc242386 100644
--- a/fs/smb/client/Kconfig
+++ b/fs/smb/client/Kconfig
@@ -2,6 +2,7 @@
config CIFS
tristate "SMB3 and CIFS support (advanced network filesystem)"
depends on INET
+ select NETFS_SUPPORT
select NLS
select NLS_UCS2_UTILS
select CRYPTO
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index 13a9d7acf8f8..0ff2491c311d 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -433,8 +433,8 @@ smb2_close_cached_fid(struct kref *ref)
if (cfid->is_open) {
rc = SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
cfid->fid.volatile_fid);
- if (rc != -EBUSY && rc != -EAGAIN)
- atomic_dec(&cfid->tcon->num_remote_opens);
+ if (rc) /* should we retry on -EBUSY or -EAGAIN? */
+ cifs_dbg(VFS, "close cached dir rc %d\n", rc);
}
free_cached_dir(cfid);
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index d41eedbff674..6e1698614745 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -371,9 +371,13 @@ static struct kmem_cache *cifs_inode_cachep;
static struct kmem_cache *cifs_req_cachep;
static struct kmem_cache *cifs_mid_cachep;
static struct kmem_cache *cifs_sm_req_cachep;
+static struct kmem_cache *cifs_io_request_cachep;
+static struct kmem_cache *cifs_io_subrequest_cachep;
mempool_t *cifs_sm_req_poolp;
mempool_t *cifs_req_poolp;
mempool_t *cifs_mid_poolp;
+mempool_t cifs_io_request_pool;
+mempool_t cifs_io_subrequest_pool;
static struct inode *
cifs_alloc_inode(struct super_block *sb)
@@ -389,6 +393,7 @@ cifs_alloc_inode(struct super_block *sb)
* server, can not assume caching of file data or metadata.
*/
cifs_set_oplock_level(cifs_inode, 0);
+ cifs_inode->lease_granted = false;
cifs_inode->flags = 0;
spin_lock_init(&cifs_inode->writers_lock);
cifs_inode->writers = 0;
@@ -739,6 +744,8 @@ static void cifs_umount_begin(struct super_block *sb)
spin_lock(&cifs_tcp_ses_lock);
spin_lock(&tcon->tc_lock);
+ trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count,
+ netfs_trace_tcon_ref_see_umount);
if ((tcon->tc_count > 1) || (tcon->status == TID_EXITING)) {
/* we have other mounts to same share or we have
already tried to umount this and woken up
@@ -983,61 +990,6 @@ out:
return root;
}
-
-static ssize_t
-cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
-{
- ssize_t rc;
- struct inode *inode = file_inode(iocb->ki_filp);
-
- if (iocb->ki_flags & IOCB_DIRECT)
- return cifs_user_readv(iocb, iter);
-
- rc = cifs_revalidate_mapping(inode);
- if (rc)
- return rc;
-
- return generic_file_read_iter(iocb, iter);
-}
-
-static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct cifsInodeInfo *cinode = CIFS_I(inode);
- ssize_t written;
- int rc;
-
- if (iocb->ki_filp->f_flags & O_DIRECT) {
- written = cifs_user_writev(iocb, from);
- if (written > 0 && CIFS_CACHE_READ(cinode)) {
- cifs_zap_mapping(inode);
- cifs_dbg(FYI,
- "Set no oplock for inode=%p after a write operation\n",
- inode);
- cinode->oplock = 0;
- }
- return written;
- }
-
- written = cifs_get_writer(cinode);
- if (written)
- return written;
-
- written = generic_file_write_iter(iocb, from);
-
- if (CIFS_CACHE_WRITE(CIFS_I(inode)))
- goto out;
-
- rc = filemap_fdatawrite(inode->i_mapping);
- if (rc)
- cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n",
- rc, inode);
-
-out:
- cifs_put_writer(cinode);
- return written;
-}
-
static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
{
struct cifsFileInfo *cfile = file->private_data;
@@ -1339,6 +1291,8 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false);
if (rc)
goto unlock;
+ if (fend > target_cifsi->netfs.zero_point)
+ target_cifsi->netfs.zero_point = fend + 1;
/* Discard all the folios that overlap the destination region. */
cifs_dbg(FYI, "about to discard pages %llx-%llx\n", fstart, fend);
@@ -1357,6 +1311,8 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
fscache_resize_cookie(cifs_inode_cookie(target_inode),
new_size);
}
+ if (rc == 0 && new_size > target_cifsi->netfs.zero_point)
+ target_cifsi->netfs.zero_point = new_size;
}
/* force revalidate of size and timestamps of target file now
@@ -1448,6 +1404,8 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false);
if (rc)
goto unlock;
+ if (fend > target_cifsi->netfs.zero_point)
+ target_cifsi->netfs.zero_point = fend + 1;
/* Discard all the folios that overlap the destination region. */
truncate_inode_pages_range(&target_inode->i_data, fstart, fend);
@@ -1564,8 +1522,8 @@ const struct file_operations cifs_file_strict_ops = {
};
const struct file_operations cifs_file_direct_ops = {
- .read_iter = cifs_direct_readv,
- .write_iter = cifs_direct_writev,
+ .read_iter = netfs_unbuffered_read_iter,
+ .write_iter = netfs_file_write_iter,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -1620,8 +1578,8 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
};
const struct file_operations cifs_file_direct_nobrl_ops = {
- .read_iter = cifs_direct_readv,
- .write_iter = cifs_direct_writev,
+ .read_iter = netfs_unbuffered_read_iter,
+ .write_iter = netfs_file_write_iter,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_fsync,
@@ -1796,6 +1754,48 @@ static void destroy_mids(void)
kmem_cache_destroy(cifs_mid_cachep);
}
+static int cifs_init_netfs(void)
+{
+ cifs_io_request_cachep =
+ kmem_cache_create("cifs_io_request",
+ sizeof(struct cifs_io_request), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!cifs_io_request_cachep)
+ goto nomem_req;
+
+ if (mempool_init_slab_pool(&cifs_io_request_pool, 100, cifs_io_request_cachep) < 0)
+ goto nomem_reqpool;
+
+ cifs_io_subrequest_cachep =
+ kmem_cache_create("cifs_io_subrequest",
+ sizeof(struct cifs_io_subrequest), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!cifs_io_subrequest_cachep)
+ goto nomem_subreq;
+
+ if (mempool_init_slab_pool(&cifs_io_subrequest_pool, 100, cifs_io_subrequest_cachep) < 0)
+ goto nomem_subreqpool;
+
+ return 0;
+
+nomem_subreqpool:
+ kmem_cache_destroy(cifs_io_subrequest_cachep);
+nomem_subreq:
+ mempool_destroy(&cifs_io_request_pool);
+nomem_reqpool:
+ kmem_cache_destroy(cifs_io_request_cachep);
+nomem_req:
+ return -ENOMEM;
+}
+
+static void cifs_destroy_netfs(void)
+{
+ mempool_destroy(&cifs_io_subrequest_pool);
+ kmem_cache_destroy(cifs_io_subrequest_cachep);
+ mempool_destroy(&cifs_io_request_pool);
+ kmem_cache_destroy(cifs_io_request_cachep);
+}
+
static int __init
init_cifs(void)
{
@@ -1900,10 +1900,14 @@ init_cifs(void)
if (rc)
goto out_destroy_deferredclose_wq;
- rc = init_mids();
+ rc = cifs_init_netfs();
if (rc)
goto out_destroy_inodecache;
+ rc = init_mids();
+ if (rc)
+ goto out_destroy_netfs;
+
rc = cifs_init_request_bufs();
if (rc)
goto out_destroy_mids;
@@ -1958,6 +1962,8 @@ out_destroy_request_bufs:
cifs_destroy_request_bufs();
out_destroy_mids:
destroy_mids();
+out_destroy_netfs:
+ cifs_destroy_netfs();
out_destroy_inodecache:
cifs_destroy_inodecache();
out_destroy_deferredclose_wq:
@@ -1996,6 +2002,7 @@ exit_cifs(void)
#endif
cifs_destroy_request_bufs();
destroy_mids();
+ cifs_destroy_netfs();
cifs_destroy_inodecache();
destroy_workqueue(deferredclose_wq);
destroy_workqueue(cifsoplockd_wq);
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index ca55d01117c8..87310f05d397 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -69,7 +69,6 @@ extern int cifs_revalidate_file_attr(struct file *filp);
extern int cifs_revalidate_dentry_attr(struct dentry *);
extern int cifs_revalidate_file(struct file *filp);
extern int cifs_revalidate_dentry(struct dentry *);
-extern int cifs_invalidate_mapping(struct inode *inode);
extern int cifs_revalidate_mapping(struct inode *inode);
extern int cifs_zap_mapping(struct inode *inode);
extern int cifs_getattr(struct mnt_idmap *, const struct path *,
@@ -85,6 +84,7 @@ extern const struct inode_operations cifs_namespace_inode_operations;
/* Functions related to files and directories */
+extern const struct netfs_request_ops cifs_req_ops;
extern const struct file_operations cifs_file_ops;
extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */
extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */
@@ -94,12 +94,10 @@ extern const struct file_operations cifs_file_strict_nobrl_ops;
extern int cifs_open(struct inode *inode, struct file *file);
extern int cifs_close(struct inode *inode, struct file *file);
extern int cifs_closedir(struct inode *inode, struct file *file);
-extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
-extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
-extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
-extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from);
extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
+ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from);
+ssize_t cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter);
extern int cifs_flock(struct file *pfile, int cmd, struct file_lock *plock);
extern int cifs_lock(struct file *, int, struct file_lock *);
extern int cifs_fsync(struct file *, loff_t, loff_t, int);
@@ -110,9 +108,6 @@ extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma);
extern const struct file_operations cifs_dir_ops;
extern int cifs_dir_open(struct inode *inode, struct file *file);
extern int cifs_readdir(struct file *file, struct dir_context *ctx);
-extern void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len);
-extern void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len);
-extern void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int len);
/* Functions related to dir entries */
extern const struct dentry_operations cifs_dentry_ops;
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index f6a302205f89..65574e69ba4f 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -268,8 +268,7 @@ struct dfs_info3_param;
struct cifs_fattr;
struct smb3_fs_context;
struct cifs_fid;
-struct cifs_readdata;
-struct cifs_writedata;
+struct cifs_io_subrequest;
struct cifs_io_parms;
struct cifs_search_info;
struct cifsInodeInfo;
@@ -450,10 +449,9 @@ struct smb_version_operations {
/* send a flush request to the server */
int (*flush)(const unsigned int, struct cifs_tcon *, struct cifs_fid *);
/* async read from the server */
- int (*async_readv)(struct cifs_readdata *);
+ int (*async_readv)(struct cifs_io_subrequest *);
/* async write to the server */
- int (*async_writev)(struct cifs_writedata *,
- void (*release)(struct kref *));
+ void (*async_writev)(struct cifs_io_subrequest *);
/* sync read from the server */
int (*sync_read)(const unsigned int, struct cifs_fid *,
struct cifs_io_parms *, unsigned int *, char **,
@@ -548,8 +546,8 @@ struct smb_version_operations {
/* writepages retry size */
unsigned int (*wp_retry_size)(struct inode *);
/* get mtu credits */
- int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int,
- unsigned int *, struct cifs_credits *);
+ int (*wait_mtu_credits)(struct TCP_Server_Info *, size_t,
+ size_t *, struct cifs_credits *);
/* adjust previously taken mtu credits to request size */
int (*adjust_credits)(struct TCP_Server_Info *server,
struct cifs_credits *credits,
@@ -883,11 +881,12 @@ add_credits(struct TCP_Server_Info *server, const struct cifs_credits *credits,
static inline void
add_credits_and_wake_if(struct TCP_Server_Info *server,
- const struct cifs_credits *credits, const int optype)
+ struct cifs_credits *credits, const int optype)
{
if (credits->value) {
server->ops->add_credits(server, credits, optype);
wake_up(&server->request_q);
+ credits->value = 0;
}
}
@@ -1077,6 +1076,7 @@ struct cifs_ses {
and after mount option parsing we fill it */
char *domainName;
char *password;
+ char *password2; /* When key rotation used, new password may be set before it expires */
char workstation_name[CIFS_MAX_WORKSTATION_LEN];
struct session_key auth_key;
struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
@@ -1189,6 +1189,7 @@ struct cifs_fattr {
*/
struct cifs_tcon {
struct list_head tcon_list;
+ int debug_id; /* Debugging for tracing */
int tc_count;
struct list_head rlist; /* reconnect list */
spinlock_t tc_lock; /* protect anything here that is not protected */
@@ -1275,7 +1276,9 @@ struct cifs_tcon {
__u32 max_cached_dirs;
#ifdef CONFIG_CIFS_FSCACHE
u64 resource_id; /* server resource id */
+ bool fscache_acquired; /* T if we've tried acquiring a cookie */
struct fscache_volume *fscache; /* cookie for share */
+ struct mutex fscache_lock; /* Prevent regetting a cookie */
#endif
struct list_head pending_opens; /* list of incomplete opens */
struct cached_fids *cfids;
@@ -1488,50 +1491,30 @@ struct cifs_aio_ctx {
bool direct_io;
};
-/* asynchronous read support */
-struct cifs_readdata {
- struct kref refcount;
- struct list_head list;
- struct completion done;
+struct cifs_io_request {
+ struct netfs_io_request rreq;
struct cifsFileInfo *cfile;
- struct address_space *mapping;
- struct cifs_aio_ctx *ctx;
- __u64 offset;
- ssize_t got_bytes;
- unsigned int bytes;
- pid_t pid;
- int result;
- struct work_struct work;
- struct iov_iter iter;
- struct kvec iov[2];
- struct TCP_Server_Info *server;
-#ifdef CONFIG_CIFS_SMB_DIRECT
- struct smbd_mr *mr;
-#endif
- struct cifs_credits credits;
};
-/* asynchronous write support */
-struct cifs_writedata {
- struct kref refcount;
- struct list_head list;
- struct completion done;
- enum writeback_sync_modes sync_mode;
- struct work_struct work;
- struct cifsFileInfo *cfile;
- struct cifs_aio_ctx *ctx;
- struct iov_iter iter;
- struct bio_vec *bv;
- __u64 offset;
+/* asynchronous read support */
+struct cifs_io_subrequest {
+ union {
+ struct netfs_io_subrequest subreq;
+ struct netfs_io_request *rreq;
+ struct cifs_io_request *req;
+ };
+ ssize_t got_bytes;
pid_t pid;
- unsigned int bytes;
+ unsigned int xid;
int result;
+ bool have_xid;
+ bool replay;
+ struct kvec iov[2];
struct TCP_Server_Info *server;
#ifdef CONFIG_CIFS_SMB_DIRECT
struct smbd_mr *mr;
#endif
struct cifs_credits credits;
- bool replay;
};
/*
@@ -2111,6 +2094,8 @@ extern __u32 cifs_lock_secret;
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
+extern mempool_t cifs_io_request_pool;
+extern mempool_t cifs_io_subrequest_pool;
/* Operations for different SMB versions */
#define SMB1_VERSION_STRING "1.0"
diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h
index c0513fbb8a59..c46d418c1c0c 100644
--- a/fs/smb/client/cifspdu.h
+++ b/fs/smb/client/cifspdu.h
@@ -882,7 +882,7 @@ typedef struct smb_com_open_rsp {
__u8 OplockLevel;
__u16 Fid;
__le32 CreateAction;
- struct_group(common_attributes,
+ struct_group_attr(common_attributes, __packed,
__le64 CreationTime;
__le64 LastAccessTime;
__le64 LastWriteTime;
@@ -2266,7 +2266,7 @@ typedef struct {
/* QueryFileInfo/QueryPathinfo (also for SetPath/SetFile) data buffer formats */
/******************************************************************************/
typedef struct { /* data block encoding of response to level 263 QPathInfo */
- struct_group(common_attributes,
+ struct_group_attr(common_attributes, __packed,
__le64 CreationTime;
__le64 LastAccessTime;
__le64 LastWriteTime;
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 8e0a348f1f66..c15bb5ee7eb7 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -121,7 +121,7 @@ extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *,
extern int cifs_check_receive(struct mid_q_entry *mid,
struct TCP_Server_Info *server, bool log_error);
extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server,
- unsigned int size, unsigned int *num,
+ size_t size, size_t *num,
struct cifs_credits *credits);
extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
struct kvec *, int /* nvec to send */,
@@ -148,6 +148,8 @@ extern bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 eof,
bool from_readdir);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
+void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result,
+ bool was_async);
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, int);
extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode,
int flags,
@@ -303,7 +305,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
struct TCP_Server_Info *primary_server);
extern void cifs_put_tcp_session(struct TCP_Server_Info *server,
int from_reconnect);
-extern void cifs_put_tcon(struct cifs_tcon *tcon);
+extern void cifs_put_tcon(struct cifs_tcon *tcon, enum smb3_tcon_ref_trace trace);
extern void cifs_release_automount_timer(void);
@@ -530,8 +532,9 @@ extern int CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses);
extern struct cifs_ses *sesInfoAlloc(void);
extern void sesInfoFree(struct cifs_ses *);
-extern struct cifs_tcon *tcon_info_alloc(bool dir_leases_enabled);
-extern void tconInfoFree(struct cifs_tcon *);
+extern struct cifs_tcon *tcon_info_alloc(bool dir_leases_enabled,
+ enum smb3_tcon_ref_trace trace);
+extern void tconInfoFree(struct cifs_tcon *tcon, enum smb3_tcon_ref_trace trace);
extern int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
__u32 *pexpected_response_sequence_number);
@@ -598,15 +601,11 @@ void __cifs_put_smb_ses(struct cifs_ses *ses);
extern struct cifs_ses *
cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx);
-void cifs_readdata_release(struct kref *refcount);
-int cifs_async_readv(struct cifs_readdata *rdata);
+int cifs_async_readv(struct cifs_io_subrequest *rdata);
int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid);
-int cifs_async_writev(struct cifs_writedata *wdata,
- void (*release)(struct kref *kref));
+void cifs_async_writev(struct cifs_io_subrequest *wdata);
void cifs_writev_complete(struct work_struct *work);
-struct cifs_writedata *cifs_writedata_alloc(work_func_t complete);
-void cifs_writedata_release(struct kref *refcount);
int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const unsigned char *path, char *pbuf,
@@ -721,8 +720,6 @@ static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options)
return options;
}
-struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon);
-void cifs_put_tcon_super(struct super_block *sb);
int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry);
/* Put references of @ses and its children */
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 23b5709ddc31..25e9ab947c17 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -24,6 +24,8 @@
#include <linux/swap.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uaccess.h>
+#include <linux/netfs.h>
+#include <trace/events/netfs.h>
#include "cifspdu.h"
#include "cifsfs.h"
#include "cifsglob.h"
@@ -1262,18 +1264,17 @@ openRetry:
static void
cifs_readv_callback(struct mid_q_entry *mid)
{
- struct cifs_readdata *rdata = mid->callback_data;
- struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+ struct cifs_io_subrequest *rdata = mid->callback_data;
+ struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2,
- .rq_iter_size = iov_iter_count(&rdata->iter),
- .rq_iter = rdata->iter };
+ .rq_iter = rdata->subreq.io_iter };
struct cifs_credits credits = { .value = 1, .instance = 0 };
- cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
+ cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n",
__func__, mid->mid, mid->mid_state, rdata->result,
- rdata->bytes);
+ rdata->subreq.len);
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
@@ -1305,30 +1306,36 @@ cifs_readv_callback(struct mid_q_entry *mid)
rdata->result = -EIO;
}
- queue_work(cifsiod_wq, &rdata->work);
+ if (rdata->result == 0 || rdata->result == -EAGAIN)
+ iov_iter_advance(&rdata->subreq.io_iter, rdata->got_bytes);
+ rdata->credits.value = 0;
+ netfs_subreq_terminated(&rdata->subreq,
+ (rdata->result == 0 || rdata->result == -EAGAIN) ?
+ rdata->got_bytes : rdata->result,
+ false);
release_mid(mid);
add_credits(server, &credits, 0);
}
/* cifs_async_readv - send an async write, and set up mid to handle result */
int
-cifs_async_readv(struct cifs_readdata *rdata)
+cifs_async_readv(struct cifs_io_subrequest *rdata)
{
int rc;
READ_REQ *smb = NULL;
int wct;
- struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+ struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink);
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2 };
- cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
- __func__, rdata->offset, rdata->bytes);
+ cifs_dbg(FYI, "%s: offset=%llu bytes=%zu\n",
+ __func__, rdata->subreq.start, rdata->subreq.len);
if (tcon->ses->capabilities & CAP_LARGE_FILES)
wct = 12;
else {
wct = 10; /* old style read */
- if ((rdata->offset >> 32) > 0) {
+ if ((rdata->subreq.start >> 32) > 0) {
/* can not handle this big offset for old */
return -EIO;
}
@@ -1342,13 +1349,13 @@ cifs_async_readv(struct cifs_readdata *rdata)
smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16));
smb->AndXCommand = 0xFF; /* none */
- smb->Fid = rdata->cfile->fid.netfid;
- smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF);
+ smb->Fid = rdata->req->cfile->fid.netfid;
+ smb->OffsetLow = cpu_to_le32(rdata->subreq.start & 0xFFFFFFFF);
if (wct == 12)
- smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32);
+ smb->OffsetHigh = cpu_to_le32(rdata->subreq.start >> 32);
smb->Remaining = 0;
- smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF);
- smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16);
+ smb->MaxCount = cpu_to_le16(rdata->subreq.len & 0xFFFF);
+ smb->MaxCountHigh = cpu_to_le32(rdata->subreq.len >> 16);
if (wct == 12)
smb->ByteCount = 0;
else {
@@ -1364,15 +1371,11 @@ cifs_async_readv(struct cifs_readdata *rdata)
rdata->iov[1].iov_base = (char *)smb + 4;
rdata->iov[1].iov_len = get_rfc1002_length(smb);
- kref_get(&rdata->refcount);
rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive,
cifs_readv_callback, NULL, rdata, 0, NULL);
if (rc == 0)
cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
- else
- kref_put(&rdata->refcount, cifs_readdata_release);
-
cifs_small_buf_release(smb);
return rc;
}
@@ -1615,16 +1618,17 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
static void
cifs_writev_callback(struct mid_q_entry *mid)
{
- struct cifs_writedata *wdata = mid->callback_data;
- struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
- unsigned int written;
+ struct cifs_io_subrequest *wdata = mid->callback_data;
+ struct cifs_tcon *tcon = tlink_tcon(wdata->req->cfile->tlink);
WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
struct cifs_credits credits = { .value = 1, .instance = 0 };
+ ssize_t result;
+ size_t written;
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- wdata->result = cifs_check_receive(mid, tcon->ses->server, 0);
- if (wdata->result != 0)
+ result = cifs_check_receive(mid, tcon->ses->server, 0);
+ if (result != 0)
break;
written = le16_to_cpu(smb->CountHigh);
@@ -1636,37 +1640,37 @@ cifs_writev_callback(struct mid_q_entry *mid)
* client. OS/2 servers are known to set incorrect
* CountHigh values.
*/
- if (written > wdata->bytes)
+ if (written > wdata->subreq.len)
written &= 0xFFFF;
- if (written < wdata->bytes)
- wdata->result = -ENOSPC;
+ if (written < wdata->subreq.len)
+ result = -ENOSPC;
else
- wdata->bytes = written;
+ result = written;
break;
case MID_REQUEST_SUBMITTED:
case MID_RETRY_NEEDED:
- wdata->result = -EAGAIN;
+ result = -EAGAIN;
break;
default:
- wdata->result = -EIO;
+ result = -EIO;
break;
}
- queue_work(cifsiod_wq, &wdata->work);
+ wdata->credits.value = 0;
+ cifs_write_subrequest_terminated(wdata, result, true);
release_mid(mid);
add_credits(tcon->ses->server, &credits, 0);
}
/* cifs_async_writev - send an async write, and set up mid to handle result */
-int
-cifs_async_writev(struct cifs_writedata *wdata,
- void (*release)(struct kref *kref))
+void
+cifs_async_writev(struct cifs_io_subrequest *wdata)
{
int rc = -EACCES;
WRITE_REQ *smb = NULL;
int wct;
- struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct cifs_tcon *tcon = tlink_tcon(wdata->req->cfile->tlink);
struct kvec iov[2];
struct smb_rqst rqst = { };
@@ -1674,9 +1678,10 @@ cifs_async_writev(struct cifs_writedata *wdata,
wct = 14;
} else {
wct = 12;
- if (wdata->offset >> 32 > 0) {
+ if (wdata->subreq.start >> 32 > 0) {
/* can not handle big offset for old srv */
- return -EIO;
+ rc = -EIO;
+ goto out;
}
}
@@ -1688,10 +1693,10 @@ cifs_async_writev(struct cifs_writedata *wdata,
smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16));
smb->AndXCommand = 0xFF; /* none */
- smb->Fid = wdata->cfile->fid.netfid;
- smb->OffsetLow = cpu_to_le32(wdata->offset & 0xFFFFFFFF);
+ smb->Fid = wdata->req->cfile->fid.netfid;
+ smb->OffsetLow = cpu_to_le32(wdata->subreq.start & 0xFFFFFFFF);
if (wct == 14)
- smb->OffsetHigh = cpu_to_le32(wdata->offset >> 32);
+ smb->OffsetHigh = cpu_to_le32(wdata->subreq.start >> 32);
smb->Reserved = 0xFFFFFFFF;
smb->WriteMode = 0;
smb->Remaining = 0;
@@ -1707,39 +1712,40 @@ cifs_async_writev(struct cifs_writedata *wdata,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
- rqst.rq_iter = wdata->iter;
- rqst.rq_iter_size = iov_iter_count(&wdata->iter);
+ rqst.rq_iter = wdata->subreq.io_iter;
+ rqst.rq_iter_size = iov_iter_count(&wdata->subreq.io_iter);
- cifs_dbg(FYI, "async write at %llu %u bytes\n",
- wdata->offset, wdata->bytes);
+ cifs_dbg(FYI, "async write at %llu %zu bytes\n",
+ wdata->subreq.start, wdata->subreq.len);
- smb->DataLengthLow = cpu_to_le16(wdata->bytes & 0xFFFF);
- smb->DataLengthHigh = cpu_to_le16(wdata->bytes >> 16);
+ smb->DataLengthLow = cpu_to_le16(wdata->subreq.len & 0xFFFF);
+ smb->DataLengthHigh = cpu_to_le16(wdata->subreq.len >> 16);
if (wct == 14) {
- inc_rfc1001_len(&smb->hdr, wdata->bytes + 1);
- put_bcc(wdata->bytes + 1, &smb->hdr);
+ inc_rfc1001_len(&smb->hdr, wdata->subreq.len + 1);
+ put_bcc(wdata->subreq.len + 1, &smb->hdr);
} else {
/* wct == 12 */
struct smb_com_writex_req *smbw =
(struct smb_com_writex_req *)smb;
- inc_rfc1001_len(&smbw->hdr, wdata->bytes + 5);
- put_bcc(wdata->bytes + 5, &smbw->hdr);
+ inc_rfc1001_len(&smbw->hdr, wdata->subreq.len + 5);
+ put_bcc(wdata->subreq.len + 5, &smbw->hdr);
iov[1].iov_len += 4; /* pad bigger by four bytes */
}
- kref_get(&wdata->refcount);
rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
cifs_writev_callback, NULL, wdata, 0, NULL);
-
+ /* Can't touch wdata if rc == 0 */
if (rc == 0)
cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
- else
- kref_put(&wdata->refcount, release);
async_writev_out:
cifs_small_buf_release(smb);
- return rc;
+out:
+ if (rc) {
+ add_credits_and_wake_if(wdata->server, &wdata->credits, 0);
+ cifs_write_subrequest_terminated(wdata, rc, false);
+ }
}
int
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 85679ae106fd..7a16e12f5da8 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -1943,7 +1943,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
}
/* no need to setup directory caching on IPC share, so pass in false */
- tcon = tcon_info_alloc(false);
+ tcon = tcon_info_alloc(false, netfs_trace_tcon_ref_new_ipc);
if (tcon == NULL)
return -ENOMEM;
@@ -1960,7 +1960,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
if (rc) {
cifs_server_dbg(VFS, "failed to connect to IPC (rc=%d)\n", rc);
- tconInfoFree(tcon);
+ tconInfoFree(tcon, netfs_trace_tcon_ref_free_ipc_fail);
goto out;
}
@@ -2043,7 +2043,7 @@ void __cifs_put_smb_ses(struct cifs_ses *ses)
* files on session close, as specified in MS-SMB2 3.3.5.6 Receiving an
* SMB2 LOGOFF Request.
*/
- tconInfoFree(tcon);
+ tconInfoFree(tcon, netfs_trace_tcon_ref_free_ipc);
if (do_logoff) {
xid = get_xid();
rc = server->ops->logoff(xid, ses);
@@ -2183,6 +2183,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses)
}
++delim;
+ /* BB consider adding support for password2 (Key Rotation) for multiuser in future */
ctx->password = kstrndup(delim, len, GFP_KERNEL);
if (!ctx->password) {
cifs_dbg(FYI, "Unable to allocate %zd bytes for password\n",
@@ -2206,6 +2207,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses)
kfree(ctx->username);
ctx->username = NULL;
kfree_sensitive(ctx->password);
+ /* no need to free ctx->password2 since not allocated in this path */
ctx->password = NULL;
goto out_key_put;
}
@@ -2317,6 +2319,12 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
if (!ses->password)
goto get_ses_fail;
}
+ /* ctx->password freed at unmount */
+ if (ctx->password2) {
+ ses->password2 = kstrdup(ctx->password2, GFP_KERNEL);
+ if (!ses->password2)
+ goto get_ses_fail;
+ }
if (ctx->domainname) {
ses->domainName = kstrdup(ctx->domainname, GFP_KERNEL);
if (!ses->domainName)
@@ -2424,6 +2432,8 @@ cifs_find_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
continue;
}
++tcon->tc_count;
+ trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count,
+ netfs_trace_tcon_ref_get_find);
spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
return tcon;
@@ -2433,7 +2443,7 @@ cifs_find_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
}
void
-cifs_put_tcon(struct cifs_tcon *tcon)
+cifs_put_tcon(struct cifs_tcon *tcon, enum smb3_tcon_ref_trace trace)
{
unsigned int xid;
struct cifs_ses *ses;
@@ -2449,6 +2459,7 @@ cifs_put_tcon(struct cifs_tcon *tcon)
cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
spin_lock(&cifs_tcp_ses_lock);
spin_lock(&tcon->tc_lock);
+ trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count - 1, trace);
if (--tcon->tc_count > 0) {
spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
@@ -2485,7 +2496,7 @@ cifs_put_tcon(struct cifs_tcon *tcon)
_free_xid(xid);
cifs_fscache_release_super_cookie(tcon);
- tconInfoFree(tcon);
+ tconInfoFree(tcon, netfs_trace_tcon_ref_free);
cifs_put_smb_ses(ses);
}
@@ -2539,7 +2550,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
nohandlecache = ctx->nohandlecache;
else
nohandlecache = true;
- tcon = tcon_info_alloc(!nohandlecache);
+ tcon = tcon_info_alloc(!nohandlecache, netfs_trace_tcon_ref_new);
if (tcon == NULL) {
rc = -ENOMEM;
goto out_fail;
@@ -2729,7 +2740,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
return tcon;
out_fail:
- tconInfoFree(tcon);
+ tconInfoFree(tcon, netfs_trace_tcon_ref_free_fail);
return ERR_PTR(rc);
}
@@ -2746,7 +2757,7 @@ cifs_put_tlink(struct tcon_link *tlink)
}
if (!IS_ERR(tlink_tcon(tlink)))
- cifs_put_tcon(tlink_tcon(tlink));
+ cifs_put_tcon(tlink_tcon(tlink), netfs_trace_tcon_ref_put_tlink);
kfree(tlink);
}
@@ -3311,7 +3322,7 @@ void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx)
int rc = 0;
if (mnt_ctx->tcon)
- cifs_put_tcon(mnt_ctx->tcon);
+ cifs_put_tcon(mnt_ctx->tcon, netfs_trace_tcon_ref_put_mnt_ctx);
else if (mnt_ctx->ses)
cifs_put_smb_ses(mnt_ctx->ses);
else if (mnt_ctx->server)
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 9be37d0fe724..4c981ce89f8a 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -36,133 +36,323 @@
#include "fs_context.h"
#include "cifs_ioctl.h"
#include "cached_dir.h"
+#include <trace/events/netfs.h>
+
+static int cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush);
/*
- * Remove the dirty flags from a span of pages.
+ * Prepare a subrequest to upload to the server. We need to allocate credits
+ * so that we know the maximum amount of data that we can include in it.
*/
-static void cifs_undirty_folios(struct inode *inode, loff_t start, unsigned int len)
+static void cifs_prepare_write(struct netfs_io_subrequest *subreq)
{
- struct address_space *mapping = inode->i_mapping;
- struct folio *folio;
- pgoff_t end;
+ struct cifs_io_subrequest *wdata =
+ container_of(subreq, struct cifs_io_subrequest, subreq);
+ struct cifs_io_request *req = wdata->req;
+ struct TCP_Server_Info *server;
+ struct cifsFileInfo *open_file = req->cfile;
+ size_t wsize = req->rreq.wsize;
+ int rc;
- XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
+ if (!wdata->have_xid) {
+ wdata->xid = get_xid();
+ wdata->have_xid = true;
+ }
- rcu_read_lock();
+ server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
+ wdata->server = server;
- end = (start + len - 1) / PAGE_SIZE;
- xas_for_each_marked(&xas, folio, end, PAGECACHE_TAG_DIRTY) {
- if (xas_retry(&xas, folio))
- continue;
- xas_pause(&xas);
- rcu_read_unlock();
- folio_lock(folio);
- folio_clear_dirty_for_io(folio);
- folio_unlock(folio);
- rcu_read_lock();
+retry:
+ if (open_file->invalidHandle) {
+ rc = cifs_reopen_file(open_file, false);
+ if (rc < 0) {
+ if (rc == -EAGAIN)
+ goto retry;
+ subreq->error = rc;
+ return netfs_prepare_write_failed(subreq);
+ }
+ }
+
+ rc = server->ops->wait_mtu_credits(server, wsize, &wdata->subreq.max_len,
+ &wdata->credits);
+ if (rc < 0) {
+ subreq->error = rc;
+ return netfs_prepare_write_failed(subreq);
}
- rcu_read_unlock();
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (server->smbd_conn)
+ subreq->max_nr_segs = server->smbd_conn->max_frmr_depth;
+#endif
}
/*
- * Completion of write to server.
+ * Issue a subrequest to upload to the server.
*/
-void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len)
+static void cifs_issue_write(struct netfs_io_subrequest *subreq)
{
- struct address_space *mapping = inode->i_mapping;
- struct folio *folio;
- pgoff_t end;
+ struct cifs_io_subrequest *wdata =
+ container_of(subreq, struct cifs_io_subrequest, subreq);
+ struct cifs_sb_info *sbi = CIFS_SB(subreq->rreq->inode->i_sb);
+ int rc;
- XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
+ if (cifs_forced_shutdown(sbi)) {
+ rc = -EIO;
+ goto fail;
+ }
- if (!len)
- return;
+ rc = adjust_credits(wdata->server, &wdata->credits, wdata->subreq.len);
+ if (rc)
+ goto fail;
- rcu_read_lock();
+ rc = -EAGAIN;
+ if (wdata->req->cfile->invalidHandle)
+ goto fail;
- end = (start + len - 1) / PAGE_SIZE;
- xas_for_each(&xas, folio, end) {
- if (xas_retry(&xas, folio))
- continue;
- if (!folio_test_writeback(folio)) {
- WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio->index, end);
- continue;
- }
+ wdata->server->ops->async_writev(wdata);
+out:
+ return;
+
+fail:
+ if (rc == -EAGAIN)
+ trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+ else
+ trace_netfs_sreq(subreq, netfs_sreq_trace_fail);
+ add_credits_and_wake_if(wdata->server, &wdata->credits, 0);
+ cifs_write_subrequest_terminated(wdata, rc, false);
+ goto out;
+}
+
+/*
+ * Split the read up according to how many credits we can get for each piece.
+ * It's okay to sleep here if we need to wait for more credit to become
+ * available.
+ *
+ * We also choose the server and allocate an operation ID to be cleaned up
+ * later.
+ */
+static bool cifs_clamp_length(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct TCP_Server_Info *server;
+ struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
+ struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
+ size_t rsize = 0;
+ int rc;
+
+ rdata->xid = get_xid();
+ rdata->have_xid = true;
+
+ server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses);
+ rdata->server = server;
+
+ if (cifs_sb->ctx->rsize == 0)
+ cifs_sb->ctx->rsize =
+ server->ops->negotiate_rsize(tlink_tcon(req->cfile->tlink),
+ cifs_sb->ctx);
- folio_detach_private(folio);
- folio_end_writeback(folio);
+
+ rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize,
+ &rdata->credits);
+ if (rc) {
+ subreq->error = rc;
+ return false;
}
- rcu_read_unlock();
+ subreq->len = min_t(size_t, subreq->len, rsize);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (server->smbd_conn)
+ subreq->max_nr_segs = server->smbd_conn->max_frmr_depth;
+#endif
+ return true;
}
/*
- * Failure of write to server.
+ * Issue a read operation on behalf of the netfs helper functions. We're asked
+ * to make a read of a certain size at a point in the file. We are permitted
+ * to only read a portion of that, but as long as we read something, the netfs
+ * helper will call us again so that we can issue another read.
*/
-void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len)
+static void cifs_req_issue_read(struct netfs_io_subrequest *subreq)
{
- struct address_space *mapping = inode->i_mapping;
- struct folio *folio;
- pgoff_t end;
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
+ struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
+ pid_t pid;
+ int rc = 0;
- XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+ pid = req->cfile->pid;
+ else
+ pid = current->tgid; // Ummm... This may be a workqueue
- if (!len)
- return;
+ cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n",
+ __func__, rreq->debug_id, subreq->debug_index, rreq->mapping,
+ subreq->transferred, subreq->len);
- rcu_read_lock();
+ if (req->cfile->invalidHandle) {
+ do {
+ rc = cifs_reopen_file(req->cfile, true);
+ } while (rc == -EAGAIN);
+ if (rc)
+ goto out;
+ }
- end = (start + len - 1) / PAGE_SIZE;
- xas_for_each(&xas, folio, end) {
- if (xas_retry(&xas, folio))
- continue;
- if (!folio_test_writeback(folio)) {
- WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio->index, end);
- continue;
- }
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ rdata->pid = pid;
- folio_set_error(folio);
- folio_end_writeback(folio);
+ rc = adjust_credits(rdata->server, &rdata->credits, rdata->subreq.len);
+ if (!rc) {
+ if (rdata->req->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = rdata->server->ops->async_readv(rdata);
}
- rcu_read_unlock();
+out:
+ if (rc)
+ netfs_subreq_terminated(subreq, rc, false);
}
/*
- * Redirty pages after a temporary failure.
+ * Writeback calls this when it finds a folio that needs uploading. This isn't
+ * called if writeback only has copy-to-cache to deal with.
*/
-void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int len)
+static void cifs_begin_writeback(struct netfs_io_request *wreq)
{
- struct address_space *mapping = inode->i_mapping;
- struct folio *folio;
- pgoff_t end;
-
- XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
+ struct cifs_io_request *req = container_of(wreq, struct cifs_io_request, rreq);
+ int ret;
- if (!len)
+ ret = cifs_get_writable_file(CIFS_I(wreq->inode), FIND_WR_ANY, &req->cfile);
+ if (ret) {
+ cifs_dbg(VFS, "No writable handle in writepages ret=%d\n", ret);
return;
+ }
- rcu_read_lock();
+ wreq->io_streams[0].avail = true;
+}
- end = (start + len - 1) / PAGE_SIZE;
- xas_for_each(&xas, folio, end) {
- if (!folio_test_writeback(folio)) {
- WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio->index, end);
- continue;
- }
+/*
+ * Initialise a request.
+ */
+static int cifs_init_request(struct netfs_io_request *rreq, struct file *file)
+{
+ struct cifs_io_request *req = container_of(rreq, struct cifs_io_request, rreq);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
+ struct cifsFileInfo *open_file = NULL;
- filemap_dirty_folio(folio->mapping, folio);
- folio_end_writeback(folio);
+ rreq->rsize = cifs_sb->ctx->rsize;
+ rreq->wsize = cifs_sb->ctx->wsize;
+
+ if (file) {
+ open_file = file->private_data;
+ rreq->netfs_priv = file->private_data;
+ req->cfile = cifsFileInfo_get(open_file);
+ } else if (rreq->origin != NETFS_WRITEBACK) {
+ WARN_ON_ONCE(1);
+ return -EIO;
}
- rcu_read_unlock();
+ return 0;
}
/*
+ * Expand the size of a readahead to the size of the rsize, if at least as
+ * large as a page, allowing for the possibility that rsize is not pow-2
+ * aligned.
+ */
+static void cifs_expand_readahead(struct netfs_io_request *rreq)
+{
+ unsigned int rsize = rreq->rsize;
+ loff_t misalignment, i_size = i_size_read(rreq->inode);
+
+ if (rsize < PAGE_SIZE)
+ return;
+
+ if (rsize < INT_MAX)
+ rsize = roundup_pow_of_two(rsize);
+ else
+ rsize = ((unsigned int)INT_MAX + 1) / 2;
+
+ misalignment = rreq->start & (rsize - 1);
+ if (misalignment) {
+ rreq->start -= misalignment;
+ rreq->len += misalignment;
+ }
+
+ rreq->len = round_up(rreq->len, rsize);
+ if (rreq->start < i_size && rreq->len > i_size - rreq->start)
+ rreq->len = i_size - rreq->start;
+}
+
+/*
+ * Completion of a request operation.
+ */
+static void cifs_rreq_done(struct netfs_io_request *rreq)
+{
+ struct timespec64 atime, mtime;
+ struct inode *inode = rreq->inode;
+
+ /* we do not want atime to be less than mtime, it broke some apps */
+ atime = inode_set_atime_to_ts(inode, current_time(inode));
+ mtime = inode_get_mtime(inode);
+ if (timespec64_compare(&atime, &mtime))
+ inode_set_atime_to_ts(inode, inode_get_mtime(inode));
+}
+
+static void cifs_post_modify(struct inode *inode)
+{
+ /* Indication to update ctime and mtime as close is deferred */
+ set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
+}
+
+static void cifs_free_request(struct netfs_io_request *rreq)
+{
+ struct cifs_io_request *req = container_of(rreq, struct cifs_io_request, rreq);
+
+ if (req->cfile)
+ cifsFileInfo_put(req->cfile);
+}
+
+static void cifs_free_subrequest(struct netfs_io_subrequest *subreq)
+{
+ struct cifs_io_subrequest *rdata =
+ container_of(subreq, struct cifs_io_subrequest, subreq);
+ int rc = subreq->error;
+
+ if (rdata->subreq.source == NETFS_DOWNLOAD_FROM_SERVER) {
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (rdata->mr) {
+ smbd_deregister_mr(rdata->mr);
+ rdata->mr = NULL;
+ }
+#endif
+ }
+
+ add_credits_and_wake_if(rdata->server, &rdata->credits, 0);
+ if (rdata->have_xid)
+ free_xid(rdata->xid);
+}
+
+const struct netfs_request_ops cifs_req_ops = {
+ .request_pool = &cifs_io_request_pool,
+ .subrequest_pool = &cifs_io_subrequest_pool,
+ .init_request = cifs_init_request,
+ .free_request = cifs_free_request,
+ .free_subrequest = cifs_free_subrequest,
+ .expand_readahead = cifs_expand_readahead,
+ .clamp_length = cifs_clamp_length,
+ .issue_read = cifs_req_issue_read,
+ .done = cifs_rreq_done,
+ .post_modify = cifs_post_modify,
+ .begin_writeback = cifs_begin_writeback,
+ .prepare_write = cifs_prepare_write,
+ .issue_write = cifs_issue_write,
+};
+
+/*
* Mark as invalid, all open files on tree connections since they
* were closed when session to server was lost.
*/
@@ -2207,102 +2397,20 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
return rc;
}
-/*
- * update the file size (if needed) after a write. Should be called with
- * the inode->i_lock held
- */
-void
-cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
- unsigned int bytes_written)
-{
- loff_t end_of_write = offset + bytes_written;
-
- if (end_of_write > cifsi->netfs.remote_i_size)
- netfs_resize_file(&cifsi->netfs, end_of_write, true);
-}
-
-static ssize_t
-cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
- size_t write_size, loff_t *offset)
+void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result,
+ bool was_async)
{
- int rc = 0;
- unsigned int bytes_written = 0;
- unsigned int total_written;
- struct cifs_tcon *tcon;
- struct TCP_Server_Info *server;
- unsigned int xid;
- struct dentry *dentry = open_file->dentry;
- struct cifsInodeInfo *cifsi = CIFS_I(d_inode(dentry));
- struct cifs_io_parms io_parms = {0};
+ struct netfs_io_request *wreq = wdata->rreq;
+ loff_t new_server_eof;
- cifs_dbg(FYI, "write %zd bytes to offset %lld of %pd\n",
- write_size, *offset, dentry);
+ if (result > 0) {
+ new_server_eof = wdata->subreq.start + wdata->subreq.transferred + result;
- tcon = tlink_tcon(open_file->tlink);
- server = tcon->ses->server;
-
- if (!server->ops->sync_write)
- return -ENOSYS;
-
- xid = get_xid();
-
- for (total_written = 0; write_size > total_written;
- total_written += bytes_written) {
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
- struct kvec iov[2];
- unsigned int len;
-
- if (open_file->invalidHandle) {
- /* we could deadlock if we called
- filemap_fdatawait from here so tell
- reopen_file not to flush data to
- server now */
- rc = cifs_reopen_file(open_file, false);
- if (rc != 0)
- break;
- }
-
- len = min(server->ops->wp_retry_size(d_inode(dentry)),
- (unsigned int)write_size - total_written);
- /* iov[0] is reserved for smb header */
- iov[1].iov_base = (char *)write_data + total_written;
- iov[1].iov_len = len;
- io_parms.pid = pid;
- io_parms.tcon = tcon;
- io_parms.offset = *offset;
- io_parms.length = len;
- rc = server->ops->sync_write(xid, &open_file->fid,
- &io_parms, &bytes_written, iov, 1);
- }
- if (rc || (bytes_written == 0)) {
- if (total_written)
- break;
- else {
- free_xid(xid);
- return rc;
- }
- } else {
- spin_lock(&d_inode(dentry)->i_lock);
- cifs_update_eof(cifsi, *offset, bytes_written);
- spin_unlock(&d_inode(dentry)->i_lock);
- *offset += bytes_written;
- }
+ if (new_server_eof > netfs_inode(wreq->inode)->remote_i_size)
+ netfs_resize_file(netfs_inode(wreq->inode), new_server_eof, true);
}
- cifs_stats_bytes_written(tcon, total_written);
-
- if (total_written > 0) {
- spin_lock(&d_inode(dentry)->i_lock);
- if (*offset > d_inode(dentry)->i_size) {
- i_size_write(d_inode(dentry), *offset);
- d_inode(dentry)->i_blocks = (512 - 1 + *offset) >> 9;
- }
- spin_unlock(&d_inode(dentry)->i_lock);
- }
- mark_inode_dirty_sync(d_inode(dentry));
- free_xid(xid);
- return total_written;
+ netfs_write_subrequest_terminated(&wdata->subreq, result, was_async);
}
struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
@@ -2509,737 +2617,9 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
return -ENOENT;
}
-void
-cifs_writedata_release(struct kref *refcount)
-{
- struct cifs_writedata *wdata = container_of(refcount,
- struct cifs_writedata, refcount);
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (wdata->mr) {
- smbd_deregister_mr(wdata->mr);
- wdata->mr = NULL;
- }
-#endif
-
- if (wdata->cfile)
- cifsFileInfo_put(wdata->cfile);
-
- kfree(wdata);
-}
-
-/*
- * Write failed with a retryable error. Resend the write request. It's also
- * possible that the page was redirtied so re-clean the page.
- */
-static void
-cifs_writev_requeue(struct cifs_writedata *wdata)
-{
- int rc = 0;
- struct inode *inode = d_inode(wdata->cfile->dentry);
- struct TCP_Server_Info *server;
- unsigned int rest_len = wdata->bytes;
- loff_t fpos = wdata->offset;
-
- server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- do {
- struct cifs_writedata *wdata2;
- unsigned int wsize, cur_len;
-
- wsize = server->ops->wp_retry_size(inode);
- if (wsize < rest_len) {
- if (wsize < PAGE_SIZE) {
- rc = -EOPNOTSUPP;
- break;
- }
- cur_len = min(round_down(wsize, PAGE_SIZE), rest_len);
- } else {
- cur_len = rest_len;
- }
-
- wdata2 = cifs_writedata_alloc(cifs_writev_complete);
- if (!wdata2) {
- rc = -ENOMEM;
- break;
- }
-
- wdata2->sync_mode = wdata->sync_mode;
- wdata2->offset = fpos;
- wdata2->bytes = cur_len;
- wdata2->iter = wdata->iter;
-
- iov_iter_advance(&wdata2->iter, fpos - wdata->offset);
- iov_iter_truncate(&wdata2->iter, wdata2->bytes);
-
- if (iov_iter_is_xarray(&wdata2->iter))
- /* Check for pages having been redirtied and clean
- * them. We can do this by walking the xarray. If
- * it's not an xarray, then it's a DIO and we shouldn't
- * be mucking around with the page bits.
- */
- cifs_undirty_folios(inode, fpos, cur_len);
-
- rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY,
- &wdata2->cfile);
- if (!wdata2->cfile) {
- cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n",
- rc);
- if (!is_retryable_error(rc))
- rc = -EBADF;
- } else {
- wdata2->pid = wdata2->cfile->pid;
- rc = server->ops->async_writev(wdata2,
- cifs_writedata_release);
- }
-
- kref_put(&wdata2->refcount, cifs_writedata_release);
- if (rc) {
- if (is_retryable_error(rc))
- continue;
- fpos += cur_len;
- rest_len -= cur_len;
- break;
- }
-
- fpos += cur_len;
- rest_len -= cur_len;
- } while (rest_len > 0);
-
- /* Clean up remaining pages from the original wdata */
- if (iov_iter_is_xarray(&wdata->iter))
- cifs_pages_write_failed(inode, fpos, rest_len);
-
- if (rc != 0 && !is_retryable_error(rc))
- mapping_set_error(inode->i_mapping, rc);
- kref_put(&wdata->refcount, cifs_writedata_release);
-}
-
-void
-cifs_writev_complete(struct work_struct *work)
-{
- struct cifs_writedata *wdata = container_of(work,
- struct cifs_writedata, work);
- struct inode *inode = d_inode(wdata->cfile->dentry);
-
- if (wdata->result == 0) {
- spin_lock(&inode->i_lock);
- cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes);
- spin_unlock(&inode->i_lock);
- cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink),
- wdata->bytes);
- } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN)
- return cifs_writev_requeue(wdata);
-
- if (wdata->result == -EAGAIN)
- cifs_pages_write_redirty(inode, wdata->offset, wdata->bytes);
- else if (wdata->result < 0)
- cifs_pages_write_failed(inode, wdata->offset, wdata->bytes);
- else
- cifs_pages_written_back(inode, wdata->offset, wdata->bytes);
-
- if (wdata->result != -EAGAIN)
- mapping_set_error(inode->i_mapping, wdata->result);
- kref_put(&wdata->refcount, cifs_writedata_release);
-}
-
-struct cifs_writedata *cifs_writedata_alloc(work_func_t complete)
-{
- struct cifs_writedata *wdata;
-
- wdata = kzalloc(sizeof(*wdata), GFP_NOFS);
- if (wdata != NULL) {
- kref_init(&wdata->refcount);
- INIT_LIST_HEAD(&wdata->list);
- init_completion(&wdata->done);
- INIT_WORK(&wdata->work, complete);
- }
- return wdata;
-}
-
-static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
-{
- struct address_space *mapping = page->mapping;
- loff_t offset = (loff_t)page->index << PAGE_SHIFT;
- char *write_data;
- int rc = -EFAULT;
- int bytes_written = 0;
- struct inode *inode;
- struct cifsFileInfo *open_file;
-
- if (!mapping || !mapping->host)
- return -EFAULT;
-
- inode = page->mapping->host;
-
- offset += (loff_t)from;
- write_data = kmap(page);
- write_data += from;
-
- if ((to > PAGE_SIZE) || (from > to)) {
- kunmap(page);
- return -EIO;
- }
-
- /* racing with truncate? */
- if (offset > mapping->host->i_size) {
- kunmap(page);
- return 0; /* don't care */
- }
-
- /* check to make sure that we are not extending the file */
- if (mapping->host->i_size - offset < (loff_t)to)
- to = (unsigned)(mapping->host->i_size - offset);
-
- rc = cifs_get_writable_file(CIFS_I(mapping->host), FIND_WR_ANY,
- &open_file);
- if (!rc) {
- bytes_written = cifs_write(open_file, open_file->pid,
- write_data, to - from, &offset);
- cifsFileInfo_put(open_file);
- /* Does mm or vfs already set times? */
- simple_inode_init_ts(inode);
- if ((bytes_written > 0) && (offset))
- rc = 0;
- else if (bytes_written < 0)
- rc = bytes_written;
- else
- rc = -EFAULT;
- } else {
- cifs_dbg(FYI, "No writable handle for write page rc=%d\n", rc);
- if (!is_retryable_error(rc))
- rc = -EIO;
- }
-
- kunmap(page);
- return rc;
-}
-
/*
- * Extend the region to be written back to include subsequent contiguously
- * dirty pages if possible, but don't sleep while doing so.
+ * Flush data on a strict file.
*/
-static void cifs_extend_writeback(struct address_space *mapping,
- struct xa_state *xas,
- long *_count,
- loff_t start,
- int max_pages,
- loff_t max_len,
- size_t *_len)
-{
- struct folio_batch batch;
- struct folio *folio;
- unsigned int nr_pages;
- pgoff_t index = (start + *_len) / PAGE_SIZE;
- size_t len;
- bool stop = true;
- unsigned int i;
-
- folio_batch_init(&batch);
-
- do {
- /* Firstly, we gather up a batch of contiguous dirty pages
- * under the RCU read lock - but we can't clear the dirty flags
- * there if any of those pages are mapped.
- */
- rcu_read_lock();
-
- xas_for_each(xas, folio, ULONG_MAX) {
- stop = true;
- if (xas_retry(xas, folio))
- continue;
- if (xa_is_value(folio))
- break;
- if (folio->index != index) {
- xas_reset(xas);
- break;
- }
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(xas);
- continue;
- }
- nr_pages = folio_nr_pages(folio);
- if (nr_pages > max_pages) {
- xas_reset(xas);
- break;
- }
-
- /* Has the page moved or been split? */
- if (unlikely(folio != xas_reload(xas))) {
- folio_put(folio);
- xas_reset(xas);
- break;
- }
-
- if (!folio_trylock(folio)) {
- folio_put(folio);
- xas_reset(xas);
- break;
- }
- if (!folio_test_dirty(folio) ||
- folio_test_writeback(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- xas_reset(xas);
- break;
- }
-
- max_pages -= nr_pages;
- len = folio_size(folio);
- stop = false;
-
- index += nr_pages;
- *_count -= nr_pages;
- *_len += len;
- if (max_pages <= 0 || *_len >= max_len || *_count <= 0)
- stop = true;
-
- if (!folio_batch_add(&batch, folio))
- break;
- if (stop)
- break;
- }
-
- xas_pause(xas);
- rcu_read_unlock();
-
- /* Now, if we obtained any pages, we can shift them to being
- * writable and mark them for caching.
- */
- if (!folio_batch_count(&batch))
- break;
-
- for (i = 0; i < folio_batch_count(&batch); i++) {
- folio = batch.folios[i];
- /* The folio should be locked, dirty and not undergoing
- * writeback from the loop above.
- */
- if (!folio_clear_dirty_for_io(folio))
- WARN_ON(1);
- folio_start_writeback(folio);
- folio_unlock(folio);
- }
-
- folio_batch_release(&batch);
- cond_resched();
- } while (!stop);
-}
-
-/*
- * Write back the locked page and any subsequent non-locked dirty pages.
- */
-static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
- struct writeback_control *wbc,
- struct xa_state *xas,
- struct folio *folio,
- unsigned long long start,
- unsigned long long end)
-{
- struct inode *inode = mapping->host;
- struct TCP_Server_Info *server;
- struct cifs_writedata *wdata;
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- struct cifs_credits credits_on_stack;
- struct cifs_credits *credits = &credits_on_stack;
- struct cifsFileInfo *cfile = NULL;
- unsigned long long i_size = i_size_read(inode), max_len;
- unsigned int xid, wsize;
- size_t len = folio_size(folio);
- long count = wbc->nr_to_write;
- int rc;
-
- /* The folio should be locked, dirty and not undergoing writeback. */
- if (!folio_clear_dirty_for_io(folio))
- WARN_ON_ONCE(1);
- folio_start_writeback(folio);
-
- count -= folio_nr_pages(folio);
-
- xid = get_xid();
- server = cifs_pick_channel(cifs_sb_master_tcon(cifs_sb)->ses);
-
- rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile);
- if (rc) {
- cifs_dbg(VFS, "No writable handle in writepages rc=%d\n", rc);
- goto err_xid;
- }
-
- rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize,
- &wsize, credits);
- if (rc != 0)
- goto err_close;
-
- wdata = cifs_writedata_alloc(cifs_writev_complete);
- if (!wdata) {
- rc = -ENOMEM;
- goto err_uncredit;
- }
-
- wdata->sync_mode = wbc->sync_mode;
- wdata->offset = folio_pos(folio);
- wdata->pid = cfile->pid;
- wdata->credits = credits_on_stack;
- wdata->cfile = cfile;
- wdata->server = server;
- cfile = NULL;
-
- /* Find all consecutive lockable dirty pages that have contiguous
- * written regions, stopping when we find a page that is not
- * immediately lockable, is not dirty or is missing, or we reach the
- * end of the range.
- */
- if (start < i_size) {
- /* Trim the write to the EOF; the extra data is ignored. Also
- * put an upper limit on the size of a single storedata op.
- */
- max_len = wsize;
- max_len = min_t(unsigned long long, max_len, end - start + 1);
- max_len = min_t(unsigned long long, max_len, i_size - start);
-
- if (len < max_len) {
- int max_pages = INT_MAX;
-
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->smbd_conn)
- max_pages = server->smbd_conn->max_frmr_depth;
-#endif
- max_pages -= folio_nr_pages(folio);
-
- if (max_pages > 0)
- cifs_extend_writeback(mapping, xas, &count, start,
- max_pages, max_len, &len);
- }
- }
- len = min_t(unsigned long long, len, i_size - start);
-
- /* We now have a contiguous set of dirty pages, each with writeback
- * set; the first page is still locked at this point, but all the rest
- * have been unlocked.
- */
- folio_unlock(folio);
- wdata->bytes = len;
-
- if (start < i_size) {
- iov_iter_xarray(&wdata->iter, ITER_SOURCE, &mapping->i_pages,
- start, len);
-
- rc = adjust_credits(wdata->server, &wdata->credits, wdata->bytes);
- if (rc)
- goto err_wdata;
-
- if (wdata->cfile->invalidHandle)
- rc = -EAGAIN;
- else
- rc = wdata->server->ops->async_writev(wdata,
- cifs_writedata_release);
- if (rc >= 0) {
- kref_put(&wdata->refcount, cifs_writedata_release);
- goto err_close;
- }
- } else {
- /* The dirty region was entirely beyond the EOF. */
- cifs_pages_written_back(inode, start, len);
- rc = 0;
- }
-
-err_wdata:
- kref_put(&wdata->refcount, cifs_writedata_release);
-err_uncredit:
- add_credits_and_wake_if(server, credits, 0);
-err_close:
- if (cfile)
- cifsFileInfo_put(cfile);
-err_xid:
- free_xid(xid);
- if (rc == 0) {
- wbc->nr_to_write = count;
- rc = len;
- } else if (is_retryable_error(rc)) {
- cifs_pages_write_redirty(inode, start, len);
- } else {
- cifs_pages_write_failed(inode, start, len);
- mapping_set_error(mapping, rc);
- }
- /* Indication to update ctime and mtime as close is deferred */
- set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
- return rc;
-}
-
-/*
- * write a region of pages back to the server
- */
-static ssize_t cifs_writepages_begin(struct address_space *mapping,
- struct writeback_control *wbc,
- struct xa_state *xas,
- unsigned long long *_start,
- unsigned long long end)
-{
- struct folio *folio;
- unsigned long long start = *_start;
- ssize_t ret;
- int skips = 0;
-
-search_again:
- /* Find the first dirty page. */
- rcu_read_lock();
-
- for (;;) {
- folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
- if (xas_retry(xas, folio) || xa_is_value(folio))
- continue;
- if (!folio)
- break;
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(xas);
- continue;
- }
-
- if (unlikely(folio != xas_reload(xas))) {
- folio_put(folio);
- xas_reset(xas);
- continue;
- }
-
- xas_pause(xas);
- break;
- }
- rcu_read_unlock();
- if (!folio)
- return 0;
-
- start = folio_pos(folio); /* May regress with THPs */
-
- /* At this point we hold neither the i_pages lock nor the page lock:
- * the page may be truncated or invalidated (changing page->mapping to
- * NULL), or even swizzled back from swapper_space to tmpfs file
- * mapping
- */
-lock_again:
- if (wbc->sync_mode != WB_SYNC_NONE) {
- ret = folio_lock_killable(folio);
- if (ret < 0)
- return ret;
- } else {
- if (!folio_trylock(folio))
- goto search_again;
- }
-
- if (folio->mapping != mapping ||
- !folio_test_dirty(folio)) {
- start += folio_size(folio);
- folio_unlock(folio);
- goto search_again;
- }
-
- if (folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- if (wbc->sync_mode != WB_SYNC_NONE) {
- folio_wait_writeback(folio);
-#ifdef CONFIG_CIFS_FSCACHE
- folio_wait_fscache(folio);
-#endif
- goto lock_again;
- }
-
- start += folio_size(folio);
- if (wbc->sync_mode == WB_SYNC_NONE) {
- if (skips >= 5 || need_resched()) {
- ret = 0;
- goto out;
- }
- skips++;
- }
- goto search_again;
- }
-
- ret = cifs_write_back_from_locked_folio(mapping, wbc, xas, folio, start, end);
-out:
- if (ret > 0)
- *_start = start + ret;
- return ret;
-}
-
-/*
- * Write a region of pages back to the server
- */
-static int cifs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- unsigned long long *_start,
- unsigned long long end)
-{
- ssize_t ret;
-
- XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
-
- do {
- ret = cifs_writepages_begin(mapping, wbc, &xas, _start, end);
- if (ret > 0 && wbc->nr_to_write > 0)
- cond_resched();
- } while (ret > 0 && wbc->nr_to_write > 0);
-
- return ret > 0 ? 0 : ret;
-}
-
-/*
- * Write some of the pending data back to the server
- */
-static int cifs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- loff_t start, end;
- int ret;
-
- /* We have to be careful as we can end up racing with setattr()
- * truncating the pagecache since the caller doesn't take a lock here
- * to prevent it.
- */
-
- if (wbc->range_cyclic && mapping->writeback_index) {
- start = mapping->writeback_index * PAGE_SIZE;
- ret = cifs_writepages_region(mapping, wbc, &start, LLONG_MAX);
- if (ret < 0)
- goto out;
-
- if (wbc->nr_to_write <= 0) {
- mapping->writeback_index = start / PAGE_SIZE;
- goto out;
- }
-
- start = 0;
- end = mapping->writeback_index * PAGE_SIZE;
- mapping->writeback_index = 0;
- ret = cifs_writepages_region(mapping, wbc, &start, end);
- if (ret == 0)
- mapping->writeback_index = start / PAGE_SIZE;
- } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- start = 0;
- ret = cifs_writepages_region(mapping, wbc, &start, LLONG_MAX);
- if (wbc->nr_to_write > 0 && ret == 0)
- mapping->writeback_index = start / PAGE_SIZE;
- } else {
- start = wbc->range_start;
- ret = cifs_writepages_region(mapping, wbc, &start, wbc->range_end);
- }
-
-out:
- return ret;
-}
-
-static int
-cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
-{
- int rc;
- unsigned int xid;
-
- xid = get_xid();
-/* BB add check for wbc flags */
- get_page(page);
- if (!PageUptodate(page))
- cifs_dbg(FYI, "ppw - page not up to date\n");
-
- /*
- * Set the "writeback" flag, and clear "dirty" in the radix tree.
- *
- * A writepage() implementation always needs to do either this,
- * or re-dirty the page with "redirty_page_for_writepage()" in
- * the case of a failure.
- *
- * Just unlocking the page will cause the radix tree tag-bits
- * to fail to update with the state of the page correctly.
- */
- set_page_writeback(page);
-retry_write:
- rc = cifs_partialpagewrite(page, 0, PAGE_SIZE);
- if (is_retryable_error(rc)) {
- if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
- goto retry_write;
- redirty_page_for_writepage(wbc, page);
- } else if (rc != 0) {
- SetPageError(page);
- mapping_set_error(page->mapping, rc);
- } else {
- SetPageUptodate(page);
- }
- end_page_writeback(page);
- put_page(page);
- free_xid(xid);
- return rc;
-}
-
-static int cifs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- int rc;
- struct inode *inode = mapping->host;
- struct cifsFileInfo *cfile = file->private_data;
- struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
- struct folio *folio = page_folio(page);
- __u32 pid;
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
- pid = cfile->pid;
- else
- pid = current->tgid;
-
- cifs_dbg(FYI, "write_end for page %p from pos %lld with %d bytes\n",
- page, pos, copied);
-
- if (folio_test_checked(folio)) {
- if (copied == len)
- folio_mark_uptodate(folio);
- folio_clear_checked(folio);
- } else if (!folio_test_uptodate(folio) && copied == PAGE_SIZE)
- folio_mark_uptodate(folio);
-
- if (!folio_test_uptodate(folio)) {
- char *page_data;
- unsigned offset = pos & (PAGE_SIZE - 1);
- unsigned int xid;
-
- xid = get_xid();
- /* this is probably better than directly calling
- partialpage_write since in this function the file handle is
- known which we might as well leverage */
- /* BB check if anything else missing out of ppw
- such as updating last write time */
- page_data = kmap(page);
- rc = cifs_write(cfile, pid, page_data + offset, copied, &pos);
- /* if (rc < 0) should we set writebehind rc? */
- kunmap(page);
-
- free_xid(xid);
- } else {
- rc = copied;
- pos += copied;
- set_page_dirty(page);
- }
-
- if (rc > 0) {
- spin_lock(&inode->i_lock);
- if (pos > inode->i_size) {
- loff_t additional_blocks = (512 - 1 + copied) >> 9;
-
- i_size_write(inode, pos);
- /*
- * Estimate new allocation size based on the amount written.
- * This will be updated from server on close (and on queryinfo)
- */
- inode->i_blocks = min_t(blkcnt_t, (512 - 1 + pos) >> 9,
- inode->i_blocks + additional_blocks);
- }
- spin_unlock(&inode->i_lock);
- }
-
- unlock_page(page);
- put_page(page);
- /* Indication to update ctime and mtime as close is deferred */
- set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
-
- return rc;
-}
-
int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
@@ -3294,6 +2674,9 @@ strict_fsync_exit:
return rc;
}
+/*
+ * Flush data on a non-strict data.
+ */
int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
unsigned int xid;
@@ -3360,481 +2743,6 @@ int cifs_flush(struct file *file, fl_owner_t id)
return rc;
}
-static void
-cifs_uncached_writedata_release(struct kref *refcount)
-{
- struct cifs_writedata *wdata = container_of(refcount,
- struct cifs_writedata, refcount);
-
- kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release);
- cifs_writedata_release(refcount);
-}
-
-static void collect_uncached_write_data(struct cifs_aio_ctx *ctx);
-
-static void
-cifs_uncached_writev_complete(struct work_struct *work)
-{
- struct cifs_writedata *wdata = container_of(work,
- struct cifs_writedata, work);
- struct inode *inode = d_inode(wdata->cfile->dentry);
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
-
- spin_lock(&inode->i_lock);
- cifs_update_eof(cifsi, wdata->offset, wdata->bytes);
- if (cifsi->netfs.remote_i_size > inode->i_size)
- i_size_write(inode, cifsi->netfs.remote_i_size);
- spin_unlock(&inode->i_lock);
-
- complete(&wdata->done);
- collect_uncached_write_data(wdata->ctx);
- /* the below call can possibly free the last ref to aio ctx */
- kref_put(&wdata->refcount, cifs_uncached_writedata_release);
-}
-
-static int
-cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
- struct cifs_aio_ctx *ctx)
-{
- unsigned int wsize;
- struct cifs_credits credits;
- int rc;
- struct TCP_Server_Info *server = wdata->server;
-
- do {
- if (wdata->cfile->invalidHandle) {
- rc = cifs_reopen_file(wdata->cfile, false);
- if (rc == -EAGAIN)
- continue;
- else if (rc)
- break;
- }
-
-
- /*
- * Wait for credits to resend this wdata.
- * Note: we are attempting to resend the whole wdata not in
- * segments
- */
- do {
- rc = server->ops->wait_mtu_credits(server, wdata->bytes,
- &wsize, &credits);
- if (rc)
- goto fail;
-
- if (wsize < wdata->bytes) {
- add_credits_and_wake_if(server, &credits, 0);
- msleep(1000);
- }
- } while (wsize < wdata->bytes);
- wdata->credits = credits;
-
- rc = adjust_credits(server, &wdata->credits, wdata->bytes);
-
- if (!rc) {
- if (wdata->cfile->invalidHandle)
- rc = -EAGAIN;
- else {
- wdata->replay = true;
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (wdata->mr) {
- wdata->mr->need_invalidate = true;
- smbd_deregister_mr(wdata->mr);
- wdata->mr = NULL;
- }
-#endif
- rc = server->ops->async_writev(wdata,
- cifs_uncached_writedata_release);
- }
- }
-
- /* If the write was successfully sent, we are done */
- if (!rc) {
- list_add_tail(&wdata->list, wdata_list);
- return 0;
- }
-
- /* Roll back credits and retry if needed */
- add_credits_and_wake_if(server, &wdata->credits, 0);
- } while (rc == -EAGAIN);
-
-fail:
- kref_put(&wdata->refcount, cifs_uncached_writedata_release);
- return rc;
-}
-
-/*
- * Select span of a bvec iterator we're going to use. Limit it by both maximum
- * size and maximum number of segments.
- */
-static size_t cifs_limit_bvec_subset(const struct iov_iter *iter, size_t max_size,
- size_t max_segs, unsigned int *_nsegs)
-{
- const struct bio_vec *bvecs = iter->bvec;
- unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0;
- size_t len, span = 0, n = iter->count;
- size_t skip = iter->iov_offset;
-
- if (WARN_ON(!iov_iter_is_bvec(iter)) || n == 0)
- return 0;
-
- while (n && ix < nbv && skip) {
- len = bvecs[ix].bv_len;
- if (skip < len)
- break;
- skip -= len;
- n -= len;
- ix++;
- }
-
- while (n && ix < nbv) {
- len = min3(n, bvecs[ix].bv_len - skip, max_size);
- span += len;
- max_size -= len;
- nsegs++;
- ix++;
- if (max_size == 0 || nsegs >= max_segs)
- break;
- skip = 0;
- n -= len;
- }
-
- *_nsegs = nsegs;
- return span;
-}
-
-static int
-cifs_write_from_iter(loff_t fpos, size_t len, struct iov_iter *from,
- struct cifsFileInfo *open_file,
- struct cifs_sb_info *cifs_sb, struct list_head *wdata_list,
- struct cifs_aio_ctx *ctx)
-{
- int rc = 0;
- size_t cur_len, max_len;
- struct cifs_writedata *wdata;
- pid_t pid;
- struct TCP_Server_Info *server;
- unsigned int xid, max_segs = INT_MAX;
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
- pid = open_file->pid;
- else
- pid = current->tgid;
-
- server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
- xid = get_xid();
-
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->smbd_conn)
- max_segs = server->smbd_conn->max_frmr_depth;
-#endif
-
- do {
- struct cifs_credits credits_on_stack;
- struct cifs_credits *credits = &credits_on_stack;
- unsigned int wsize, nsegs = 0;
-
- if (signal_pending(current)) {
- rc = -EINTR;
- break;
- }
-
- if (open_file->invalidHandle) {
- rc = cifs_reopen_file(open_file, false);
- if (rc == -EAGAIN)
- continue;
- else if (rc)
- break;
- }
-
- rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize,
- &wsize, credits);
- if (rc)
- break;
-
- max_len = min_t(const size_t, len, wsize);
- if (!max_len) {
- rc = -EAGAIN;
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
- cur_len = cifs_limit_bvec_subset(from, max_len, max_segs, &nsegs);
- cifs_dbg(FYI, "write_from_iter len=%zx/%zx nsegs=%u/%lu/%u\n",
- cur_len, max_len, nsegs, from->nr_segs, max_segs);
- if (cur_len == 0) {
- rc = -EIO;
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
- wdata = cifs_writedata_alloc(cifs_uncached_writev_complete);
- if (!wdata) {
- rc = -ENOMEM;
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
- wdata->sync_mode = WB_SYNC_ALL;
- wdata->offset = (__u64)fpos;
- wdata->cfile = cifsFileInfo_get(open_file);
- wdata->server = server;
- wdata->pid = pid;
- wdata->bytes = cur_len;
- wdata->credits = credits_on_stack;
- wdata->iter = *from;
- wdata->ctx = ctx;
- kref_get(&ctx->refcount);
-
- iov_iter_truncate(&wdata->iter, cur_len);
-
- rc = adjust_credits(server, &wdata->credits, wdata->bytes);
-
- if (!rc) {
- if (wdata->cfile->invalidHandle)
- rc = -EAGAIN;
- else
- rc = server->ops->async_writev(wdata,
- cifs_uncached_writedata_release);
- }
-
- if (rc) {
- add_credits_and_wake_if(server, &wdata->credits, 0);
- kref_put(&wdata->refcount,
- cifs_uncached_writedata_release);
- if (rc == -EAGAIN)
- continue;
- break;
- }
-
- list_add_tail(&wdata->list, wdata_list);
- iov_iter_advance(from, cur_len);
- fpos += cur_len;
- len -= cur_len;
- } while (len > 0);
-
- free_xid(xid);
- return rc;
-}
-
-static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
-{
- struct cifs_writedata *wdata, *tmp;
- struct cifs_tcon *tcon;
- struct cifs_sb_info *cifs_sb;
- struct dentry *dentry = ctx->cfile->dentry;
- ssize_t rc;
-
- tcon = tlink_tcon(ctx->cfile->tlink);
- cifs_sb = CIFS_SB(dentry->d_sb);
-
- mutex_lock(&ctx->aio_mutex);
-
- if (list_empty(&ctx->list)) {
- mutex_unlock(&ctx->aio_mutex);
- return;
- }
-
- rc = ctx->rc;
- /*
- * Wait for and collect replies for any successful sends in order of
- * increasing offset. Once an error is hit, then return without waiting
- * for any more replies.
- */
-restart_loop:
- list_for_each_entry_safe(wdata, tmp, &ctx->list, list) {
- if (!rc) {
- if (!try_wait_for_completion(&wdata->done)) {
- mutex_unlock(&ctx->aio_mutex);
- return;
- }
-
- if (wdata->result)
- rc = wdata->result;
- else
- ctx->total_len += wdata->bytes;
-
- /* resend call if it's a retryable error */
- if (rc == -EAGAIN) {
- struct list_head tmp_list;
- struct iov_iter tmp_from = ctx->iter;
-
- INIT_LIST_HEAD(&tmp_list);
- list_del_init(&wdata->list);
-
- if (ctx->direct_io)
- rc = cifs_resend_wdata(
- wdata, &tmp_list, ctx);
- else {
- iov_iter_advance(&tmp_from,
- wdata->offset - ctx->pos);
-
- rc = cifs_write_from_iter(wdata->offset,
- wdata->bytes, &tmp_from,
- ctx->cfile, cifs_sb, &tmp_list,
- ctx);
-
- kref_put(&wdata->refcount,
- cifs_uncached_writedata_release);
- }
-
- list_splice(&tmp_list, &ctx->list);
- goto restart_loop;
- }
- }
- list_del_init(&wdata->list);
- kref_put(&wdata->refcount, cifs_uncached_writedata_release);
- }
-
- cifs_stats_bytes_written(tcon, ctx->total_len);
- set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags);
-
- ctx->rc = (rc == 0) ? ctx->total_len : rc;
-
- mutex_unlock(&ctx->aio_mutex);
-
- if (ctx->iocb && ctx->iocb->ki_complete)
- ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
- else
- complete(&ctx->done);
-}
-
-static ssize_t __cifs_writev(
- struct kiocb *iocb, struct iov_iter *from, bool direct)
-{
- struct file *file = iocb->ki_filp;
- ssize_t total_written = 0;
- struct cifsFileInfo *cfile;
- struct cifs_tcon *tcon;
- struct cifs_sb_info *cifs_sb;
- struct cifs_aio_ctx *ctx;
- int rc;
-
- rc = generic_write_checks(iocb, from);
- if (rc <= 0)
- return rc;
-
- cifs_sb = CIFS_FILE_SB(file);
- cfile = file->private_data;
- tcon = tlink_tcon(cfile->tlink);
-
- if (!tcon->ses->server->ops->async_writev)
- return -ENOSYS;
-
- ctx = cifs_aio_ctx_alloc();
- if (!ctx)
- return -ENOMEM;
-
- ctx->cfile = cifsFileInfo_get(cfile);
-
- if (!is_sync_kiocb(iocb))
- ctx->iocb = iocb;
-
- ctx->pos = iocb->ki_pos;
- ctx->direct_io = direct;
- ctx->nr_pinned_pages = 0;
-
- if (user_backed_iter(from)) {
- /*
- * Extract IOVEC/UBUF-type iterators to a BVEC-type iterator as
- * they contain references to the calling process's virtual
- * memory layout which won't be available in an async worker
- * thread. This also takes a pin on every folio involved.
- */
- rc = netfs_extract_user_iter(from, iov_iter_count(from),
- &ctx->iter, 0);
- if (rc < 0) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return rc;
- }
-
- ctx->nr_pinned_pages = rc;
- ctx->bv = (void *)ctx->iter.bvec;
- ctx->bv_need_unpin = iov_iter_extract_will_pin(from);
- } else if ((iov_iter_is_bvec(from) || iov_iter_is_kvec(from)) &&
- !is_sync_kiocb(iocb)) {
- /*
- * If the op is asynchronous, we need to copy the list attached
- * to a BVEC/KVEC-type iterator, but we assume that the storage
- * will be pinned by the caller; in any case, we may or may not
- * be able to pin the pages, so we don't try.
- */
- ctx->bv = (void *)dup_iter(&ctx->iter, from, GFP_KERNEL);
- if (!ctx->bv) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return -ENOMEM;
- }
- } else {
- /*
- * Otherwise, we just pass the iterator down as-is and rely on
- * the caller to make sure the pages referred to by the
- * iterator don't evaporate.
- */
- ctx->iter = *from;
- }
-
- ctx->len = iov_iter_count(&ctx->iter);
-
- /* grab a lock here due to read response handlers can access ctx */
- mutex_lock(&ctx->aio_mutex);
-
- rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &ctx->iter,
- cfile, cifs_sb, &ctx->list, ctx);
-
- /*
- * If at least one write was successfully sent, then discard any rc
- * value from the later writes. If the other write succeeds, then
- * we'll end up returning whatever was written. If it fails, then
- * we'll get a new rc value from that.
- */
- if (!list_empty(&ctx->list))
- rc = 0;
-
- mutex_unlock(&ctx->aio_mutex);
-
- if (rc) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return rc;
- }
-
- if (!is_sync_kiocb(iocb)) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return -EIOCBQUEUED;
- }
-
- rc = wait_for_completion_killable(&ctx->done);
- if (rc) {
- mutex_lock(&ctx->aio_mutex);
- ctx->rc = rc = -EINTR;
- total_written = ctx->total_len;
- mutex_unlock(&ctx->aio_mutex);
- } else {
- rc = ctx->rc;
- total_written = ctx->total_len;
- }
-
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
-
- if (unlikely(!total_written))
- return rc;
-
- iocb->ki_pos += total_written;
- return total_written;
-}
-
-ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
-
- cifs_revalidate_mapping(file->f_inode);
- return __cifs_writev(iocb, from, true);
-}
-
-ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
-{
- return __cifs_writev(iocb, from, false);
-}
-
static ssize_t
cifs_writev(struct kiocb *iocb, struct iov_iter *from)
{
@@ -3845,7 +2753,10 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
ssize_t rc;
- inode_lock(inode);
+ rc = netfs_start_io_write(inode);
+ if (rc < 0)
+ return rc;
+
/*
* We need to hold the sem to be sure nobody modifies lock list
* with a brlock that prevents writing.
@@ -3859,13 +2770,12 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from),
server->vals->exclusive_lock_type, 0,
NULL, CIFS_WRITE_OP))
- rc = __generic_file_write_iter(iocb, from);
+ rc = netfs_buffered_write_iter_locked(iocb, from, NULL);
else
rc = -EACCES;
out:
up_read(&cinode->lock_sem);
- inode_unlock(inode);
-
+ netfs_end_io_write(inode);
if (rc > 0)
rc = generic_write_sync(iocb, rc);
return rc;
@@ -3888,9 +2798,9 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
if (CIFS_CACHE_WRITE(cinode)) {
if (cap_unix(tcon->ses) &&
- (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
- && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
- written = generic_file_write_iter(iocb, from);
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
+ written = netfs_file_write_iter(iocb, from);
goto out;
}
written = cifs_writev(iocb, from);
@@ -3902,7 +2812,7 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
* affected pages because it may cause a error with mandatory locks on
* these pages but not on the region from pos to ppos+len-1.
*/
- written = cifs_user_writev(iocb, from);
+ written = netfs_file_write_iter(iocb, from);
if (CIFS_CACHE_READ(cinode)) {
/*
* We have read level caching and we have just sent a write
@@ -3921,449 +2831,55 @@ out:
return written;
}
-static struct cifs_readdata *cifs_readdata_alloc(work_func_t complete)
-{
- struct cifs_readdata *rdata;
-
- rdata = kzalloc(sizeof(*rdata), GFP_KERNEL);
- if (rdata) {
- kref_init(&rdata->refcount);
- INIT_LIST_HEAD(&rdata->list);
- init_completion(&rdata->done);
- INIT_WORK(&rdata->work, complete);
- }
-
- return rdata;
-}
-
-void
-cifs_readdata_release(struct kref *refcount)
-{
- struct cifs_readdata *rdata = container_of(refcount,
- struct cifs_readdata, refcount);
-
- if (rdata->ctx)
- kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (rdata->mr) {
- smbd_deregister_mr(rdata->mr);
- rdata->mr = NULL;
- }
-#endif
- if (rdata->cfile)
- cifsFileInfo_put(rdata->cfile);
-
- kfree(rdata);
-}
-
-static void collect_uncached_read_data(struct cifs_aio_ctx *ctx);
-
-static void
-cifs_uncached_readv_complete(struct work_struct *work)
-{
- struct cifs_readdata *rdata = container_of(work,
- struct cifs_readdata, work);
-
- complete(&rdata->done);
- collect_uncached_read_data(rdata->ctx);
- /* the below call can possibly free the last ref to aio ctx */
- kref_put(&rdata->refcount, cifs_readdata_release);
-}
-
-static int cifs_resend_rdata(struct cifs_readdata *rdata,
- struct list_head *rdata_list,
- struct cifs_aio_ctx *ctx)
+ssize_t cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- unsigned int rsize;
- struct cifs_credits credits;
- int rc;
- struct TCP_Server_Info *server;
-
- /* XXX: should we pick a new channel here? */
- server = rdata->server;
-
- do {
- if (rdata->cfile->invalidHandle) {
- rc = cifs_reopen_file(rdata->cfile, true);
- if (rc == -EAGAIN)
- continue;
- else if (rc)
- break;
- }
-
- /*
- * Wait for credits to resend this rdata.
- * Note: we are attempting to resend the whole rdata not in
- * segments
- */
- do {
- rc = server->ops->wait_mtu_credits(server, rdata->bytes,
- &rsize, &credits);
-
- if (rc)
- goto fail;
-
- if (rsize < rdata->bytes) {
- add_credits_and_wake_if(server, &credits, 0);
- msleep(1000);
- }
- } while (rsize < rdata->bytes);
- rdata->credits = credits;
-
- rc = adjust_credits(server, &rdata->credits, rdata->bytes);
- if (!rc) {
- if (rdata->cfile->invalidHandle)
- rc = -EAGAIN;
- else {
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (rdata->mr) {
- rdata->mr->need_invalidate = true;
- smbd_deregister_mr(rdata->mr);
- rdata->mr = NULL;
- }
-#endif
- rc = server->ops->async_readv(rdata);
- }
- }
-
- /* If the read was successfully sent, we are done */
- if (!rc) {
- /* Add to aio pending list */
- list_add_tail(&rdata->list, rdata_list);
- return 0;
- }
-
- /* Roll back credits and retry if needed */
- add_credits_and_wake_if(server, &rdata->credits, 0);
- } while (rc == -EAGAIN);
-
-fail:
- kref_put(&rdata->refcount, cifs_readdata_release);
- return rc;
-}
-
-static int
-cifs_send_async_read(loff_t fpos, size_t len, struct cifsFileInfo *open_file,
- struct cifs_sb_info *cifs_sb, struct list_head *rdata_list,
- struct cifs_aio_ctx *ctx)
-{
- struct cifs_readdata *rdata;
- unsigned int rsize, nsegs, max_segs = INT_MAX;
- struct cifs_credits credits_on_stack;
- struct cifs_credits *credits = &credits_on_stack;
- size_t cur_len, max_len;
- int rc;
- pid_t pid;
- struct TCP_Server_Info *server;
-
- server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
-
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->smbd_conn)
- max_segs = server->smbd_conn->max_frmr_depth;
-#endif
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
- pid = open_file->pid;
- else
- pid = current->tgid;
-
- do {
- if (open_file->invalidHandle) {
- rc = cifs_reopen_file(open_file, true);
- if (rc == -EAGAIN)
- continue;
- else if (rc)
- break;
- }
-
- if (cifs_sb->ctx->rsize == 0)
- cifs_sb->ctx->rsize =
- server->ops->negotiate_rsize(tlink_tcon(open_file->tlink),
- cifs_sb->ctx);
-
- rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
- &rsize, credits);
- if (rc)
- break;
-
- max_len = min_t(size_t, len, rsize);
-
- cur_len = cifs_limit_bvec_subset(&ctx->iter, max_len,
- max_segs, &nsegs);
- cifs_dbg(FYI, "read-to-iter len=%zx/%zx nsegs=%u/%lu/%u\n",
- cur_len, max_len, nsegs, ctx->iter.nr_segs, max_segs);
- if (cur_len == 0) {
- rc = -EIO;
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
- rdata = cifs_readdata_alloc(cifs_uncached_readv_complete);
- if (!rdata) {
- add_credits_and_wake_if(server, credits, 0);
- rc = -ENOMEM;
- break;
- }
-
- rdata->server = server;
- rdata->cfile = cifsFileInfo_get(open_file);
- rdata->offset = fpos;
- rdata->bytes = cur_len;
- rdata->pid = pid;
- rdata->credits = credits_on_stack;
- rdata->ctx = ctx;
- kref_get(&ctx->refcount);
-
- rdata->iter = ctx->iter;
- iov_iter_truncate(&rdata->iter, cur_len);
-
- rc = adjust_credits(server, &rdata->credits, rdata->bytes);
-
- if (!rc) {
- if (rdata->cfile->invalidHandle)
- rc = -EAGAIN;
- else
- rc = server->ops->async_readv(rdata);
- }
+ ssize_t rc;
+ struct inode *inode = file_inode(iocb->ki_filp);
- if (rc) {
- add_credits_and_wake_if(server, &rdata->credits, 0);
- kref_put(&rdata->refcount, cifs_readdata_release);
- if (rc == -EAGAIN)
- continue;
- break;
- }
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return netfs_unbuffered_read_iter(iocb, iter);
- list_add_tail(&rdata->list, rdata_list);
- iov_iter_advance(&ctx->iter, cur_len);
- fpos += cur_len;
- len -= cur_len;
- } while (len > 0);
+ rc = cifs_revalidate_mapping(inode);
+ if (rc)
+ return rc;
- return rc;
+ return netfs_file_read_iter(iocb, iter);
}
-static void
-collect_uncached_read_data(struct cifs_aio_ctx *ctx)
+ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct cifs_readdata *rdata, *tmp;
- struct cifs_sb_info *cifs_sb;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+ ssize_t written;
int rc;
- cifs_sb = CIFS_SB(ctx->cfile->dentry->d_sb);
-
- mutex_lock(&ctx->aio_mutex);
-
- if (list_empty(&ctx->list)) {
- mutex_unlock(&ctx->aio_mutex);
- return;
- }
-
- rc = ctx->rc;
- /* the loop below should proceed in the order of increasing offsets */
-again:
- list_for_each_entry_safe(rdata, tmp, &ctx->list, list) {
- if (!rc) {
- if (!try_wait_for_completion(&rdata->done)) {
- mutex_unlock(&ctx->aio_mutex);
- return;
- }
-
- if (rdata->result == -EAGAIN) {
- /* resend call if it's a retryable error */
- struct list_head tmp_list;
- unsigned int got_bytes = rdata->got_bytes;
-
- list_del_init(&rdata->list);
- INIT_LIST_HEAD(&tmp_list);
-
- if (ctx->direct_io) {
- /*
- * Re-use rdata as this is a
- * direct I/O
- */
- rc = cifs_resend_rdata(
- rdata,
- &tmp_list, ctx);
- } else {
- rc = cifs_send_async_read(
- rdata->offset + got_bytes,
- rdata->bytes - got_bytes,
- rdata->cfile, cifs_sb,
- &tmp_list, ctx);
-
- kref_put(&rdata->refcount,
- cifs_readdata_release);
- }
-
- list_splice(&tmp_list, &ctx->list);
-
- goto again;
- } else if (rdata->result)
- rc = rdata->result;
-
- /* if there was a short read -- discard anything left */
- if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
- rc = -ENODATA;
-
- ctx->total_len += rdata->got_bytes;
- }
- list_del_init(&rdata->list);
- kref_put(&rdata->refcount, cifs_readdata_release);
- }
-
- /* mask nodata case */
- if (rc == -ENODATA)
- rc = 0;
-
- ctx->rc = (rc == 0) ? (ssize_t)ctx->total_len : rc;
-
- mutex_unlock(&ctx->aio_mutex);
-
- if (ctx->iocb && ctx->iocb->ki_complete)
- ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
- else
- complete(&ctx->done);
-}
-
-static ssize_t __cifs_readv(
- struct kiocb *iocb, struct iov_iter *to, bool direct)
-{
- size_t len;
- struct file *file = iocb->ki_filp;
- struct cifs_sb_info *cifs_sb;
- struct cifsFileInfo *cfile;
- struct cifs_tcon *tcon;
- ssize_t rc, total_read = 0;
- loff_t offset = iocb->ki_pos;
- struct cifs_aio_ctx *ctx;
-
- len = iov_iter_count(to);
- if (!len)
- return 0;
-
- cifs_sb = CIFS_FILE_SB(file);
- cfile = file->private_data;
- tcon = tlink_tcon(cfile->tlink);
-
- if (!tcon->ses->server->ops->async_readv)
- return -ENOSYS;
-
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- cifs_dbg(FYI, "attempting read on write only file instance\n");
-
- ctx = cifs_aio_ctx_alloc();
- if (!ctx)
- return -ENOMEM;
-
- ctx->pos = offset;
- ctx->direct_io = direct;
- ctx->len = len;
- ctx->cfile = cifsFileInfo_get(cfile);
- ctx->nr_pinned_pages = 0;
-
- if (!is_sync_kiocb(iocb))
- ctx->iocb = iocb;
-
- if (user_backed_iter(to)) {
- /*
- * Extract IOVEC/UBUF-type iterators to a BVEC-type iterator as
- * they contain references to the calling process's virtual
- * memory layout which won't be available in an async worker
- * thread. This also takes a pin on every folio involved.
- */
- rc = netfs_extract_user_iter(to, iov_iter_count(to),
- &ctx->iter, 0);
- if (rc < 0) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return rc;
- }
-
- ctx->nr_pinned_pages = rc;
- ctx->bv = (void *)ctx->iter.bvec;
- ctx->bv_need_unpin = iov_iter_extract_will_pin(to);
- ctx->should_dirty = true;
- } else if ((iov_iter_is_bvec(to) || iov_iter_is_kvec(to)) &&
- !is_sync_kiocb(iocb)) {
- /*
- * If the op is asynchronous, we need to copy the list attached
- * to a BVEC/KVEC-type iterator, but we assume that the storage
- * will be retained by the caller; in any case, we may or may
- * not be able to pin the pages, so we don't try.
- */
- ctx->bv = (void *)dup_iter(&ctx->iter, to, GFP_KERNEL);
- if (!ctx->bv) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return -ENOMEM;
- }
- } else {
- /*
- * Otherwise, we just pass the iterator down as-is and rely on
- * the caller to make sure the pages referred to by the
- * iterator don't evaporate.
- */
- ctx->iter = *to;
- }
-
- if (direct) {
- rc = filemap_write_and_wait_range(file->f_inode->i_mapping,
- offset, offset + len - 1);
- if (rc) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return -EAGAIN;
+ if (iocb->ki_filp->f_flags & O_DIRECT) {
+ written = netfs_unbuffered_write_iter(iocb, from);
+ if (written > 0 && CIFS_CACHE_READ(cinode)) {
+ cifs_zap_mapping(inode);
+ cifs_dbg(FYI,
+ "Set no oplock for inode=%p after a write operation\n",
+ inode);
+ cinode->oplock = 0;
}
+ return written;
}
- /* grab a lock here due to read response handlers can access ctx */
- mutex_lock(&ctx->aio_mutex);
-
- rc = cifs_send_async_read(offset, len, cfile, cifs_sb, &ctx->list, ctx);
-
- /* if at least one read request send succeeded, then reset rc */
- if (!list_empty(&ctx->list))
- rc = 0;
-
- mutex_unlock(&ctx->aio_mutex);
-
- if (rc) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return rc;
- }
-
- if (!is_sync_kiocb(iocb)) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return -EIOCBQUEUED;
- }
-
- rc = wait_for_completion_killable(&ctx->done);
- if (rc) {
- mutex_lock(&ctx->aio_mutex);
- ctx->rc = rc = -EINTR;
- total_read = ctx->total_len;
- mutex_unlock(&ctx->aio_mutex);
- } else {
- rc = ctx->rc;
- total_read = ctx->total_len;
- }
+ written = cifs_get_writer(cinode);
+ if (written)
+ return written;
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ written = netfs_file_write_iter(iocb, from);
- if (total_read) {
- iocb->ki_pos += total_read;
- return total_read;
+ if (!CIFS_CACHE_WRITE(CIFS_I(inode))) {
+ rc = filemap_fdatawrite(inode->i_mapping);
+ if (rc)
+ cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n",
+ rc, inode);
}
- return rc;
-}
-
-ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to)
-{
- return __cifs_readv(iocb, to, true);
-}
-ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
-{
- return __cifs_readv(iocb, to, false);
+ cifs_put_writer(cinode);
+ return written;
}
ssize_t
@@ -4386,12 +2902,15 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
* pos+len-1.
*/
if (!CIFS_CACHE_READ(cinode))
- return cifs_user_readv(iocb, to);
+ return netfs_unbuffered_read_iter(iocb, to);
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
- ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
- return generic_file_read_iter(iocb, to);
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return netfs_unbuffered_read_iter(iocb, to);
+ return netfs_buffered_read_iter(iocb, to);
+ }
/*
* We need to hold the sem to be sure nobody modifies lock list
@@ -4400,126 +2919,19 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
down_read(&cinode->lock_sem);
if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to),
tcon->ses->server->vals->shared_lock_type,
- 0, NULL, CIFS_READ_OP))
- rc = generic_file_read_iter(iocb, to);
+ 0, NULL, CIFS_READ_OP)) {
+ if (iocb->ki_flags & IOCB_DIRECT)
+ rc = netfs_unbuffered_read_iter(iocb, to);
+ else
+ rc = netfs_buffered_read_iter(iocb, to);
+ }
up_read(&cinode->lock_sem);
return rc;
}
-static ssize_t
-cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
-{
- int rc = -EACCES;
- unsigned int bytes_read = 0;
- unsigned int total_read;
- unsigned int current_read_size;
- unsigned int rsize;
- struct cifs_sb_info *cifs_sb;
- struct cifs_tcon *tcon;
- struct TCP_Server_Info *server;
- unsigned int xid;
- char *cur_offset;
- struct cifsFileInfo *open_file;
- struct cifs_io_parms io_parms = {0};
- int buf_type = CIFS_NO_BUFFER;
- __u32 pid;
-
- xid = get_xid();
- cifs_sb = CIFS_FILE_SB(file);
-
- /* FIXME: set up handlers for larger reads and/or convert to async */
- rsize = min_t(unsigned int, cifs_sb->ctx->rsize, CIFSMaxBufSize);
-
- if (file->private_data == NULL) {
- rc = -EBADF;
- free_xid(xid);
- return rc;
- }
- open_file = file->private_data;
- tcon = tlink_tcon(open_file->tlink);
- server = cifs_pick_channel(tcon->ses);
-
- if (!server->ops->sync_read) {
- free_xid(xid);
- return -ENOSYS;
- }
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
- pid = open_file->pid;
- else
- pid = current->tgid;
-
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- cifs_dbg(FYI, "attempting read on write only file instance\n");
-
- for (total_read = 0, cur_offset = read_data; read_size > total_read;
- total_read += bytes_read, cur_offset += bytes_read) {
- do {
- current_read_size = min_t(uint, read_size - total_read,
- rsize);
- /*
- * For windows me and 9x we do not want to request more
- * than it negotiated since it will refuse the read
- * then.
- */
- if (!(tcon->ses->capabilities &
- tcon->ses->server->vals->cap_large_files)) {
- current_read_size = min_t(uint,
- current_read_size, CIFSMaxBufSize);
- }
- if (open_file->invalidHandle) {
- rc = cifs_reopen_file(open_file, true);
- if (rc != 0)
- break;
- }
- io_parms.pid = pid;
- io_parms.tcon = tcon;
- io_parms.offset = *offset;
- io_parms.length = current_read_size;
- io_parms.server = server;
- rc = server->ops->sync_read(xid, &open_file->fid, &io_parms,
- &bytes_read, &cur_offset,
- &buf_type);
- } while (rc == -EAGAIN);
-
- if (rc || (bytes_read == 0)) {
- if (total_read) {
- break;
- } else {
- free_xid(xid);
- return rc;
- }
- } else {
- cifs_stats_bytes_read(tcon, total_read);
- *offset += bytes_read;
- }
- }
- free_xid(xid);
- return total_read;
-}
-
-/*
- * If the page is mmap'ed into a process' page tables, then we need to make
- * sure that it doesn't change while being written back.
- */
static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf)
{
- struct folio *folio = page_folio(vmf->page);
-
- /* Wait for the folio to be written to the cache before we allow it to
- * be modified. We then assume the entire folio will need writing back.
- */
-#ifdef CONFIG_CIFS_FSCACHE
- if (folio_test_fscache(folio) &&
- folio_wait_fscache_killable(folio) < 0)
- return VM_FAULT_RETRY;
-#endif
-
- folio_wait_writeback(folio);
-
- if (folio_lock_killable(folio) < 0)
- return VM_FAULT_RETRY;
- return VM_FAULT_LOCKED;
+ return netfs_page_mkwrite(vmf, NULL);
}
static const struct vm_operations_struct cifs_file_vm_ops = {
@@ -4565,290 +2977,6 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
return rc;
}
-/*
- * Unlock a bunch of folios in the pagecache.
- */
-static void cifs_unlock_folios(struct address_space *mapping, pgoff_t first, pgoff_t last)
-{
- struct folio *folio;
- XA_STATE(xas, &mapping->i_pages, first);
-
- rcu_read_lock();
- xas_for_each(&xas, folio, last) {
- folio_unlock(folio);
- }
- rcu_read_unlock();
-}
-
-static void cifs_readahead_complete(struct work_struct *work)
-{
- struct cifs_readdata *rdata = container_of(work,
- struct cifs_readdata, work);
- struct folio *folio;
- pgoff_t last;
- bool good = rdata->result == 0 || (rdata->result == -EAGAIN && rdata->got_bytes);
-
- XA_STATE(xas, &rdata->mapping->i_pages, rdata->offset / PAGE_SIZE);
-
- if (good)
- cifs_readahead_to_fscache(rdata->mapping->host,
- rdata->offset, rdata->bytes);
-
- if (iov_iter_count(&rdata->iter) > 0)
- iov_iter_zero(iov_iter_count(&rdata->iter), &rdata->iter);
-
- last = (rdata->offset + rdata->bytes - 1) / PAGE_SIZE;
-
- rcu_read_lock();
- xas_for_each(&xas, folio, last) {
- if (good) {
- flush_dcache_folio(folio);
- folio_mark_uptodate(folio);
- }
- folio_unlock(folio);
- }
- rcu_read_unlock();
-
- kref_put(&rdata->refcount, cifs_readdata_release);
-}
-
-static void cifs_readahead(struct readahead_control *ractl)
-{
- struct cifsFileInfo *open_file = ractl->file->private_data;
- struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(ractl->file);
- struct TCP_Server_Info *server;
- unsigned int xid, nr_pages, cache_nr_pages = 0;
- unsigned int ra_pages;
- pgoff_t next_cached = ULONG_MAX, ra_index;
- bool caching = fscache_cookie_enabled(cifs_inode_cookie(ractl->mapping->host)) &&
- cifs_inode_cookie(ractl->mapping->host)->cache_priv;
- bool check_cache = caching;
- pid_t pid;
- int rc = 0;
-
- /* Note that readahead_count() lags behind our dequeuing of pages from
- * the ractl, wo we have to keep track for ourselves.
- */
- ra_pages = readahead_count(ractl);
- ra_index = readahead_index(ractl);
-
- xid = get_xid();
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
- pid = open_file->pid;
- else
- pid = current->tgid;
-
- server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
-
- cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n",
- __func__, ractl->file, ractl->mapping, ra_pages);
-
- /*
- * Chop the readahead request up into rsize-sized read requests.
- */
- while ((nr_pages = ra_pages)) {
- unsigned int i, rsize;
- struct cifs_readdata *rdata;
- struct cifs_credits credits_on_stack;
- struct cifs_credits *credits = &credits_on_stack;
- struct folio *folio;
- pgoff_t fsize;
-
- /*
- * Find out if we have anything cached in the range of
- * interest, and if so, where the next chunk of cached data is.
- */
- if (caching) {
- if (check_cache) {
- rc = cifs_fscache_query_occupancy(
- ractl->mapping->host, ra_index, nr_pages,
- &next_cached, &cache_nr_pages);
- if (rc < 0)
- caching = false;
- check_cache = false;
- }
-
- if (ra_index == next_cached) {
- /*
- * TODO: Send a whole batch of pages to be read
- * by the cache.
- */
- folio = readahead_folio(ractl);
- fsize = folio_nr_pages(folio);
- ra_pages -= fsize;
- ra_index += fsize;
- if (cifs_readpage_from_fscache(ractl->mapping->host,
- &folio->page) < 0) {
- /*
- * TODO: Deal with cache read failure
- * here, but for the moment, delegate
- * that to readpage.
- */
- caching = false;
- }
- folio_unlock(folio);
- next_cached += fsize;
- cache_nr_pages -= fsize;
- if (cache_nr_pages == 0)
- check_cache = true;
- continue;
- }
- }
-
- if (open_file->invalidHandle) {
- rc = cifs_reopen_file(open_file, true);
- if (rc) {
- if (rc == -EAGAIN)
- continue;
- break;
- }
- }
-
- if (cifs_sb->ctx->rsize == 0)
- cifs_sb->ctx->rsize =
- server->ops->negotiate_rsize(tlink_tcon(open_file->tlink),
- cifs_sb->ctx);
-
- rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
- &rsize, credits);
- if (rc)
- break;
- nr_pages = min_t(size_t, rsize / PAGE_SIZE, ra_pages);
- if (next_cached != ULONG_MAX)
- nr_pages = min_t(size_t, nr_pages, next_cached - ra_index);
-
- /*
- * Give up immediately if rsize is too small to read an entire
- * page. The VFS will fall back to readpage. We should never
- * reach this point however since we set ra_pages to 0 when the
- * rsize is smaller than a cache page.
- */
- if (unlikely(!nr_pages)) {
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
- rdata = cifs_readdata_alloc(cifs_readahead_complete);
- if (!rdata) {
- /* best to give up if we're out of mem */
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
- rdata->offset = ra_index * PAGE_SIZE;
- rdata->bytes = nr_pages * PAGE_SIZE;
- rdata->cfile = cifsFileInfo_get(open_file);
- rdata->server = server;
- rdata->mapping = ractl->mapping;
- rdata->pid = pid;
- rdata->credits = credits_on_stack;
-
- for (i = 0; i < nr_pages; i++) {
- if (!readahead_folio(ractl))
- WARN_ON(1);
- }
- ra_pages -= nr_pages;
- ra_index += nr_pages;
-
- iov_iter_xarray(&rdata->iter, ITER_DEST, &rdata->mapping->i_pages,
- rdata->offset, rdata->bytes);
-
- rc = adjust_credits(server, &rdata->credits, rdata->bytes);
- if (!rc) {
- if (rdata->cfile->invalidHandle)
- rc = -EAGAIN;
- else
- rc = server->ops->async_readv(rdata);
- }
-
- if (rc) {
- add_credits_and_wake_if(server, &rdata->credits, 0);
- cifs_unlock_folios(rdata->mapping,
- rdata->offset / PAGE_SIZE,
- (rdata->offset + rdata->bytes - 1) / PAGE_SIZE);
- /* Fallback to the readpage in error/reconnect cases */
- kref_put(&rdata->refcount, cifs_readdata_release);
- break;
- }
-
- kref_put(&rdata->refcount, cifs_readdata_release);
- }
-
- free_xid(xid);
-}
-
-/*
- * cifs_readpage_worker must be called with the page pinned
- */
-static int cifs_readpage_worker(struct file *file, struct page *page,
- loff_t *poffset)
-{
- struct inode *inode = file_inode(file);
- struct timespec64 atime, mtime;
- char *read_data;
- int rc;
-
- /* Is the page cached? */
- rc = cifs_readpage_from_fscache(inode, page);
- if (rc == 0)
- goto read_complete;
-
- read_data = kmap(page);
- /* for reads over a certain size could initiate async read ahead */
-
- rc = cifs_read(file, read_data, PAGE_SIZE, poffset);
-
- if (rc < 0)
- goto io_error;
- else
- cifs_dbg(FYI, "Bytes read %d\n", rc);
-
- /* we do not want atime to be less than mtime, it broke some apps */
- atime = inode_set_atime_to_ts(inode, current_time(inode));
- mtime = inode_get_mtime(inode);
- if (timespec64_compare(&atime, &mtime) < 0)
- inode_set_atime_to_ts(inode, inode_get_mtime(inode));
-
- if (PAGE_SIZE > rc)
- memset(read_data + rc, 0, PAGE_SIZE - rc);
-
- flush_dcache_page(page);
- SetPageUptodate(page);
- rc = 0;
-
-io_error:
- kunmap(page);
-
-read_complete:
- unlock_page(page);
- return rc;
-}
-
-static int cifs_read_folio(struct file *file, struct folio *folio)
-{
- struct page *page = &folio->page;
- loff_t offset = page_file_offset(page);
- int rc = -EACCES;
- unsigned int xid;
-
- xid = get_xid();
-
- if (file->private_data == NULL) {
- rc = -EBADF;
- free_xid(xid);
- return rc;
- }
-
- cifs_dbg(FYI, "read_folio %p at offset %d 0x%x\n",
- page, (int)offset, (int)offset);
-
- rc = cifs_readpage_worker(file, page, &offset);
-
- free_xid(xid);
- return rc;
-}
-
static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
{
struct cifsFileInfo *open_file;
@@ -4896,123 +3024,6 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file,
return true;
}
-static int cifs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
-{
- int oncethru = 0;
- pgoff_t index = pos >> PAGE_SHIFT;
- loff_t offset = pos & (PAGE_SIZE - 1);
- loff_t page_start = pos & PAGE_MASK;
- loff_t i_size;
- struct page *page;
- int rc = 0;
-
- cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len);
-
-start:
- page = grab_cache_page_write_begin(mapping, index);
- if (!page) {
- rc = -ENOMEM;
- goto out;
- }
-
- if (PageUptodate(page))
- goto out;
-
- /*
- * If we write a full page it will be up to date, no need to read from
- * the server. If the write is short, we'll end up doing a sync write
- * instead.
- */
- if (len == PAGE_SIZE)
- goto out;
-
- /*
- * optimize away the read when we have an oplock, and we're not
- * expecting to use any of the data we'd be reading in. That
- * is, when the page lies beyond the EOF, or straddles the EOF
- * and the write will cover all of the existing data.
- */
- if (CIFS_CACHE_READ(CIFS_I(mapping->host))) {
- i_size = i_size_read(mapping->host);
- if (page_start >= i_size ||
- (offset == 0 && (pos + len) >= i_size)) {
- zero_user_segments(page, 0, offset,
- offset + len,
- PAGE_SIZE);
- /*
- * PageChecked means that the parts of the page
- * to which we're not writing are considered up
- * to date. Once the data is copied to the
- * page, it can be set uptodate.
- */
- SetPageChecked(page);
- goto out;
- }
- }
-
- if ((file->f_flags & O_ACCMODE) != O_WRONLY && !oncethru) {
- /*
- * might as well read a page, it is fast enough. If we get
- * an error, we don't need to return it. cifs_write_end will
- * do a sync write instead since PG_uptodate isn't set.
- */
- cifs_readpage_worker(file, page, &page_start);
- put_page(page);
- oncethru = 1;
- goto start;
- } else {
- /* we could try using another file handle if there is one -
- but how would we lock it to prevent close of that handle
- racing with this read? In any case
- this will be written out by write_end so is fine */
- }
-out:
- *pagep = page;
- return rc;
-}
-
-static bool cifs_release_folio(struct folio *folio, gfp_t gfp)
-{
- if (folio_test_private(folio))
- return 0;
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
- fscache_note_page_release(cifs_inode_cookie(folio->mapping->host));
- return true;
-}
-
-static void cifs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length)
-{
- folio_wait_fscache(folio);
-}
-
-static int cifs_launder_folio(struct folio *folio)
-{
- int rc = 0;
- loff_t range_start = folio_pos(folio);
- loff_t range_end = range_start + folio_size(folio);
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0,
- .range_start = range_start,
- .range_end = range_end,
- };
-
- cifs_dbg(FYI, "Launder page: %lu\n", folio->index);
-
- if (folio_clear_dirty_for_io(folio))
- rc = cifs_writepage_locked(&folio->page, &wbc);
-
- folio_wait_fscache(folio);
- return rc;
-}
-
void cifs_oplock_break(struct work_struct *work)
{
struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
@@ -5102,25 +3113,6 @@ out:
cifs_done_oplock_break(cinode);
}
-/*
- * The presence of cifs_direct_io() in the address space ops vector
- * allowes open() O_DIRECT flags which would have failed otherwise.
- *
- * In the non-cached mode (mount with cache=none), we shunt off direct read and write requests
- * so this method should never be called.
- *
- * Direct IO is not yet supported in the cached mode.
- */
-static ssize_t
-cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter)
-{
- /*
- * FIXME
- * Eventually need to support direct IO for non forcedirectio mounts
- */
- return -EINVAL;
-}
-
static int cifs_swap_activate(struct swap_info_struct *sis,
struct file *swap_file, sector_t *span)
{
@@ -5182,22 +3174,19 @@ static void cifs_swap_deactivate(struct file *file)
}
const struct address_space_operations cifs_addr_ops = {
- .read_folio = cifs_read_folio,
- .readahead = cifs_readahead,
- .writepages = cifs_writepages,
- .write_begin = cifs_write_begin,
- .write_end = cifs_write_end,
- .dirty_folio = netfs_dirty_folio,
- .release_folio = cifs_release_folio,
- .direct_IO = cifs_direct_io,
- .invalidate_folio = cifs_invalidate_folio,
- .launder_folio = cifs_launder_folio,
- .migrate_folio = filemap_migrate_folio,
+ .read_folio = netfs_read_folio,
+ .readahead = netfs_readahead,
+ .writepages = netfs_writepages,
+ .dirty_folio = netfs_dirty_folio,
+ .release_folio = netfs_release_folio,
+ .direct_IO = noop_direct_IO,
+ .invalidate_folio = netfs_invalidate_folio,
+ .migrate_folio = filemap_migrate_folio,
/*
* TODO: investigate and if useful we could add an is_dirty_writeback
* helper if needed
*/
- .swap_activate = cifs_swap_activate,
+ .swap_activate = cifs_swap_activate,
.swap_deactivate = cifs_swap_deactivate,
};
@@ -5207,13 +3196,10 @@ const struct address_space_operations cifs_addr_ops = {
* to leave cifs_readahead out of the address space operations.
*/
const struct address_space_operations cifs_addr_ops_smallbuf = {
- .read_folio = cifs_read_folio,
- .writepages = cifs_writepages,
- .write_begin = cifs_write_begin,
- .write_end = cifs_write_end,
- .dirty_folio = netfs_dirty_folio,
- .release_folio = cifs_release_folio,
- .invalidate_folio = cifs_invalidate_folio,
- .launder_folio = cifs_launder_folio,
- .migrate_folio = filemap_migrate_folio,
+ .read_folio = netfs_read_folio,
+ .writepages = netfs_writepages,
+ .dirty_folio = netfs_dirty_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
+ .migrate_folio = filemap_migrate_folio,
};
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index b7bfe705b2c4..3bbac925d076 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -162,6 +162,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_string("username", Opt_user),
fsparam_string("pass", Opt_pass),
fsparam_string("password", Opt_pass),
+ fsparam_string("password2", Opt_pass2),
fsparam_string("ip", Opt_ip),
fsparam_string("addr", Opt_ip),
fsparam_string("domain", Opt_domain),
@@ -345,6 +346,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
new_ctx->nodename = NULL;
new_ctx->username = NULL;
new_ctx->password = NULL;
+ new_ctx->password2 = NULL;
new_ctx->server_hostname = NULL;
new_ctx->domainname = NULL;
new_ctx->UNC = NULL;
@@ -357,6 +359,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
DUP_CTX_STR(prepath);
DUP_CTX_STR(username);
DUP_CTX_STR(password);
+ DUP_CTX_STR(password2);
DUP_CTX_STR(server_hostname);
DUP_CTX_STR(UNC);
DUP_CTX_STR(source);
@@ -745,6 +748,16 @@ static int smb3_fs_context_validate(struct fs_context *fc)
/* set the port that we got earlier */
cifs_set_port((struct sockaddr *)&ctx->dstaddr, ctx->port);
+ if (ctx->uid_specified && !ctx->forceuid_specified) {
+ ctx->override_uid = 1;
+ pr_notice("enabling forceuid mount option implicitly because uid= option is specified\n");
+ }
+
+ if (ctx->gid_specified && !ctx->forcegid_specified) {
+ ctx->override_gid = 1;
+ pr_notice("enabling forcegid mount option implicitly because gid= option is specified\n");
+ }
+
if (ctx->override_uid && !ctx->uid_specified) {
ctx->override_uid = 0;
pr_notice("ignoring forceuid mount option specified with no uid= option\n");
@@ -905,6 +918,8 @@ static int smb3_reconfigure(struct fs_context *fc)
else {
kfree_sensitive(ses->password);
ses->password = kstrdup(ctx->password, GFP_KERNEL);
+ kfree_sensitive(ses->password2);
+ ses->password2 = kstrdup(ctx->password2, GFP_KERNEL);
}
STEAL_STRING(cifs_sb, ctx, domainname);
STEAL_STRING(cifs_sb, ctx, nodename);
@@ -1014,12 +1029,14 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->override_uid = 0;
else
ctx->override_uid = 1;
+ ctx->forceuid_specified = true;
break;
case Opt_forcegid:
if (result.negated)
ctx->override_gid = 0;
else
ctx->override_gid = 1;
+ ctx->forcegid_specified = true;
break;
case Opt_perm:
if (result.negated)
@@ -1305,6 +1322,18 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
goto cifs_parse_mount_err;
}
break;
+ case Opt_pass2:
+ kfree_sensitive(ctx->password2);
+ ctx->password2 = NULL;
+ if (strlen(param->string) == 0)
+ break;
+
+ ctx->password2 = kstrdup(param->string, GFP_KERNEL);
+ if (ctx->password2 == NULL) {
+ cifs_errorf(fc, "OOM when copying password2 string\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
case Opt_ip:
if (strlen(param->string) == 0) {
ctx->got_ip = false;
@@ -1608,6 +1637,8 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
cifs_parse_mount_err:
kfree_sensitive(ctx->password);
ctx->password = NULL;
+ kfree_sensitive(ctx->password2);
+ ctx->password2 = NULL;
return -EINVAL;
}
@@ -1713,6 +1744,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
ctx->username = NULL;
kfree_sensitive(ctx->password);
ctx->password = NULL;
+ kfree_sensitive(ctx->password2);
+ ctx->password2 = NULL;
kfree(ctx->server_hostname);
ctx->server_hostname = NULL;
kfree(ctx->UNC);
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index 8a35645e0b65..cf577ec0dd0a 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -145,6 +145,7 @@ enum cifs_param {
Opt_source,
Opt_user,
Opt_pass,
+ Opt_pass2,
Opt_ip,
Opt_domain,
Opt_srcaddr,
@@ -164,6 +165,8 @@ enum cifs_param {
};
struct smb3_fs_context {
+ bool forceuid_specified;
+ bool forcegid_specified;
bool uid_specified;
bool cruid_specified;
bool gid_specified;
@@ -177,6 +180,7 @@ struct smb3_fs_context {
char *username;
char *password;
+ char *password2;
char *domainname;
char *source;
char *server_hostname;
diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c
index 340efce8f052..01424a5cdb99 100644
--- a/fs/smb/client/fscache.c
+++ b/fs/smb/client/fscache.c
@@ -43,12 +43,23 @@ int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
char *key;
int ret = -ENOMEM;
+ if (tcon->fscache_acquired)
+ return 0;
+
+ mutex_lock(&tcon->fscache_lock);
+ if (tcon->fscache_acquired) {
+ mutex_unlock(&tcon->fscache_lock);
+ return 0;
+ }
+ tcon->fscache_acquired = true;
+
tcon->fscache = NULL;
switch (sa->sa_family) {
case AF_INET:
case AF_INET6:
break;
default:
+ mutex_unlock(&tcon->fscache_lock);
cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family);
return -EINVAL;
}
@@ -57,6 +68,7 @@ int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
sharename = extract_sharename(tcon->tree_name);
if (IS_ERR(sharename)) {
+ mutex_unlock(&tcon->fscache_lock);
cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__);
return PTR_ERR(sharename);
}
@@ -82,6 +94,11 @@ int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
}
pr_err("Cache volume key already in use (%s)\n", key);
vcookie = NULL;
+ trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count,
+ netfs_trace_tcon_ref_see_fscache_collision);
+ } else {
+ trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count,
+ netfs_trace_tcon_ref_see_fscache_okay);
}
tcon->fscache = vcookie;
@@ -90,6 +107,7 @@ out_2:
kfree(key);
out:
kfree(sharename);
+ mutex_unlock(&tcon->fscache_lock);
return ret;
}
@@ -102,6 +120,8 @@ void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
cifs_fscache_fill_volume_coherency(tcon, &cd);
fscache_relinquish_volume(tcon->fscache, &cd, false);
tcon->fscache = NULL;
+ trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count,
+ netfs_trace_tcon_ref_see_fscache_relinq);
}
void cifs_fscache_get_inode_cookie(struct inode *inode)
@@ -150,112 +170,3 @@ void cifs_fscache_release_inode_cookie(struct inode *inode)
cifsi->netfs.cache = NULL;
}
}
-
-/*
- * Fallback page reading interface.
- */
-static int fscache_fallback_read_page(struct inode *inode, struct page *page)
-{
- struct netfs_cache_resources cres;
- struct fscache_cookie *cookie = cifs_inode_cookie(inode);
- struct iov_iter iter;
- struct bio_vec bvec;
- int ret;
-
- memset(&cres, 0, sizeof(cres));
- bvec_set_page(&bvec, page, PAGE_SIZE, 0);
- iov_iter_bvec(&iter, ITER_DEST, &bvec, 1, PAGE_SIZE);
-
- ret = fscache_begin_read_operation(&cres, cookie);
- if (ret < 0)
- return ret;
-
- ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL,
- NULL, NULL);
- fscache_end_operation(&cres);
- return ret;
-}
-
-/*
- * Fallback page writing interface.
- */
-static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_t len,
- bool no_space_allocated_yet)
-{
- struct netfs_cache_resources cres;
- struct fscache_cookie *cookie = cifs_inode_cookie(inode);
- struct iov_iter iter;
- int ret;
-
- memset(&cres, 0, sizeof(cres));
- iov_iter_xarray(&iter, ITER_SOURCE, &inode->i_mapping->i_pages, start, len);
-
- ret = fscache_begin_write_operation(&cres, cookie);
- if (ret < 0)
- return ret;
-
- ret = cres.ops->prepare_write(&cres, &start, &len, len, i_size_read(inode),
- no_space_allocated_yet);
- if (ret == 0)
- ret = fscache_write(&cres, start, &iter, NULL, NULL);
- fscache_end_operation(&cres);
- return ret;
-}
-
-/*
- * Retrieve a page from FS-Cache
- */
-int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
-{
- int ret;
-
- cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n",
- __func__, cifs_inode_cookie(inode), page, inode);
-
- ret = fscache_fallback_read_page(inode, page);
- if (ret < 0)
- return ret;
-
- /* Read completed synchronously */
- SetPageUptodate(page);
- return 0;
-}
-
-void __cifs_readahead_to_fscache(struct inode *inode, loff_t pos, size_t len)
-{
- cifs_dbg(FYI, "%s: (fsc: %p, p: %llx, l: %zx, i: %p)\n",
- __func__, cifs_inode_cookie(inode), pos, len, inode);
-
- fscache_fallback_write_pages(inode, pos, len, true);
-}
-
-/*
- * Query the cache occupancy.
- */
-int __cifs_fscache_query_occupancy(struct inode *inode,
- pgoff_t first, unsigned int nr_pages,
- pgoff_t *_data_first,
- unsigned int *_data_nr_pages)
-{
- struct netfs_cache_resources cres;
- struct fscache_cookie *cookie = cifs_inode_cookie(inode);
- loff_t start, data_start;
- size_t len, data_len;
- int ret;
-
- ret = fscache_begin_read_operation(&cres, cookie);
- if (ret < 0)
- return ret;
-
- start = first * PAGE_SIZE;
- len = nr_pages * PAGE_SIZE;
- ret = cres.ops->query_occupancy(&cres, start, len, PAGE_SIZE,
- &data_start, &data_len);
- if (ret == 0) {
- *_data_first = data_start / PAGE_SIZE;
- *_data_nr_pages = len / PAGE_SIZE;
- }
-
- fscache_end_operation(&cres);
- return ret;
-}
diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h
index 1f2ea9f5cc9a..f06cb24f5f3c 100644
--- a/fs/smb/client/fscache.h
+++ b/fs/smb/client/fscache.h
@@ -74,41 +74,6 @@ static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags
i_size_read(inode), flags);
}
-extern int __cifs_fscache_query_occupancy(struct inode *inode,
- pgoff_t first, unsigned int nr_pages,
- pgoff_t *_data_first,
- unsigned int *_data_nr_pages);
-
-static inline int cifs_fscache_query_occupancy(struct inode *inode,
- pgoff_t first, unsigned int nr_pages,
- pgoff_t *_data_first,
- unsigned int *_data_nr_pages)
-{
- if (!cifs_inode_cookie(inode))
- return -ENOBUFS;
- return __cifs_fscache_query_occupancy(inode, first, nr_pages,
- _data_first, _data_nr_pages);
-}
-
-extern int __cifs_readpage_from_fscache(struct inode *pinode, struct page *ppage);
-extern void __cifs_readahead_to_fscache(struct inode *pinode, loff_t pos, size_t len);
-
-
-static inline int cifs_readpage_from_fscache(struct inode *inode,
- struct page *page)
-{
- if (cifs_inode_cookie(inode))
- return __cifs_readpage_from_fscache(inode, page);
- return -ENOBUFS;
-}
-
-static inline void cifs_readahead_to_fscache(struct inode *inode,
- loff_t pos, size_t len)
-{
- if (cifs_inode_cookie(inode))
- __cifs_readahead_to_fscache(inode, pos, len);
-}
-
static inline bool cifs_fscache_enabled(struct inode *inode)
{
return fscache_cookie_enabled(cifs_inode_cookie(inode));
@@ -131,25 +96,6 @@ static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { re
static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) {}
static inline bool cifs_fscache_enabled(struct inode *inode) { return false; }
-static inline int cifs_fscache_query_occupancy(struct inode *inode,
- pgoff_t first, unsigned int nr_pages,
- pgoff_t *_data_first,
- unsigned int *_data_nr_pages)
-{
- *_data_first = ULONG_MAX;
- *_data_nr_pages = 0;
- return -ENOBUFS;
-}
-
-static inline int
-cifs_readpage_from_fscache(struct inode *inode, struct page *page)
-{
- return -ENOBUFS;
-}
-
-static inline
-void cifs_readahead_to_fscache(struct inode *inode, loff_t pos, size_t len) {}
-
#endif /* CONFIG_CIFS_FSCACHE */
#endif /* _CIFS_FSCACHE_H */
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 91b07ef9e25c..e8bfeea23660 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -28,14 +28,29 @@
#include "cached_dir.h"
#include "reparse.h"
+/*
+ * Set parameters for the netfs library
+ */
+static void cifs_set_netfs_context(struct inode *inode)
+{
+ struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
+ netfs_inode_init(&cifs_i->netfs, &cifs_req_ops, true);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
+ __set_bit(NETFS_ICTX_WRITETHROUGH, &cifs_i->netfs.flags);
+}
+
static void cifs_set_ops(struct inode *inode)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct netfs_inode *ictx = netfs_inode(inode);
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &cifs_file_inode_ops;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+ set_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
inode->i_fop = &cifs_file_direct_nobrl_ops;
else
@@ -57,6 +72,7 @@ static void cifs_set_ops(struct inode *inode)
inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
else
inode->i_data.a_ops = &cifs_addr_ops;
+ mapping_set_large_folios(inode->i_mapping);
break;
case S_IFDIR:
if (IS_AUTOMOUNT(inode)) {
@@ -221,8 +237,10 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
if (fattr->cf_flags & CIFS_FATTR_JUNCTION)
inode->i_flags |= S_AUTOMOUNT;
- if (inode->i_state & I_NEW)
+ if (inode->i_state & I_NEW) {
+ cifs_set_netfs_context(inode);
cifs_set_ops(inode);
+ }
return 0;
}
@@ -1105,7 +1123,8 @@ static int cifs_get_fattr(struct cifs_open_info_data *data,
} else {
cifs_open_info_to_fattr(fattr, data, sb);
}
- if (!rc && fattr->cf_flags & CIFS_FATTR_DELETE_PENDING)
+ if (!rc && *inode &&
+ (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING))
cifs_mark_open_handles_for_deleted_file(*inode, full_path);
break;
case -EREMOTE:
@@ -2430,24 +2449,6 @@ cifs_dentry_needs_reval(struct dentry *dentry)
return false;
}
-/*
- * Zap the cache. Called when invalid_mapping flag is set.
- */
-int
-cifs_invalidate_mapping(struct inode *inode)
-{
- int rc = 0;
-
- if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
- rc = invalidate_inode_pages2(inode->i_mapping);
- if (rc)
- cifs_dbg(VFS, "%s: invalidate inode %p failed with rc %d\n",
- __func__, inode, rc);
- }
-
- return rc;
-}
-
/**
* cifs_wait_bit_killable - helper for functions that are sleeping on bit locks
*
@@ -2484,9 +2485,12 @@ cifs_revalidate_mapping(struct inode *inode)
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)
goto skip_invalidate;
- rc = cifs_invalidate_mapping(inode);
- if (rc)
+ rc = filemap_invalidate_inode(inode, true, 0, LLONG_MAX);
+ if (rc) {
+ cifs_dbg(VFS, "%s: invalidate inode %p failed with rc %d\n",
+ __func__, inode, rc);
set_bit(CIFS_INO_INVALID_MAPPING, flags);
+ }
}
skip_invalidate:
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index 33ac4f8f5050..07c468ddb88a 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -98,6 +98,7 @@ sesInfoFree(struct cifs_ses *buf_to_free)
kfree(buf_to_free->serverDomain);
kfree(buf_to_free->serverNOS);
kfree_sensitive(buf_to_free->password);
+ kfree_sensitive(buf_to_free->password2);
kfree(buf_to_free->user_name);
kfree(buf_to_free->domainName);
kfree_sensitive(buf_to_free->auth_key.response);
@@ -110,9 +111,10 @@ sesInfoFree(struct cifs_ses *buf_to_free)
}
struct cifs_tcon *
-tcon_info_alloc(bool dir_leases_enabled)
+tcon_info_alloc(bool dir_leases_enabled, enum smb3_tcon_ref_trace trace)
{
struct cifs_tcon *ret_buf;
+ static atomic_t tcon_debug_id;
ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL);
if (!ret_buf)
@@ -129,7 +131,8 @@ tcon_info_alloc(bool dir_leases_enabled)
atomic_inc(&tconInfoAllocCount);
ret_buf->status = TID_NEW;
- ++ret_buf->tc_count;
+ ret_buf->debug_id = atomic_inc_return(&tcon_debug_id);
+ ret_buf->tc_count = 1;
spin_lock_init(&ret_buf->tc_lock);
INIT_LIST_HEAD(&ret_buf->openFileList);
INIT_LIST_HEAD(&ret_buf->tcon_list);
@@ -138,17 +141,22 @@ tcon_info_alloc(bool dir_leases_enabled)
atomic_set(&ret_buf->num_local_opens, 0);
atomic_set(&ret_buf->num_remote_opens, 0);
ret_buf->stats_from_time = ktime_get_real_seconds();
+#ifdef CONFIG_CIFS_FSCACHE
+ mutex_init(&ret_buf->fscache_lock);
+#endif
+ trace_smb3_tcon_ref(ret_buf->debug_id, ret_buf->tc_count, trace);
return ret_buf;
}
void
-tconInfoFree(struct cifs_tcon *tcon)
+tconInfoFree(struct cifs_tcon *tcon, enum smb3_tcon_ref_trace trace)
{
if (tcon == NULL) {
cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n");
return;
}
+ trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count, trace);
free_cached_dirs(tcon->cfids);
atomic_dec(&tconInfoAllocCount);
kfree(tcon->nativeFileSystem);
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index cc72be5a93a9..677ef6f99a5b 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -767,7 +767,7 @@ smb2_cancelled_close_fid(struct work_struct *work)
if (rc)
cifs_tcon_dbg(VFS, "Close cancelled mid failed rc:%d\n", rc);
- cifs_put_tcon(tcon);
+ cifs_put_tcon(tcon, netfs_trace_tcon_ref_put_cancelled_close_fid);
kfree(cancelled);
}
@@ -811,6 +811,8 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
if (tcon->tc_count <= 0) {
struct TCP_Server_Info *server = NULL;
+ trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count,
+ netfs_trace_tcon_ref_see_cancelled_close);
WARN_ONCE(tcon->tc_count < 0, "tcon refcount is negative");
spin_unlock(&cifs_tcp_ses_lock);
@@ -823,12 +825,14 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
return 0;
}
tcon->tc_count++;
+ trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count,
+ netfs_trace_tcon_ref_get_cancelled_close);
spin_unlock(&cifs_tcp_ses_lock);
rc = __smb2_handle_cancelled_cmd(tcon, SMB2_CLOSE_HE, 0,
persistent_fid, volatile_fid);
if (rc)
- cifs_put_tcon(tcon);
+ cifs_put_tcon(tcon, netfs_trace_tcon_ref_put_cancelled_close);
return rc;
}
@@ -856,7 +860,7 @@ smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *serve
rsp->PersistentFileId,
rsp->VolatileFileId);
if (rc)
- cifs_put_tcon(tcon);
+ cifs_put_tcon(tcon, netfs_trace_tcon_ref_put_cancelled_mid);
return rc;
}
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index b156eefa75d7..ef18cd30f66c 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -217,8 +217,8 @@ smb2_get_credits(struct mid_q_entry *mid)
}
static int
-smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
- unsigned int *num, struct cifs_credits *credits)
+smb2_wait_mtu_credits(struct TCP_Server_Info *server, size_t size,
+ size_t *num, struct cifs_credits *credits)
{
int rc = 0;
unsigned int scredits, in_flight;
@@ -2915,8 +2915,11 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
tcon = list_first_entry_or_null(&ses->tcon_list,
struct cifs_tcon,
tcon_list);
- if (tcon)
+ if (tcon) {
tcon->tc_count++;
+ trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count,
+ netfs_trace_tcon_ref_get_dfs_refer);
+ }
spin_unlock(&cifs_tcp_ses_lock);
}
@@ -2980,6 +2983,8 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
/* ipc tcons are not refcounted */
spin_lock(&cifs_tcp_ses_lock);
tcon->tc_count--;
+ trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count,
+ netfs_trace_tcon_ref_dec_dfs_refer);
/* tc_count can never go negative */
WARN_ON(tcon->tc_count < 0);
spin_unlock(&cifs_tcp_ses_lock);
@@ -4485,7 +4490,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
unsigned int cur_off;
unsigned int cur_page_idx;
unsigned int pad_len;
- struct cifs_readdata *rdata = mid->callback_data;
+ struct cifs_io_subrequest *rdata = mid->callback_data;
struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
int length;
bool use_rdma_mr = false;
@@ -4587,7 +4592,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
/* Copy the data to the output I/O iterator. */
rdata->result = cifs_copy_pages_to_iter(pages, pages_len,
- cur_off, &rdata->iter);
+ cur_off, &rdata->subreq.io_iter);
if (rdata->result != 0) {
if (is_offloaded)
mid->mid_state = MID_RESPONSE_MALFORMED;
@@ -4601,7 +4606,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
/* read response payload is in buf */
WARN_ONCE(pages && !xa_empty(pages),
"read data can be either in buf or in pages");
- length = copy_to_iter(buf + data_offset, data_len, &rdata->iter);
+ length = copy_to_iter(buf + data_offset, data_len, &rdata->subreq.io_iter);
if (length < 0)
return length;
rdata->got_bytes = data_len;
@@ -4964,68 +4969,84 @@ static int smb2_next_header(struct TCP_Server_Info *server, char *buf,
return 0;
}
-int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
- struct dentry *dentry, struct cifs_tcon *tcon,
- const char *full_path, umode_t mode, dev_t dev)
+static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev)
{
- struct cifs_open_info_data buf = {};
struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_open_parms oparms;
struct cifs_io_parms io_parms = {};
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_fid fid;
unsigned int bytes_written;
- struct win_dev *pdev;
+ struct win_dev pdev = {};
struct kvec iov[2];
__u32 oplock = server->oplocks ? REQ_OPLOCK : 0;
int rc;
- if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode))
+ switch (mode & S_IFMT) {
+ case S_IFCHR:
+ strscpy(pdev.type, "IntxCHR");
+ pdev.major = cpu_to_le64(MAJOR(dev));
+ pdev.minor = cpu_to_le64(MINOR(dev));
+ break;
+ case S_IFBLK:
+ strscpy(pdev.type, "IntxBLK");
+ pdev.major = cpu_to_le64(MAJOR(dev));
+ pdev.minor = cpu_to_le64(MINOR(dev));
+ break;
+ case S_IFIFO:
+ strscpy(pdev.type, "LnxFIFO");
+ break;
+ default:
return -EPERM;
+ }
- oparms = (struct cifs_open_parms) {
- .tcon = tcon,
- .cifs_sb = cifs_sb,
- .desired_access = GENERIC_WRITE,
- .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR |
- CREATE_OPTION_SPECIAL),
- .disposition = FILE_CREATE,
- .path = full_path,
- .fid = &fid,
- };
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, GENERIC_WRITE,
+ FILE_CREATE, CREATE_NOT_DIR |
+ CREATE_OPTION_SPECIAL, ACL_NO_MODE);
+ oparms.fid = &fid;
- rc = server->ops->open(xid, &oparms, &oplock, &buf);
+ rc = server->ops->open(xid, &oparms, &oplock, NULL);
if (rc)
return rc;
- /*
- * BB Do not bother to decode buf since no local inode yet to put
- * timestamps in, but we can reuse it safely.
- */
- pdev = (struct win_dev *)&buf.fi;
io_parms.pid = current->tgid;
io_parms.tcon = tcon;
- io_parms.length = sizeof(*pdev);
- iov[1].iov_base = pdev;
- iov[1].iov_len = sizeof(*pdev);
- if (S_ISCHR(mode)) {
- memcpy(pdev->type, "IntxCHR", 8);
- pdev->major = cpu_to_le64(MAJOR(dev));
- pdev->minor = cpu_to_le64(MINOR(dev));
- } else if (S_ISBLK(mode)) {
- memcpy(pdev->type, "IntxBLK", 8);
- pdev->major = cpu_to_le64(MAJOR(dev));
- pdev->minor = cpu_to_le64(MINOR(dev));
- } else if (S_ISFIFO(mode)) {
- memcpy(pdev->type, "LnxFIFO", 8);
- }
+ io_parms.length = sizeof(pdev);
+ iov[1].iov_base = &pdev;
+ iov[1].iov_len = sizeof(pdev);
rc = server->ops->sync_write(xid, &fid, &io_parms,
&bytes_written, iov, 1);
server->ops->close(xid, tcon, &fid);
- d_drop(dentry);
- /* FIXME: add code here to set EAs */
- cifs_free_open_info(&buf);
+ return rc;
+}
+
+int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev)
+{
+ struct inode *new = NULL;
+ int rc;
+
+ rc = __cifs_sfu_make_node(xid, inode, dentry, tcon,
+ full_path, mode, dev);
+ if (rc)
+ return rc;
+
+ if (tcon->posix_extensions) {
+ rc = smb311_posix_get_inode_info(&new, full_path, NULL,
+ inode->i_sb, xid);
+ } else if (tcon->unix_ext) {
+ rc = cifs_get_inode_info_unix(&new, full_path,
+ inode->i_sb, xid);
+ } else {
+ rc = cifs_get_inode_info(&new, full_path, NULL,
+ inode->i_sb, xid, NULL);
+ }
+ if (!rc)
+ d_instantiate(dentry, new);
return rc;
}
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index c0c4933af5fc..993ac36c3d58 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -23,6 +23,8 @@
#include <linux/uuid.h>
#include <linux/pagemap.h>
#include <linux/xattr.h>
+#include <linux/netfs.h>
+#include <trace/events/netfs.h>
#include "cifsglob.h"
#include "cifsacl.h"
#include "cifsproto.h"
@@ -367,6 +369,17 @@ again:
}
rc = cifs_setup_session(0, ses, server, nls_codepage);
+ if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) {
+ /*
+ * Try alternate password for next reconnect (key rotation
+ * could be enabled on the server e.g.) if an alternate
+ * password is available and the current password is expired,
+ * but do not swap on non pwd related errors like host down
+ */
+ if (ses->password2)
+ swap(ses->password2, ses->password);
+ }
+
if ((rc == -EACCES) && !tcon->retry) {
mutex_unlock(&ses->session_mutex);
rc = -EHOSTDOWN;
@@ -4127,6 +4140,8 @@ void smb2_reconnect_server(struct work_struct *work)
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
if (tcon->need_reconnect || tcon->need_reopen_files) {
tcon->tc_count++;
+ trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count,
+ netfs_trace_tcon_ref_get_reconnect_server);
list_add_tail(&tcon->rlist, &tmp_list);
tcon_selected = true;
}
@@ -4165,14 +4180,14 @@ void smb2_reconnect_server(struct work_struct *work)
if (tcon->ipc)
cifs_put_smb_ses(tcon->ses);
else
- cifs_put_tcon(tcon);
+ cifs_put_tcon(tcon, netfs_trace_tcon_ref_put_reconnect_server);
}
if (!ses_exist)
goto done;
/* allocate a dummy tcon struct used for reconnect */
- tcon = tcon_info_alloc(false);
+ tcon = tcon_info_alloc(false, netfs_trace_tcon_ref_new_reconnect_server);
if (!tcon) {
resched = true;
list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) {
@@ -4195,7 +4210,7 @@ void smb2_reconnect_server(struct work_struct *work)
list_del_init(&ses->rlist);
cifs_put_smb_ses(ses);
}
- tconInfoFree(tcon);
+ tconInfoFree(tcon, netfs_trace_tcon_ref_free_reconnect_server);
done:
cifs_dbg(FYI, "Reconnecting tcons and channels finished\n");
@@ -4378,7 +4393,7 @@ static inline bool smb3_use_rdma_offload(struct cifs_io_parms *io_parms)
*/
static int
smb2_new_read_req(void **buf, unsigned int *total_len,
- struct cifs_io_parms *io_parms, struct cifs_readdata *rdata,
+ struct cifs_io_parms *io_parms, struct cifs_io_subrequest *rdata,
unsigned int remaining_bytes, int request_type)
{
int rc = -EACCES;
@@ -4406,10 +4421,12 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
req->Length = cpu_to_le32(io_parms->length);
req->Offset = cpu_to_le64(io_parms->offset);
- trace_smb3_read_enter(0 /* xid */,
- io_parms->persistent_fid,
- io_parms->tcon->tid, io_parms->tcon->ses->Suid,
- io_parms->offset, io_parms->length);
+ trace_smb3_read_enter(rdata ? rdata->rreq->debug_id : 0,
+ rdata ? rdata->subreq.debug_index : 0,
+ rdata ? rdata->xid : 0,
+ io_parms->persistent_fid,
+ io_parms->tcon->tid, io_parms->tcon->ses->Suid,
+ io_parms->offset, io_parms->length);
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a RDMA write, fill in and append
@@ -4419,7 +4436,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
struct smbd_buffer_descriptor_v1 *v1;
bool need_invalidate = server->dialect == SMB30_PROT_ID;
- rdata->mr = smbd_register_mr(server->smbd_conn, &rdata->iter,
+ rdata->mr = smbd_register_mr(server->smbd_conn, &rdata->subreq.io_iter,
true, need_invalidate);
if (!rdata->mr)
return -EAGAIN;
@@ -4470,8 +4487,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
static void
smb2_readv_callback(struct mid_q_entry *mid)
{
- struct cifs_readdata *rdata = mid->callback_data;
- struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+ struct cifs_io_subrequest *rdata = mid->callback_data;
+ struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink);
struct TCP_Server_Info *server = rdata->server;
struct smb2_hdr *shdr =
(struct smb2_hdr *)rdata->iov[0].iov_base;
@@ -4479,17 +4496,17 @@ smb2_readv_callback(struct mid_q_entry *mid)
struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], .rq_nvec = 1 };
if (rdata->got_bytes) {
- rqst.rq_iter = rdata->iter;
- rqst.rq_iter_size = iov_iter_count(&rdata->iter);
+ rqst.rq_iter = rdata->subreq.io_iter;
+ rqst.rq_iter_size = iov_iter_count(&rdata->subreq.io_iter);
}
WARN_ONCE(rdata->server != mid->server,
"rdata server %p != mid server %p",
rdata->server, mid->server);
- cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
+ cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n",
__func__, mid->mid, mid->mid_state, rdata->result,
- rdata->bytes);
+ rdata->subreq.len);
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
@@ -4499,7 +4516,6 @@ smb2_readv_callback(struct mid_q_entry *mid)
if (server->sign && !mid->decrypted) {
int rc;
- iov_iter_revert(&rqst.rq_iter, rdata->got_bytes);
iov_iter_truncate(&rqst.rq_iter, rdata->got_bytes);
rc = smb2_verify_signature(&rqst, server);
if (rc)
@@ -4540,24 +4556,40 @@ smb2_readv_callback(struct mid_q_entry *mid)
#endif
if (rdata->result && rdata->result != -ENODATA) {
cifs_stats_fail_inc(tcon, SMB2_READ_HE);
- trace_smb3_read_err(0 /* xid */,
- rdata->cfile->fid.persistent_fid,
- tcon->tid, tcon->ses->Suid, rdata->offset,
- rdata->bytes, rdata->result);
+ trace_smb3_read_err(rdata->rreq->debug_id,
+ rdata->subreq.debug_index,
+ rdata->xid,
+ rdata->req->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, rdata->subreq.start,
+ rdata->subreq.len, rdata->result);
} else
- trace_smb3_read_done(0 /* xid */,
- rdata->cfile->fid.persistent_fid,
+ trace_smb3_read_done(rdata->rreq->debug_id,
+ rdata->subreq.debug_index,
+ rdata->xid,
+ rdata->req->cfile->fid.persistent_fid,
tcon->tid, tcon->ses->Suid,
- rdata->offset, rdata->got_bytes);
+ rdata->subreq.start, rdata->got_bytes);
- queue_work(cifsiod_wq, &rdata->work);
+ if (rdata->result == -ENODATA) {
+ /* We may have got an EOF error because fallocate
+ * failed to enlarge the file.
+ */
+ if (rdata->subreq.start < rdata->subreq.rreq->i_size)
+ rdata->result = 0;
+ }
+ if (rdata->result == 0 || rdata->result == -EAGAIN)
+ iov_iter_advance(&rdata->subreq.io_iter, rdata->got_bytes);
+ rdata->credits.value = 0;
+ netfs_subreq_terminated(&rdata->subreq,
+ (rdata->result == 0 || rdata->result == -EAGAIN) ?
+ rdata->got_bytes : rdata->result, true);
release_mid(mid);
add_credits(server, &credits, 0);
}
/* smb2_async_readv - send an async read, and set up mid to handle result */
int
-smb2_async_readv(struct cifs_readdata *rdata)
+smb2_async_readv(struct cifs_io_subrequest *rdata)
{
int rc, flags = 0;
char *buf;
@@ -4566,22 +4598,22 @@ smb2_async_readv(struct cifs_readdata *rdata)
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 1 };
struct TCP_Server_Info *server;
- struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+ struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink);
unsigned int total_len;
int credit_request;
- cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
- __func__, rdata->offset, rdata->bytes);
+ cifs_dbg(FYI, "%s: offset=%llu bytes=%zu\n",
+ __func__, rdata->subreq.start, rdata->subreq.len);
if (!rdata->server)
rdata->server = cifs_pick_channel(tcon->ses);
- io_parms.tcon = tlink_tcon(rdata->cfile->tlink);
+ io_parms.tcon = tlink_tcon(rdata->req->cfile->tlink);
io_parms.server = server = rdata->server;
- io_parms.offset = rdata->offset;
- io_parms.length = rdata->bytes;
- io_parms.persistent_fid = rdata->cfile->fid.persistent_fid;
- io_parms.volatile_fid = rdata->cfile->fid.volatile_fid;
+ io_parms.offset = rdata->subreq.start;
+ io_parms.length = rdata->subreq.len;
+ io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid;
+ io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid;
io_parms.pid = rdata->pid;
rc = smb2_new_read_req(
@@ -4598,7 +4630,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
shdr = (struct smb2_hdr *)buf;
if (rdata->credits.value > 0) {
- shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
+ shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->subreq.len,
SMB2_MAX_BUFFER_SIZE));
credit_request = le16_to_cpu(shdr->CreditCharge) + 8;
if (server->credits >= server->max_credits)
@@ -4608,22 +4640,22 @@ smb2_async_readv(struct cifs_readdata *rdata)
min_t(int, server->max_credits -
server->credits, credit_request));
- rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+ rc = adjust_credits(server, &rdata->credits, rdata->subreq.len);
if (rc)
goto async_readv_out;
flags |= CIFS_HAS_CREDITS;
}
- kref_get(&rdata->refcount);
rc = cifs_call_async(server, &rqst,
cifs_readv_receive, smb2_readv_callback,
smb3_handle_read_data, rdata, flags,
&rdata->credits);
if (rc) {
- kref_put(&rdata->refcount, cifs_readdata_release);
cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
- trace_smb3_read_err(0 /* xid */, io_parms.persistent_fid,
+ trace_smb3_read_err(rdata->rreq->debug_id,
+ rdata->subreq.debug_index,
+ rdata->xid, io_parms.persistent_fid,
io_parms.tcon->tid,
io_parms.tcon->ses->Suid,
io_parms.offset, io_parms.length, rc);
@@ -4674,22 +4706,23 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
if (rc != -ENODATA) {
cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
cifs_dbg(VFS, "Send error in read = %d\n", rc);
- trace_smb3_read_err(xid,
+ trace_smb3_read_err(0, 0, xid,
req->PersistentFileId,
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length,
rc);
} else
- trace_smb3_read_done(xid, req->PersistentFileId, io_parms->tcon->tid,
+ trace_smb3_read_done(0, 0, xid,
+ req->PersistentFileId, io_parms->tcon->tid,
ses->Suid, io_parms->offset, 0);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
cifs_small_buf_release(req);
return rc == -ENODATA ? 0 : rc;
} else
- trace_smb3_read_done(xid,
- req->PersistentFileId,
- io_parms->tcon->tid, ses->Suid,
- io_parms->offset, io_parms->length);
+ trace_smb3_read_done(0, 0, xid,
+ req->PersistentFileId,
+ io_parms->tcon->tid, ses->Suid,
+ io_parms->offset, io_parms->length);
cifs_small_buf_release(req);
@@ -4722,12 +4755,13 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
static void
smb2_writev_callback(struct mid_q_entry *mid)
{
- struct cifs_writedata *wdata = mid->callback_data;
- struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct cifs_io_subrequest *wdata = mid->callback_data;
+ struct cifs_tcon *tcon = tlink_tcon(wdata->req->cfile->tlink);
struct TCP_Server_Info *server = wdata->server;
- unsigned int written;
struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
struct cifs_credits credits = { .value = 0, .instance = 0 };
+ ssize_t result = 0;
+ size_t written;
WARN_ONCE(wdata->server != mid->server,
"wdata server %p != mid server %p",
@@ -4737,8 +4771,8 @@ smb2_writev_callback(struct mid_q_entry *mid)
case MID_RESPONSE_RECEIVED:
credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
- wdata->result = smb2_check_receive(mid, server, 0);
- if (wdata->result != 0)
+ result = smb2_check_receive(mid, server, 0);
+ if (result != 0)
break;
written = le32_to_cpu(rsp->DataLength);
@@ -4748,24 +4782,25 @@ smb2_writev_callback(struct mid_q_entry *mid)
* client. OS/2 servers are known to set incorrect
* CountHigh values.
*/
- if (written > wdata->bytes)
+ if (written > wdata->subreq.len)
written &= 0xFFFF;
- if (written < wdata->bytes)
+ if (written < wdata->subreq.len)
wdata->result = -ENOSPC;
else
- wdata->bytes = written;
+ wdata->subreq.len = written;
+ iov_iter_advance(&wdata->subreq.io_iter, written);
break;
case MID_REQUEST_SUBMITTED:
case MID_RETRY_NEEDED:
- wdata->result = -EAGAIN;
+ result = -EAGAIN;
break;
case MID_RESPONSE_MALFORMED:
credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
fallthrough;
default:
- wdata->result = -EIO;
+ result = -EIO;
break;
}
#ifdef CONFIG_CIFS_SMB_DIRECT
@@ -4781,44 +4816,44 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->mr = NULL;
}
#endif
- if (wdata->result) {
+ if (result) {
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
- trace_smb3_write_err(0 /* no xid */,
- wdata->cfile->fid.persistent_fid,
- tcon->tid, tcon->ses->Suid, wdata->offset,
- wdata->bytes, wdata->result);
+ trace_smb3_write_err(wdata->xid,
+ wdata->req->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, wdata->subreq.start,
+ wdata->subreq.len, wdata->result);
if (wdata->result == -ENOSPC)
pr_warn_once("Out of space writing to %s\n",
tcon->tree_name);
} else
trace_smb3_write_done(0 /* no xid */,
- wdata->cfile->fid.persistent_fid,
+ wdata->req->cfile->fid.persistent_fid,
tcon->tid, tcon->ses->Suid,
- wdata->offset, wdata->bytes);
+ wdata->subreq.start, wdata->subreq.len);
- queue_work(cifsiod_wq, &wdata->work);
+ wdata->credits.value = 0;
+ cifs_write_subrequest_terminated(wdata, result ?: written, true);
release_mid(mid);
add_credits(server, &credits, 0);
}
/* smb2_async_writev - send an async write, and set up mid to handle result */
-int
-smb2_async_writev(struct cifs_writedata *wdata,
- void (*release)(struct kref *kref))
+void
+smb2_async_writev(struct cifs_io_subrequest *wdata)
{
int rc = -EACCES, flags = 0;
struct smb2_write_req *req = NULL;
struct smb2_hdr *shdr;
- struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct cifs_tcon *tcon = tlink_tcon(wdata->req->cfile->tlink);
struct TCP_Server_Info *server = wdata->server;
struct kvec iov[1];
struct smb_rqst rqst = { };
- unsigned int total_len;
+ unsigned int total_len, xid = wdata->xid;
struct cifs_io_parms _io_parms;
struct cifs_io_parms *io_parms = NULL;
int credit_request;
- if (!wdata->server || wdata->replay)
+ if (!wdata->server || test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags))
server = wdata->server = cifs_pick_channel(tcon->ses);
/*
@@ -4828,10 +4863,10 @@ smb2_async_writev(struct cifs_writedata *wdata,
_io_parms = (struct cifs_io_parms) {
.tcon = tcon,
.server = server,
- .offset = wdata->offset,
- .length = wdata->bytes,
- .persistent_fid = wdata->cfile->fid.persistent_fid,
- .volatile_fid = wdata->cfile->fid.volatile_fid,
+ .offset = wdata->subreq.start,
+ .length = wdata->subreq.len,
+ .persistent_fid = wdata->req->cfile->fid.persistent_fid,
+ .volatile_fid = wdata->req->cfile->fid.volatile_fid,
.pid = wdata->pid,
};
io_parms = &_io_parms;
@@ -4839,7 +4874,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
rc = smb2_plain_req_init(SMB2_WRITE, tcon, server,
(void **) &req, &total_len);
if (rc)
- return rc;
+ goto out;
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -4857,7 +4892,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
- trace_smb3_write_enter(0 /* xid */,
+ trace_smb3_write_enter(wdata->xid,
io_parms->persistent_fid,
io_parms->tcon->tid,
io_parms->tcon->ses->Suid,
@@ -4871,10 +4906,10 @@ smb2_async_writev(struct cifs_writedata *wdata,
*/
if (smb3_use_rdma_offload(io_parms)) {
struct smbd_buffer_descriptor_v1 *v1;
- size_t data_size = iov_iter_count(&wdata->iter);
+ size_t data_size = iov_iter_count(&wdata->subreq.io_iter);
bool need_invalidate = server->dialect == SMB30_PROT_ID;
- wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->iter,
+ wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->subreq.io_iter,
false, need_invalidate);
if (!wdata->mr) {
rc = -EAGAIN;
@@ -4901,9 +4936,9 @@ smb2_async_writev(struct cifs_writedata *wdata,
rqst.rq_iov = iov;
rqst.rq_nvec = 1;
- rqst.rq_iter = wdata->iter;
+ rqst.rq_iter = wdata->subreq.io_iter;
rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter);
- if (wdata->replay)
+ if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags))
smb2_set_replay(server, &rqst);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (wdata->mr)
@@ -4921,7 +4956,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
#endif
if (wdata->credits.value > 0) {
- shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
+ shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->subreq.len,
SMB2_MAX_BUFFER_SIZE));
credit_request = le16_to_cpu(shdr->CreditCharge) + 8;
if (server->credits >= server->max_credits)
@@ -4938,25 +4973,27 @@ smb2_async_writev(struct cifs_writedata *wdata,
flags |= CIFS_HAS_CREDITS;
}
- kref_get(&wdata->refcount);
rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, NULL,
wdata, flags, &wdata->credits);
-
+ /* Can't touch wdata if rc == 0 */
if (rc) {
- trace_smb3_write_err(0 /* no xid */,
+ trace_smb3_write_err(xid,
io_parms->persistent_fid,
io_parms->tcon->tid,
io_parms->tcon->ses->Suid,
io_parms->offset,
io_parms->length,
rc);
- kref_put(&wdata->refcount, release);
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
}
async_writev_out:
cifs_small_buf_release(req);
- return rc;
+out:
+ if (rc) {
+ add_credits_and_wake_if(wdata->server, &wdata->credits, 0);
+ cifs_write_subrequest_terminated(wdata, rc, true);
+ }
}
/*
diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h
index c72a3b2886b7..2fccf0d4f53d 100644
--- a/fs/smb/client/smb2pdu.h
+++ b/fs/smb/client/smb2pdu.h
@@ -320,7 +320,7 @@ struct smb2_file_reparse_point_info {
} __packed;
struct smb2_file_network_open_info {
- struct_group(network_open_info,
+ struct_group_attr(network_open_info, __packed,
__le64 CreationTime;
__le64 LastAccessTime;
__le64 LastWriteTime;
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 732169d8a67a..b208232b12a2 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -210,11 +210,10 @@ extern int SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon,
extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
__le64 *uniqueid);
-extern int smb2_async_readv(struct cifs_readdata *rdata);
+extern int smb2_async_readv(struct cifs_io_subrequest *rdata);
extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
unsigned int *nbytes, char **buf, int *buf_type);
-extern int smb2_async_writev(struct cifs_writedata *wdata,
- void (*release)(struct kref *kref));
+extern void smb2_async_writev(struct cifs_io_subrequest *wdata);
extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
unsigned int *nbytes, struct kvec *iov, int n_vec);
extern int SMB2_echo(struct TCP_Server_Info *server);
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 1d6e54f7879e..02135a605305 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -189,6 +189,8 @@ smb2_find_smb_sess_tcon_unlocked(struct cifs_ses *ses, __u32 tid)
if (tcon->tid != tid)
continue;
++tcon->tc_count;
+ trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count,
+ netfs_trace_tcon_ref_get_find_sess_tcon);
return tcon;
}
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 5e83cb9da902..af97389e983e 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -3,6 +3,9 @@
* Copyright (C) 2018, Microsoft Corporation.
*
* Author(s): Steve French <stfrench@microsoft.com>
+ *
+ * Please use this 3-part article as a reference for writing new tracepoints:
+ * https://lwn.net/Articles/379903/
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM cifs
@@ -15,12 +18,129 @@
#include <linux/inet.h>
/*
- * Please use this 3-part article as a reference for writing new tracepoints:
- * https://lwn.net/Articles/379903/
+ * Specify enums for tracing information.
*/
+#define smb3_tcon_ref_traces \
+ EM(netfs_trace_tcon_ref_dec_dfs_refer, "DEC DfsRef") \
+ EM(netfs_trace_tcon_ref_free, "FRE ") \
+ EM(netfs_trace_tcon_ref_free_fail, "FRE Fail ") \
+ EM(netfs_trace_tcon_ref_free_ipc, "FRE Ipc ") \
+ EM(netfs_trace_tcon_ref_free_ipc_fail, "FRE Ipc-F ") \
+ EM(netfs_trace_tcon_ref_free_reconnect_server, "FRE Reconn") \
+ EM(netfs_trace_tcon_ref_get_cancelled_close, "GET Cn-Cls") \
+ EM(netfs_trace_tcon_ref_get_dfs_refer, "GET DfsRef") \
+ EM(netfs_trace_tcon_ref_get_find, "GET Find ") \
+ EM(netfs_trace_tcon_ref_get_find_sess_tcon, "GET FndSes") \
+ EM(netfs_trace_tcon_ref_get_reconnect_server, "GET Reconn") \
+ EM(netfs_trace_tcon_ref_new, "NEW ") \
+ EM(netfs_trace_tcon_ref_new_ipc, "NEW Ipc ") \
+ EM(netfs_trace_tcon_ref_new_reconnect_server, "NEW Reconn") \
+ EM(netfs_trace_tcon_ref_put_cancelled_close, "PUT Cn-Cls") \
+ EM(netfs_trace_tcon_ref_put_cancelled_close_fid, "PUT Cn-Fid") \
+ EM(netfs_trace_tcon_ref_put_cancelled_mid, "PUT Cn-Mid") \
+ EM(netfs_trace_tcon_ref_put_mnt_ctx, "PUT MntCtx") \
+ EM(netfs_trace_tcon_ref_put_reconnect_server, "PUT Reconn") \
+ EM(netfs_trace_tcon_ref_put_tlink, "PUT Tlink ") \
+ EM(netfs_trace_tcon_ref_see_cancelled_close, "SEE Cn-Cls") \
+ EM(netfs_trace_tcon_ref_see_fscache_collision, "SEE FV-CO!") \
+ EM(netfs_trace_tcon_ref_see_fscache_okay, "SEE FV-Ok ") \
+ EM(netfs_trace_tcon_ref_see_fscache_relinq, "SEE FV-Rlq") \
+ E_(netfs_trace_tcon_ref_see_umount, "SEE Umount")
+
+#undef EM
+#undef E_
+
+/*
+ * Define those tracing enums.
+ */
+#ifndef __SMB3_DECLARE_TRACE_ENUMS_ONCE_ONLY
+#define __SMB3_DECLARE_TRACE_ENUMS_ONCE_ONLY
+
+#define EM(a, b) a,
+#define E_(a, b) a
+
+enum smb3_tcon_ref_trace { smb3_tcon_ref_traces } __mode(byte);
+
+#undef EM
+#undef E_
+#endif
+
+/*
+ * Export enum symbols via userspace.
+ */
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define E_(a, b) TRACE_DEFINE_ENUM(a);
+
+smb3_tcon_ref_traces;
+
+#undef EM
+#undef E_
+
+/*
+ * Now redefine the EM() and E_() macros to map the enums to the strings that
+ * will be printed in the output.
+ */
+#define EM(a, b) { a, b },
+#define E_(a, b) { a, b }
/* For logging errors in read or write */
DECLARE_EVENT_CLASS(smb3_rw_err_class,
+ TP_PROTO(unsigned int rreq_debug_id,
+ unsigned int rreq_debug_index,
+ unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u64 offset,
+ __u32 len,
+ int rc),
+ TP_ARGS(rreq_debug_id, rreq_debug_index,
+ xid, fid, tid, sesid, offset, len, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, rreq_debug_id)
+ __field(unsigned int, rreq_debug_index)
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, offset)
+ __field(__u32, len)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->rreq_debug_id = rreq_debug_id;
+ __entry->rreq_debug_index = rreq_debug_index;
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->offset = offset;
+ __entry->len = len;
+ __entry->rc = rc;
+ ),
+ TP_printk("\tR=%08x[%x] xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d",
+ __entry->rreq_debug_id, __entry->rreq_debug_index,
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->offset, __entry->len, __entry->rc)
+)
+
+#define DEFINE_SMB3_RW_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
+ TP_PROTO(unsigned int rreq_debug_id, \
+ unsigned int rreq_debug_index, \
+ unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 offset, \
+ __u32 len, \
+ int rc), \
+ TP_ARGS(rreq_debug_id, rreq_debug_index, xid, fid, tid, sesid, offset, len, rc))
+
+DEFINE_SMB3_RW_ERR_EVENT(read_err);
+
+/* For logging errors in other file I/O ops */
+DECLARE_EVENT_CLASS(smb3_other_err_class,
TP_PROTO(unsigned int xid,
__u64 fid,
__u32 tid,
@@ -52,8 +172,8 @@ DECLARE_EVENT_CLASS(smb3_rw_err_class,
__entry->offset, __entry->len, __entry->rc)
)
-#define DEFINE_SMB3_RW_ERR_EVENT(name) \
-DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
+#define DEFINE_SMB3_OTHER_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_other_err_class, smb3_##name, \
TP_PROTO(unsigned int xid, \
__u64 fid, \
__u32 tid, \
@@ -63,15 +183,67 @@ DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
int rc), \
TP_ARGS(xid, fid, tid, sesid, offset, len, rc))
-DEFINE_SMB3_RW_ERR_EVENT(write_err);
-DEFINE_SMB3_RW_ERR_EVENT(read_err);
-DEFINE_SMB3_RW_ERR_EVENT(query_dir_err);
-DEFINE_SMB3_RW_ERR_EVENT(zero_err);
-DEFINE_SMB3_RW_ERR_EVENT(falloc_err);
+DEFINE_SMB3_OTHER_ERR_EVENT(write_err);
+DEFINE_SMB3_OTHER_ERR_EVENT(query_dir_err);
+DEFINE_SMB3_OTHER_ERR_EVENT(zero_err);
+DEFINE_SMB3_OTHER_ERR_EVENT(falloc_err);
/* For logging successful read or write */
DECLARE_EVENT_CLASS(smb3_rw_done_class,
+ TP_PROTO(unsigned int rreq_debug_id,
+ unsigned int rreq_debug_index,
+ unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u64 offset,
+ __u32 len),
+ TP_ARGS(rreq_debug_id, rreq_debug_index,
+ xid, fid, tid, sesid, offset, len),
+ TP_STRUCT__entry(
+ __field(unsigned int, rreq_debug_id)
+ __field(unsigned int, rreq_debug_index)
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, offset)
+ __field(__u32, len)
+ ),
+ TP_fast_assign(
+ __entry->rreq_debug_id = rreq_debug_id;
+ __entry->rreq_debug_index = rreq_debug_index;
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->offset = offset;
+ __entry->len = len;
+ ),
+ TP_printk("R=%08x[%x] xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x",
+ __entry->rreq_debug_id, __entry->rreq_debug_index,
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->offset, __entry->len)
+)
+
+#define DEFINE_SMB3_RW_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
+ TP_PROTO(unsigned int rreq_debug_id, \
+ unsigned int rreq_debug_index, \
+ unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 offset, \
+ __u32 len), \
+ TP_ARGS(rreq_debug_id, rreq_debug_index, xid, fid, tid, sesid, offset, len))
+
+DEFINE_SMB3_RW_DONE_EVENT(read_enter);
+DEFINE_SMB3_RW_DONE_EVENT(read_done);
+
+/* For logging successful other op */
+DECLARE_EVENT_CLASS(smb3_other_done_class,
TP_PROTO(unsigned int xid,
__u64 fid,
__u32 tid,
@@ -100,8 +272,8 @@ DECLARE_EVENT_CLASS(smb3_rw_done_class,
__entry->offset, __entry->len)
)
-#define DEFINE_SMB3_RW_DONE_EVENT(name) \
-DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
+#define DEFINE_SMB3_OTHER_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_other_done_class, smb3_##name, \
TP_PROTO(unsigned int xid, \
__u64 fid, \
__u32 tid, \
@@ -110,16 +282,14 @@ DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
__u32 len), \
TP_ARGS(xid, fid, tid, sesid, offset, len))
-DEFINE_SMB3_RW_DONE_EVENT(write_enter);
-DEFINE_SMB3_RW_DONE_EVENT(read_enter);
-DEFINE_SMB3_RW_DONE_EVENT(query_dir_enter);
-DEFINE_SMB3_RW_DONE_EVENT(zero_enter);
-DEFINE_SMB3_RW_DONE_EVENT(falloc_enter);
-DEFINE_SMB3_RW_DONE_EVENT(write_done);
-DEFINE_SMB3_RW_DONE_EVENT(read_done);
-DEFINE_SMB3_RW_DONE_EVENT(query_dir_done);
-DEFINE_SMB3_RW_DONE_EVENT(zero_done);
-DEFINE_SMB3_RW_DONE_EVENT(falloc_done);
+DEFINE_SMB3_OTHER_DONE_EVENT(write_enter);
+DEFINE_SMB3_OTHER_DONE_EVENT(query_dir_enter);
+DEFINE_SMB3_OTHER_DONE_EVENT(zero_enter);
+DEFINE_SMB3_OTHER_DONE_EVENT(falloc_enter);
+DEFINE_SMB3_OTHER_DONE_EVENT(write_done);
+DEFINE_SMB3_OTHER_DONE_EVENT(query_dir_done);
+DEFINE_SMB3_OTHER_DONE_EVENT(zero_done);
+DEFINE_SMB3_OTHER_DONE_EVENT(falloc_done);
/* For logging successful set EOF (truncate) */
DECLARE_EVENT_CLASS(smb3_eof_class,
@@ -1125,6 +1295,30 @@ DEFINE_SMB3_CREDIT_EVENT(waitff_credits);
DEFINE_SMB3_CREDIT_EVENT(overflow_credits);
DEFINE_SMB3_CREDIT_EVENT(set_credits);
+
+TRACE_EVENT(smb3_tcon_ref,
+ TP_PROTO(unsigned int tcon_debug_id, int ref,
+ enum smb3_tcon_ref_trace trace),
+ TP_ARGS(tcon_debug_id, ref, trace),
+ TP_STRUCT__entry(
+ __field(unsigned int, tcon)
+ __field(int, ref)
+ __field(enum smb3_tcon_ref_trace, trace)
+ ),
+ TP_fast_assign(
+ __entry->tcon = tcon_debug_id;
+ __entry->ref = ref;
+ __entry->trace = trace;
+ ),
+ TP_printk("TC=%08x %s r=%u",
+ __entry->tcon,
+ __print_symbolic(__entry->trace, smb3_tcon_ref_traces),
+ __entry->ref)
+ );
+
+
+#undef EM
+#undef E_
#endif /* _CIFS_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 994d70193432..012b9bd06995 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -691,8 +691,8 @@ wait_for_compound_request(struct TCP_Server_Info *server, int num,
}
int
-cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
- unsigned int *num, struct cifs_credits *credits)
+cifs_wait_mtu_credits(struct TCP_Server_Info *server, size_t size,
+ size_t *num, struct cifs_credits *credits)
{
*num = size;
credits->value = 0;
@@ -909,12 +909,15 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
list_del_init(&mid->qhead);
mid->mid_flags |= MID_DELETED;
}
+ spin_unlock(&server->mid_lock);
cifs_server_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n",
__func__, mid->mid, mid->mid_state);
rc = -EIO;
+ goto sync_mid_done;
}
spin_unlock(&server->mid_lock);
+sync_mid_done:
release_mid(mid);
return rc;
}
@@ -1057,9 +1060,11 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
index = (uint)atomic_inc_return(&ses->chan_seq);
index %= ses->chan_count;
}
+
+ server = ses->chans[index].server;
spin_unlock(&ses->chan_lock);
- return ses->chans[index].server;
+ return server;
}
int
@@ -1687,7 +1692,7 @@ __cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid,
static int
cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
- struct cifs_readdata *rdata = mid->callback_data;
+ struct cifs_io_subrequest *rdata = mid->callback_data;
return __cifs_readv_discard(server, mid, rdata->result);
}
@@ -1697,13 +1702,13 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
int length, len;
unsigned int data_offset, data_len;
- struct cifs_readdata *rdata = mid->callback_data;
+ struct cifs_io_subrequest *rdata = mid->callback_data;
char *buf = server->smallbuf;
unsigned int buflen = server->pdu_size + HEADER_PREAMBLE_SIZE(server);
bool use_rdma_mr = false;
- cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n",
- __func__, mid->mid, rdata->offset, rdata->bytes);
+ cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%zu\n",
+ __func__, mid->mid, rdata->subreq.start, rdata->subreq.len);
/*
* read the rest of READ_RSP header (sans Data array), or whatever we
@@ -1808,8 +1813,11 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
length = data_len; /* An RDMA read is already done. */
else
#endif
- length = cifs_read_iter_from_socket(server, &rdata->iter,
+ {
+ length = cifs_read_iter_from_socket(server, &rdata->subreq.io_iter,
data_len);
+ iov_iter_revert(&rdata->subreq.io_iter, data_len);
+ }
if (length > 0)
rdata->got_bytes += length;
server->total_read += length;
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index 1b594307c9d5..202ff9128156 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -711,7 +711,7 @@ struct smb2_close_rsp {
__le16 StructureSize; /* 60 */
__le16 Flags;
__le32 Reserved;
- struct_group(network_open_info,
+ struct_group_attr(network_open_info, __packed,
__le64 CreationTime;
__le64 LastAccessTime;
__le64 LastWriteTime;
diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h
index 686b321c5a8b..f4e55199938d 100644
--- a/fs/smb/server/ksmbd_netlink.h
+++ b/fs/smb/server/ksmbd_netlink.h
@@ -340,23 +340,24 @@ enum KSMBD_TREE_CONN_STATUS {
/*
* Share config flags.
*/
-#define KSMBD_SHARE_FLAG_INVALID (0)
-#define KSMBD_SHARE_FLAG_AVAILABLE BIT(0)
-#define KSMBD_SHARE_FLAG_BROWSEABLE BIT(1)
-#define KSMBD_SHARE_FLAG_WRITEABLE BIT(2)
-#define KSMBD_SHARE_FLAG_READONLY BIT(3)
-#define KSMBD_SHARE_FLAG_GUEST_OK BIT(4)
-#define KSMBD_SHARE_FLAG_GUEST_ONLY BIT(5)
-#define KSMBD_SHARE_FLAG_STORE_DOS_ATTRS BIT(6)
-#define KSMBD_SHARE_FLAG_OPLOCKS BIT(7)
-#define KSMBD_SHARE_FLAG_PIPE BIT(8)
-#define KSMBD_SHARE_FLAG_HIDE_DOT_FILES BIT(9)
-#define KSMBD_SHARE_FLAG_INHERIT_OWNER BIT(10)
-#define KSMBD_SHARE_FLAG_STREAMS BIT(11)
-#define KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS BIT(12)
-#define KSMBD_SHARE_FLAG_ACL_XATTR BIT(13)
-#define KSMBD_SHARE_FLAG_UPDATE BIT(14)
-#define KSMBD_SHARE_FLAG_CROSSMNT BIT(15)
+#define KSMBD_SHARE_FLAG_INVALID (0)
+#define KSMBD_SHARE_FLAG_AVAILABLE BIT(0)
+#define KSMBD_SHARE_FLAG_BROWSEABLE BIT(1)
+#define KSMBD_SHARE_FLAG_WRITEABLE BIT(2)
+#define KSMBD_SHARE_FLAG_READONLY BIT(3)
+#define KSMBD_SHARE_FLAG_GUEST_OK BIT(4)
+#define KSMBD_SHARE_FLAG_GUEST_ONLY BIT(5)
+#define KSMBD_SHARE_FLAG_STORE_DOS_ATTRS BIT(6)
+#define KSMBD_SHARE_FLAG_OPLOCKS BIT(7)
+#define KSMBD_SHARE_FLAG_PIPE BIT(8)
+#define KSMBD_SHARE_FLAG_HIDE_DOT_FILES BIT(9)
+#define KSMBD_SHARE_FLAG_INHERIT_OWNER BIT(10)
+#define KSMBD_SHARE_FLAG_STREAMS BIT(11)
+#define KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS BIT(12)
+#define KSMBD_SHARE_FLAG_ACL_XATTR BIT(13)
+#define KSMBD_SHARE_FLAG_UPDATE BIT(14)
+#define KSMBD_SHARE_FLAG_CROSSMNT BIT(15)
+#define KSMBD_SHARE_FLAG_CONTINUOUS_AVAILABILITY BIT(16)
/*
* Tree connect request flags.
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 4978edfb15f9..b9d9116fc2b3 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -207,9 +207,9 @@ static void opinfo_add(struct oplock_info *opinfo)
{
struct ksmbd_inode *ci = opinfo->o_fp->f_ci;
- write_lock(&ci->m_lock);
+ down_write(&ci->m_lock);
list_add_rcu(&opinfo->op_entry, &ci->m_op_list);
- write_unlock(&ci->m_lock);
+ up_write(&ci->m_lock);
}
static void opinfo_del(struct oplock_info *opinfo)
@@ -221,9 +221,9 @@ static void opinfo_del(struct oplock_info *opinfo)
lease_del_list(opinfo);
write_unlock(&lease_list_lock);
}
- write_lock(&ci->m_lock);
+ down_write(&ci->m_lock);
list_del_rcu(&opinfo->op_entry);
- write_unlock(&ci->m_lock);
+ up_write(&ci->m_lock);
}
static unsigned long opinfo_count(struct ksmbd_file *fp)
@@ -526,21 +526,18 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci,
* Compare lease key and client_guid to know request from same owner
* of same client
*/
- read_lock(&ci->m_lock);
+ down_read(&ci->m_lock);
list_for_each_entry(opinfo, &ci->m_op_list, op_entry) {
if (!opinfo->is_lease || !opinfo->conn)
continue;
- read_unlock(&ci->m_lock);
lease = opinfo->o_lease;
ret = compare_guid_key(opinfo, client_guid, lctx->lease_key);
if (ret) {
m_opinfo = opinfo;
/* skip upgrading lease about breaking lease */
- if (atomic_read(&opinfo->breaking_cnt)) {
- read_lock(&ci->m_lock);
+ if (atomic_read(&opinfo->breaking_cnt))
continue;
- }
/* upgrading lease */
if ((atomic_read(&ci->op_count) +
@@ -570,9 +567,8 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci,
lease_none_upgrade(opinfo, lctx->req_state);
}
}
- read_lock(&ci->m_lock);
}
- read_unlock(&ci->m_lock);
+ up_read(&ci->m_lock);
return m_opinfo;
}
@@ -613,13 +609,23 @@ static int oplock_break_pending(struct oplock_info *opinfo, int req_op_level)
if (opinfo->op_state == OPLOCK_CLOSING)
return -ENOENT;
- else if (!opinfo->is_lease && opinfo->level <= req_op_level)
- return 1;
+ else if (opinfo->level <= req_op_level) {
+ if (opinfo->is_lease &&
+ opinfo->o_lease->state !=
+ (SMB2_LEASE_HANDLE_CACHING_LE |
+ SMB2_LEASE_READ_CACHING_LE))
+ return 1;
+ }
}
- if (!opinfo->is_lease && opinfo->level <= req_op_level) {
- wake_up_oplock_break(opinfo);
- return 1;
+ if (opinfo->level <= req_op_level) {
+ if (opinfo->is_lease &&
+ opinfo->o_lease->state !=
+ (SMB2_LEASE_HANDLE_CACHING_LE |
+ SMB2_LEASE_READ_CACHING_LE)) {
+ wake_up_oplock_break(opinfo);
+ return 1;
+ }
}
return 0;
}
@@ -887,7 +893,6 @@ static int oplock_break(struct oplock_info *brk_opinfo, int req_op_level)
struct lease *lease = brk_opinfo->o_lease;
atomic_inc(&brk_opinfo->breaking_cnt);
-
err = oplock_break_pending(brk_opinfo, req_op_level);
if (err)
return err < 0 ? err : 0;
@@ -1105,7 +1110,7 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
if (!p_ci)
return;
- read_lock(&p_ci->m_lock);
+ down_read(&p_ci->m_lock);
list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) {
if (opinfo->conn == NULL || !opinfo->is_lease)
continue;
@@ -1123,13 +1128,11 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
continue;
}
- read_unlock(&p_ci->m_lock);
oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE);
opinfo_conn_put(opinfo);
- read_lock(&p_ci->m_lock);
}
}
- read_unlock(&p_ci->m_lock);
+ up_read(&p_ci->m_lock);
ksmbd_inode_put(p_ci);
}
@@ -1150,7 +1153,7 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp)
if (!p_ci)
return;
- read_lock(&p_ci->m_lock);
+ down_read(&p_ci->m_lock);
list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) {
if (opinfo->conn == NULL || !opinfo->is_lease)
continue;
@@ -1164,13 +1167,11 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp)
atomic_dec(&opinfo->conn->r_count);
continue;
}
- read_unlock(&p_ci->m_lock);
oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE);
opinfo_conn_put(opinfo);
- read_lock(&p_ci->m_lock);
}
}
- read_unlock(&p_ci->m_lock);
+ up_read(&p_ci->m_lock);
ksmbd_inode_put(p_ci);
}
@@ -1200,7 +1201,9 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid,
/* Only v2 leases handle the directory */
if (S_ISDIR(file_inode(fp->filp)->i_mode)) {
- if (!lctx || lctx->version != 2)
+ if (!lctx || lctx->version != 2 ||
+ (lctx->flags != SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE &&
+ !lctx->epoch))
return 0;
}
@@ -1465,8 +1468,9 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
buf->lcontext.LeaseFlags = lease->flags;
buf->lcontext.Epoch = cpu_to_le16(lease->epoch);
buf->lcontext.LeaseState = lease->state;
- memcpy(buf->lcontext.ParentLeaseKey, lease->parent_lease_key,
- SMB2_LEASE_KEY_SIZE);
+ if (lease->flags == SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE)
+ memcpy(buf->lcontext.ParentLeaseKey, lease->parent_lease_key,
+ SMB2_LEASE_KEY_SIZE);
buf->ccontext.DataOffset = cpu_to_le16(offsetof
(struct create_lease_v2, lcontext));
buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context_v2));
@@ -1525,8 +1529,9 @@ struct lease_ctx_info *parse_lease_state(void *open_req)
lreq->flags = lc->lcontext.LeaseFlags;
lreq->epoch = lc->lcontext.Epoch;
lreq->duration = lc->lcontext.LeaseDuration;
- memcpy(lreq->parent_lease_key, lc->lcontext.ParentLeaseKey,
- SMB2_LEASE_KEY_SIZE);
+ if (lreq->flags == SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE)
+ memcpy(lreq->parent_lease_key, lc->lcontext.ParentLeaseKey,
+ SMB2_LEASE_KEY_SIZE);
lreq->version = 2;
} else {
struct create_lease *lc = (struct create_lease *)cc;
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index c0788188aa82..c67fbc8d6683 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -167,20 +167,17 @@ static void __handle_ksmbd_work(struct ksmbd_work *work,
int rc;
bool is_chained = false;
- if (conn->ops->allocate_rsp_buf(work))
- return;
-
if (conn->ops->is_transform_hdr &&
conn->ops->is_transform_hdr(work->request_buf)) {
rc = conn->ops->decrypt_req(work);
- if (rc < 0) {
- conn->ops->set_rsp_status(work, STATUS_DATA_ERROR);
- goto send;
- }
-
+ if (rc < 0)
+ return;
work->encrypted = true;
}
+ if (conn->ops->allocate_rsp_buf(work))
+ return;
+
rc = conn->ops->init_rsp_hdr(work);
if (rc) {
/* either uid or tid is not correct */
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 5723bbf372d7..b6c5a8ea3887 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -535,6 +535,10 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work)
if (cmd == SMB2_QUERY_INFO_HE) {
struct smb2_query_info_req *req;
+ if (get_rfc1002_len(work->request_buf) <
+ offsetof(struct smb2_query_info_req, OutputBufferLength))
+ return -EINVAL;
+
req = smb2_get_msg(work->request_buf);
if ((req->InfoType == SMB2_O_INFO_FILE &&
(req->FileInfoClass == FILE_FULL_EA_INFORMATION ||
@@ -1922,7 +1926,7 @@ int smb2_tree_connect(struct ksmbd_work *work)
struct ksmbd_session *sess = work->sess;
char *treename = NULL, *name = NULL;
struct ksmbd_tree_conn_status status;
- struct ksmbd_share_config *share;
+ struct ksmbd_share_config *share = NULL;
int rc = -EINVAL;
WORK_BUFFERS(work, req, rsp);
@@ -1984,7 +1988,12 @@ int smb2_tree_connect(struct ksmbd_work *work)
write_unlock(&sess->tree_conns_lock);
rsp->StructureSize = cpu_to_le16(16);
out_err1:
- rsp->Capabilities = 0;
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_DURABLE_HANDLE && share &&
+ test_share_config_flag(share,
+ KSMBD_SHARE_FLAG_CONTINUOUS_AVAILABILITY))
+ rsp->Capabilities = SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY;
+ else
+ rsp->Capabilities = 0;
rsp->Reserved = 0;
/* default manual caching */
rsp->ShareFlags = SMB2_SHAREFLAG_MANUAL_CACHING;
@@ -3367,9 +3376,9 @@ int smb2_open(struct ksmbd_work *work)
* after daccess, saccess, attrib_only, and stream are
* initialized.
*/
- write_lock(&fp->f_ci->m_lock);
+ down_write(&fp->f_ci->m_lock);
list_add(&fp->node, &fp->f_ci->m_fp_list);
- write_unlock(&fp->f_ci->m_lock);
+ up_write(&fp->f_ci->m_lock);
/* Check delete pending among previous fp before oplock break */
if (ksmbd_inode_pending_delete(fp)) {
@@ -3498,7 +3507,9 @@ int smb2_open(struct ksmbd_work *work)
memcpy(fp->client_guid, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE);
if (dh_info.type == DURABLE_REQ_V2 || dh_info.type == DURABLE_REQ) {
- if (dh_info.type == DURABLE_REQ_V2 && dh_info.persistent)
+ if (dh_info.type == DURABLE_REQ_V2 && dh_info.persistent &&
+ test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_CONTINUOUS_AVAILABILITY))
fp->is_persistent = true;
else
fp->is_durable = true;
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index fcaf373cc008..474dadf6b7b8 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -646,7 +646,7 @@ int ksmbd_smb_check_shared_mode(struct file *filp, struct ksmbd_file *curr_fp)
* Lookup fp in master fp list, and check desired access and
* shared mode between previous open and current open.
*/
- read_lock(&curr_fp->f_ci->m_lock);
+ down_read(&curr_fp->f_ci->m_lock);
list_for_each_entry(prev_fp, &curr_fp->f_ci->m_fp_list, node) {
if (file_inode(filp) != file_inode(prev_fp->filp))
continue;
@@ -722,7 +722,7 @@ int ksmbd_smb_check_shared_mode(struct file *filp, struct ksmbd_file *curr_fp)
break;
}
}
- read_unlock(&curr_fp->f_ci->m_lock);
+ up_read(&curr_fp->f_ci->m_lock);
return rc;
}
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index 002a3f0dc7c5..6633fa78e9b9 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -448,6 +448,10 @@ static int create_socket(struct interface *iface)
sin6.sin6_family = PF_INET6;
sin6.sin6_addr = in6addr_any;
sin6.sin6_port = htons(server_conf.tcp_port);
+
+ lock_sock(ksmbd_socket->sk);
+ ksmbd_socket->sk->sk_ipv6only = false;
+ release_sock(ksmbd_socket->sk);
}
ksmbd_tcp_nodelay(ksmbd_socket);
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 22f0f3db3ac9..51b1b0bed616 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -754,10 +754,15 @@ retry:
goto out4;
}
+ /*
+ * explicitly handle file overwrite case, for compatibility with
+ * filesystems that may not support rename flags (e.g: fuse)
+ */
if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) {
err = -EEXIST;
goto out4;
}
+ flags &= ~(RENAME_NOREPLACE);
if (old_child == trap) {
err = -EINVAL;
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index 030f70700036..6cb599cd287e 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -165,7 +165,7 @@ static int ksmbd_inode_init(struct ksmbd_inode *ci, struct ksmbd_file *fp)
ci->m_fattr = 0;
INIT_LIST_HEAD(&ci->m_fp_list);
INIT_LIST_HEAD(&ci->m_op_list);
- rwlock_init(&ci->m_lock);
+ init_rwsem(&ci->m_lock);
ci->m_de = fp->filp->f_path.dentry;
return 0;
}
@@ -261,14 +261,14 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp)
}
if (atomic_dec_and_test(&ci->m_count)) {
- write_lock(&ci->m_lock);
+ down_write(&ci->m_lock);
if (ci->m_flags & (S_DEL_ON_CLS | S_DEL_PENDING)) {
ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING);
- write_unlock(&ci->m_lock);
+ up_write(&ci->m_lock);
ksmbd_vfs_unlink(filp);
- write_lock(&ci->m_lock);
+ down_write(&ci->m_lock);
}
- write_unlock(&ci->m_lock);
+ up_write(&ci->m_lock);
ksmbd_inode_free(ci);
}
@@ -289,9 +289,9 @@ static void __ksmbd_remove_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp
if (!has_file_id(fp->volatile_id))
return;
- write_lock(&fp->f_ci->m_lock);
+ down_write(&fp->f_ci->m_lock);
list_del_init(&fp->node);
- write_unlock(&fp->f_ci->m_lock);
+ up_write(&fp->f_ci->m_lock);
write_lock(&ft->lock);
idr_remove(ft->idr, fp->volatile_id);
@@ -523,17 +523,17 @@ struct ksmbd_file *ksmbd_lookup_fd_inode(struct dentry *dentry)
if (!ci)
return NULL;
- read_lock(&ci->m_lock);
+ down_read(&ci->m_lock);
list_for_each_entry(lfp, &ci->m_fp_list, node) {
if (inode == file_inode(lfp->filp)) {
atomic_dec(&ci->m_count);
lfp = ksmbd_fp_get(lfp);
- read_unlock(&ci->m_lock);
+ up_read(&ci->m_lock);
return lfp;
}
}
atomic_dec(&ci->m_count);
- read_unlock(&ci->m_lock);
+ up_read(&ci->m_lock);
return NULL;
}
@@ -705,13 +705,13 @@ static bool session_fd_check(struct ksmbd_tree_connect *tcon,
conn = fp->conn;
ci = fp->f_ci;
- write_lock(&ci->m_lock);
+ down_write(&ci->m_lock);
list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) {
if (op->conn != conn)
continue;
op->conn = NULL;
}
- write_unlock(&ci->m_lock);
+ up_write(&ci->m_lock);
fp->conn = NULL;
fp->tcon = NULL;
@@ -801,13 +801,13 @@ int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp)
fp->tcon = work->tcon;
ci = fp->f_ci;
- write_lock(&ci->m_lock);
+ down_write(&ci->m_lock);
list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) {
if (op->conn)
continue;
op->conn = fp->conn;
}
- write_unlock(&ci->m_lock);
+ up_write(&ci->m_lock);
__open_id(&work->sess->file_table, fp, OPEN_ID_TYPE_VOLATILE_ID);
if (!has_file_id(fp->volatile_id)) {
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index ed44fb4e18e7..5a225e7055f1 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -47,7 +47,7 @@ struct stream {
};
struct ksmbd_inode {
- rwlock_t m_lock;
+ struct rw_semaphore m_lock;
atomic_t m_count;
atomic_t op_count;
/* opinfo count for streams */
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index aa3411354e66..16bd693d0b3a 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -48,6 +48,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
gid_t i_gid;
int err;
+ inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
+ if (inode->i_ino == 0)
+ return -EINVAL;
+
err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid);
if (err)
return err;
@@ -58,7 +62,6 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
i_uid_write(inode, i_uid);
i_gid_write(inode, i_gid);
- inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0);
inode_set_atime(inode, inode_get_mtime_sec(inode), 0);
inode_set_ctime(inode, inode_get_mtime_sec(inode), 0);
diff --git a/fs/stat.c b/fs/stat.c
index 77cdc69eb422..70bd3e888cfa 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -658,6 +658,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_mnt_id = stat->mnt_id;
tmp.stx_dio_mem_align = stat->dio_mem_align;
tmp.stx_dio_offset_align = stat->dio_offset_align;
+ tmp.stx_subvol = stat->subvol;
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 6b7652fb8050..7cd64021d453 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -463,6 +463,8 @@ struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj,
kn = kernfs_find_and_get(kobj->sd, attr->name);
if (kn)
kernfs_break_active_protection(kn);
+ else
+ kobject_put(kobj);
return kn;
}
EXPORT_SYMBOL_GPL(sysfs_break_active_protection);
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index dc067eeb6387..a878cea70f4c 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -37,6 +37,7 @@ static DEFINE_MUTEX(eventfs_mutex);
struct eventfs_root_inode {
struct eventfs_inode ei;
+ struct inode *parent_inode;
struct dentry *events_dir;
};
@@ -68,11 +69,25 @@ enum {
EVENTFS_SAVE_MODE = BIT(16),
EVENTFS_SAVE_UID = BIT(17),
EVENTFS_SAVE_GID = BIT(18),
- EVENTFS_TOPLEVEL = BIT(19),
};
#define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1)
+static void free_ei_rcu(struct rcu_head *rcu)
+{
+ struct eventfs_inode *ei = container_of(rcu, struct eventfs_inode, rcu);
+ struct eventfs_root_inode *rei;
+
+ kfree(ei->entry_attrs);
+ kfree_const(ei->name);
+ if (ei->is_events) {
+ rei = get_root_inode(ei);
+ kfree(rei);
+ } else {
+ kfree(ei);
+ }
+}
+
/*
* eventfs_inode reference count management.
*
@@ -84,18 +99,17 @@ enum {
static void release_ei(struct kref *ref)
{
struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref);
- struct eventfs_root_inode *rei;
+ const struct eventfs_entry *entry;
WARN_ON_ONCE(!ei->is_freed);
- kfree(ei->entry_attrs);
- kfree_const(ei->name);
- if (ei->is_events) {
- rei = get_root_inode(ei);
- kfree_rcu(rei, ei.rcu);
- } else {
- kfree_rcu(ei, rcu);
+ for (int i = 0; i < ei->nr_entries; i++) {
+ entry = &ei->entries[i];
+ if (entry->release)
+ entry->release(entry->name, ei->data);
}
+
+ call_rcu(&ei->rcu, free_ei_rcu);
}
static inline void put_ei(struct eventfs_inode *ei)
@@ -112,6 +126,18 @@ static inline void free_ei(struct eventfs_inode *ei)
}
}
+/*
+ * Called when creation of an ei fails, do not call release() functions.
+ */
+static inline void cleanup_ei(struct eventfs_inode *ei)
+{
+ if (ei) {
+ /* Set nr_entries to 0 to prevent release() function being called */
+ ei->nr_entries = 0;
+ free_ei(ei);
+ }
+}
+
static inline struct eventfs_inode *get_ei(struct eventfs_inode *ei)
{
if (ei)
@@ -181,21 +207,7 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
* determined by the parent directory.
*/
if (dentry->d_inode->i_mode & S_IFDIR) {
- /*
- * The events directory dentry is never freed, unless its
- * part of an instance that is deleted. It's attr is the
- * default for its child files and directories.
- * Do not update it. It's not used for its own mode or ownership.
- */
- if (ei->is_events) {
- /* But it still needs to know if it was modified */
- if (iattr->ia_valid & ATTR_UID)
- ei->attr.mode |= EVENTFS_SAVE_UID;
- if (iattr->ia_valid & ATTR_GID)
- ei->attr.mode |= EVENTFS_SAVE_GID;
- } else {
- update_attr(&ei->attr, iattr);
- }
+ update_attr(&ei->attr, iattr);
} else {
name = dentry->d_name.name;
@@ -213,18 +225,25 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
return ret;
}
-static void update_top_events_attr(struct eventfs_inode *ei, struct super_block *sb)
+static void update_events_attr(struct eventfs_inode *ei, struct super_block *sb)
{
- struct inode *root;
+ struct eventfs_root_inode *rei;
+ struct inode *parent;
- /* Only update if the "events" was on the top level */
- if (!ei || !(ei->attr.mode & EVENTFS_TOPLEVEL))
- return;
+ rei = get_root_inode(ei);
+
+ /* Use the parent inode permissions unless root set its permissions */
+ parent = rei->parent_inode;
- /* Get the tracefs root inode. */
- root = d_inode(sb->s_root);
- ei->attr.uid = root->i_uid;
- ei->attr.gid = root->i_gid;
+ if (rei->ei.attr.mode & EVENTFS_SAVE_UID)
+ ei->attr.uid = rei->ei.attr.uid;
+ else
+ ei->attr.uid = parent->i_uid;
+
+ if (rei->ei.attr.mode & EVENTFS_SAVE_GID)
+ ei->attr.gid = rei->ei.attr.gid;
+ else
+ ei->attr.gid = parent->i_gid;
}
static void set_top_events_ownership(struct inode *inode)
@@ -233,10 +252,10 @@ static void set_top_events_ownership(struct inode *inode)
struct eventfs_inode *ei = ti->private;
/* The top events directory doesn't get automatically updated */
- if (!ei || !ei->is_events || !(ei->attr.mode & EVENTFS_TOPLEVEL))
+ if (!ei || !ei->is_events)
return;
- update_top_events_attr(ei, inode->i_sb);
+ update_events_attr(ei, inode->i_sb);
if (!(ei->attr.mode & EVENTFS_SAVE_UID))
inode->i_uid = ei->attr.uid;
@@ -265,7 +284,7 @@ static int eventfs_permission(struct mnt_idmap *idmap,
return generic_permission(idmap, inode, mask);
}
-static const struct inode_operations eventfs_root_dir_inode_operations = {
+static const struct inode_operations eventfs_dir_inode_operations = {
.lookup = eventfs_root_lookup,
.setattr = eventfs_set_attr,
.getattr = eventfs_get_attr,
@@ -282,6 +301,35 @@ static const struct file_operations eventfs_file_operations = {
.llseek = generic_file_llseek,
};
+/*
+ * On a remount of tracefs, if UID or GID options are set, then
+ * the mount point inode permissions should be used.
+ * Reset the saved permission flags appropriately.
+ */
+void eventfs_remount(struct tracefs_inode *ti, bool update_uid, bool update_gid)
+{
+ struct eventfs_inode *ei = ti->private;
+
+ if (!ei)
+ return;
+
+ if (update_uid)
+ ei->attr.mode &= ~EVENTFS_SAVE_UID;
+
+ if (update_gid)
+ ei->attr.mode &= ~EVENTFS_SAVE_GID;
+
+ if (!ei->entry_attrs)
+ return;
+
+ for (int i = 0; i < ei->nr_entries; i++) {
+ if (update_uid)
+ ei->entry_attrs[i].mode &= ~EVENTFS_SAVE_UID;
+ if (update_gid)
+ ei->entry_attrs[i].mode &= ~EVENTFS_SAVE_GID;
+ }
+}
+
/* Return the evenfs_inode of the "events" directory */
static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
{
@@ -304,7 +352,7 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
// Walk upwards until you find the events inode
} while (!ei->is_events);
- update_top_events_attr(ei, dentry->d_sb);
+ update_events_attr(ei, dentry->d_sb);
return ei;
}
@@ -336,6 +384,7 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode,
/**
* lookup_file - look up a file in the tracefs filesystem
+ * @parent_ei: Pointer to the eventfs_inode that represents parent of the file
* @dentry: the dentry to look up
* @mode: the permission that the file should have.
* @attr: saved attributes changed by user
@@ -389,6 +438,7 @@ static struct dentry *lookup_file(struct eventfs_inode *parent_ei,
/**
* lookup_dir_entry - look up a dir in the tracefs filesystem
* @dentry: the directory to look up
+ * @pei: Pointer to the parent eventfs_inode if available
* @ei: the eventfs_inode that represents the directory to create
*
* This function will look up a dentry for a directory represented by
@@ -408,7 +458,7 @@ static struct dentry *lookup_dir_entry(struct dentry *dentry,
update_inode_attr(dentry, inode, &ei->attr,
S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
- inode->i_op = &eventfs_root_dir_inode_operations;
+ inode->i_op = &eventfs_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
/* All directories will have the same inode number */
@@ -478,16 +528,20 @@ void eventfs_d_release(struct dentry *dentry)
/**
* lookup_file_dentry - create a dentry for a file of an eventfs_inode
+ * @dentry: The parent dentry under which the new file's dentry will be created
* @ei: the eventfs_inode that the file will be created under
* @idx: the index into the entry_attrs[] of the @ei
- * @parent: The parent dentry of the created file.
- * @name: The name of the file to create
* @mode: The mode of the file.
* @data: The data to use to set the inode of the file with on open()
* @fops: The fops of the file to be created.
*
- * Create a dentry for a file of an eventfs_inode @ei and place it into the
- * address located at @e_dentry.
+ * This function creates a dentry for a file associated with an
+ * eventfs_inode @ei. It uses the entry attributes specified by @idx,
+ * if available. The file will have the specified @mode and its inode will be
+ * set up with @data upon open. The file operations will be set to @fops.
+ *
+ * Return: Returns a pointer to the newly created file's dentry or an error
+ * pointer.
*/
static struct dentry *
lookup_file_dentry(struct dentry *dentry,
@@ -728,7 +782,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
/* Was the parent freed? */
if (list_empty(&ei->list)) {
- free_ei(ei);
+ cleanup_ei(ei);
ei = NULL;
}
return ei;
@@ -775,6 +829,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
// Note: we have a ref to the dentry from tracefs_start_creating()
rei = get_root_inode(ei);
rei->events_dir = dentry;
+ rei->parent_inode = d_inode(dentry->d_sb->s_root);
ei->entries = entries;
ei->nr_entries = size;
@@ -784,29 +839,26 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
uid = d_inode(dentry->d_parent)->i_uid;
gid = d_inode(dentry->d_parent)->i_gid;
- /*
- * If the events directory is of the top instance, then parent
- * is NULL. Set the attr.mode to reflect this and its permissions will
- * default to the tracefs root dentry.
- */
- if (!parent)
- ei->attr.mode = EVENTFS_TOPLEVEL;
-
- /* This is used as the default ownership of the files and directories */
ei->attr.uid = uid;
ei->attr.gid = gid;
+ /*
+ * When the "events" directory is created, it takes on the
+ * permissions of its parent. But can be reset on remount.
+ */
+ ei->attr.mode |= EVENTFS_SAVE_UID | EVENTFS_SAVE_GID;
+
INIT_LIST_HEAD(&ei->children);
INIT_LIST_HEAD(&ei->list);
ti = get_tracefs(inode);
- ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE;
+ ti->flags |= TRACEFS_EVENT_INODE;
ti->private = ei;
inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
inode->i_uid = uid;
inode->i_gid = gid;
- inode->i_op = &eventfs_root_dir_inode_operations;
+ inode->i_op = &eventfs_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
dentry->d_fsdata = get_ei(ei);
@@ -829,7 +881,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
return ei;
fail:
- free_ei(ei);
+ cleanup_ei(ei);
tracefs_failed_creating(dentry);
return ERR_PTR(-ENOMEM);
}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 5545e6bf7d26..a827f6a716c4 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -11,14 +11,14 @@
#include <linux/module.h>
#include <linux/fs.h>
-#include <linux/mount.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/kobject.h>
#include <linux/namei.h>
#include <linux/tracefs.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/seq_file.h>
-#include <linux/parser.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include "internal.h"
@@ -30,20 +30,47 @@ static struct vfsmount *tracefs_mount;
static int tracefs_mount_count;
static bool tracefs_registered;
+/*
+ * Keep track of all tracefs_inodes in order to update their
+ * flags if necessary on a remount.
+ */
+static DEFINE_SPINLOCK(tracefs_inode_lock);
+static LIST_HEAD(tracefs_inodes);
+
static struct inode *tracefs_alloc_inode(struct super_block *sb)
{
struct tracefs_inode *ti;
+ unsigned long flags;
ti = kmem_cache_alloc(tracefs_inode_cachep, GFP_KERNEL);
if (!ti)
return NULL;
+ spin_lock_irqsave(&tracefs_inode_lock, flags);
+ list_add_rcu(&ti->list, &tracefs_inodes);
+ spin_unlock_irqrestore(&tracefs_inode_lock, flags);
+
return &ti->vfs_inode;
}
+static void tracefs_free_inode_rcu(struct rcu_head *rcu)
+{
+ struct tracefs_inode *ti;
+
+ ti = container_of(rcu, struct tracefs_inode, rcu);
+ kmem_cache_free(tracefs_inode_cachep, ti);
+}
+
static void tracefs_free_inode(struct inode *inode)
{
- kmem_cache_free(tracefs_inode_cachep, get_tracefs(inode));
+ struct tracefs_inode *ti = get_tracefs(inode);
+ unsigned long flags;
+
+ spin_lock_irqsave(&tracefs_inode_lock, flags);
+ list_del_rcu(&ti->list);
+ spin_unlock_irqrestore(&tracefs_inode_lock, flags);
+
+ call_rcu(&ti->rcu, tracefs_free_inode_rcu);
}
static ssize_t default_read_file(struct file *file, char __user *buf,
@@ -153,16 +180,39 @@ static void set_tracefs_inode_owner(struct inode *inode)
{
struct tracefs_inode *ti = get_tracefs(inode);
struct inode *root_inode = ti->private;
+ kuid_t uid;
+ kgid_t gid;
+
+ uid = root_inode->i_uid;
+ gid = root_inode->i_gid;
+
+ /*
+ * If the root is not the mount point, then check the root's
+ * permissions. If it was never set, then default to the
+ * mount point.
+ */
+ if (root_inode != d_inode(root_inode->i_sb->s_root)) {
+ struct tracefs_inode *rti;
+
+ rti = get_tracefs(root_inode);
+ root_inode = d_inode(root_inode->i_sb->s_root);
+
+ if (!(rti->flags & TRACEFS_UID_PERM_SET))
+ uid = root_inode->i_uid;
+
+ if (!(rti->flags & TRACEFS_GID_PERM_SET))
+ gid = root_inode->i_gid;
+ }
/*
* If this inode has never been referenced, then update
* the permissions to the superblock.
*/
if (!(ti->flags & TRACEFS_UID_PERM_SET))
- inode->i_uid = root_inode->i_uid;
+ inode->i_uid = uid;
if (!(ti->flags & TRACEFS_GID_PERM_SET))
- inode->i_gid = root_inode->i_gid;
+ inode->i_gid = gid;
}
static int tracefs_permission(struct mnt_idmap *idmap,
@@ -231,7 +281,7 @@ struct inode *tracefs_get_inode(struct super_block *sb)
return inode;
}
-struct tracefs_mount_opts {
+struct tracefs_fs_info {
kuid_t uid;
kgid_t gid;
umode_t mode;
@@ -243,68 +293,51 @@ enum {
Opt_uid,
Opt_gid,
Opt_mode,
- Opt_err
};
-static const match_table_t tokens = {
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_mode, "mode=%o"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec tracefs_param_specs[] = {
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_u32oct ("mode", Opt_mode),
+ fsparam_u32 ("uid", Opt_uid),
+ {}
};
-struct tracefs_fs_info {
- struct tracefs_mount_opts mount_opts;
-};
-
-static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
+static int tracefs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- substring_t args[MAX_OPT_ARGS];
- int option;
- int token;
+ struct tracefs_fs_info *opts = fc->s_fs_info;
+ struct fs_parse_result result;
kuid_t uid;
kgid_t gid;
- char *p;
-
- opts->opts = 0;
- opts->mode = TRACEFS_DEFAULT_MODE;
-
- while ((p = strsep(&data, ",")) != NULL) {
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(uid))
- return -EINVAL;
- opts->uid = uid;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(gid))
- return -EINVAL;
- opts->gid = gid;
- break;
- case Opt_mode:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->mode = option & S_IALLUGO;
- break;
- /*
- * We might like to report bad mount options here;
- * but traditionally tracefs has ignored all mount options
- */
- }
-
- opts->opts |= BIT(token);
+ int opt;
+
+ opt = fs_parse(fc, tracefs_param_specs, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ return invalf(fc, "Unknown uid");
+ opts->uid = uid;
+ break;
+ case Opt_gid:
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ return invalf(fc, "Unknown gid");
+ opts->gid = gid;
+ break;
+ case Opt_mode:
+ opts->mode = result.uint_32 & S_IALLUGO;
+ break;
+ /*
+ * We might like to report bad mount options here;
+ * but traditionally tracefs has ignored all mount options
+ */
}
+ opts->opts |= BIT(opt);
+
return 0;
}
@@ -312,7 +345,8 @@ static int tracefs_apply_options(struct super_block *sb, bool remount)
{
struct tracefs_fs_info *fsi = sb->s_fs_info;
struct inode *inode = d_inode(sb->s_root);
- struct tracefs_mount_opts *opts = &fsi->mount_opts;
+ struct tracefs_inode *ti;
+ bool update_uid, update_gid;
umode_t tmp_mode;
/*
@@ -320,50 +354,65 @@ static int tracefs_apply_options(struct super_block *sb, bool remount)
* options.
*/
- if (!remount || opts->opts & BIT(Opt_mode)) {
+ if (!remount || fsi->opts & BIT(Opt_mode)) {
tmp_mode = READ_ONCE(inode->i_mode) & ~S_IALLUGO;
- tmp_mode |= opts->mode;
+ tmp_mode |= fsi->mode;
WRITE_ONCE(inode->i_mode, tmp_mode);
}
- if (!remount || opts->opts & BIT(Opt_uid))
- inode->i_uid = opts->uid;
+ if (!remount || fsi->opts & BIT(Opt_uid))
+ inode->i_uid = fsi->uid;
+
+ if (!remount || fsi->opts & BIT(Opt_gid))
+ inode->i_gid = fsi->gid;
+
+ if (remount && (fsi->opts & BIT(Opt_uid) || fsi->opts & BIT(Opt_gid))) {
- if (!remount || opts->opts & BIT(Opt_gid))
- inode->i_gid = opts->gid;
+ update_uid = fsi->opts & BIT(Opt_uid);
+ update_gid = fsi->opts & BIT(Opt_gid);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ti, &tracefs_inodes, list) {
+ if (update_uid)
+ ti->flags &= ~TRACEFS_UID_PERM_SET;
+
+ if (update_gid)
+ ti->flags &= ~TRACEFS_GID_PERM_SET;
+
+ if (ti->flags & TRACEFS_EVENT_INODE)
+ eventfs_remount(ti, update_uid, update_gid);
+ }
+ rcu_read_unlock();
+ }
return 0;
}
-static int tracefs_remount(struct super_block *sb, int *flags, char *data)
+static int tracefs_reconfigure(struct fs_context *fc)
{
- int err;
- struct tracefs_fs_info *fsi = sb->s_fs_info;
+ struct super_block *sb = fc->root->d_sb;
+ struct tracefs_fs_info *sb_opts = sb->s_fs_info;
+ struct tracefs_fs_info *new_opts = fc->s_fs_info;
sync_filesystem(sb);
- err = tracefs_parse_options(data, &fsi->mount_opts);
- if (err)
- goto fail;
-
- tracefs_apply_options(sb, true);
+ /* structure copy of new mount options to sb */
+ *sb_opts = *new_opts;
-fail:
- return err;
+ return tracefs_apply_options(sb, true);
}
static int tracefs_show_options(struct seq_file *m, struct dentry *root)
{
struct tracefs_fs_info *fsi = root->d_sb->s_fs_info;
- struct tracefs_mount_opts *opts = &fsi->mount_opts;
- if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
+ if (!uid_eq(fsi->uid, GLOBAL_ROOT_UID))
seq_printf(m, ",uid=%u",
- from_kuid_munged(&init_user_ns, opts->uid));
- if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
+ from_kuid_munged(&init_user_ns, fsi->uid));
+ if (!gid_eq(fsi->gid, GLOBAL_ROOT_GID))
seq_printf(m, ",gid=%u",
- from_kgid_munged(&init_user_ns, opts->gid));
- if (opts->mode != TRACEFS_DEFAULT_MODE)
- seq_printf(m, ",mode=%o", opts->mode);
+ from_kgid_munged(&init_user_ns, fsi->gid));
+ if (fsi->mode != TRACEFS_DEFAULT_MODE)
+ seq_printf(m, ",mode=%o", fsi->mode);
return 0;
}
@@ -373,7 +422,6 @@ static const struct super_operations tracefs_super_operations = {
.free_inode = tracefs_free_inode,
.drop_inode = generic_delete_inode,
.statfs = simple_statfs,
- .remount_fs = tracefs_remount,
.show_options = tracefs_show_options,
};
@@ -398,31 +446,34 @@ static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags)
return !(ei && ei->is_freed);
}
+static void tracefs_d_iput(struct dentry *dentry, struct inode *inode)
+{
+ struct tracefs_inode *ti = get_tracefs(inode);
+
+ /*
+ * This inode is being freed and cannot be used for
+ * eventfs. Clear the flag so that it doesn't call into
+ * eventfs during the remount flag updates. The eventfs_inode
+ * gets freed after an RCU cycle, so the content will still
+ * be safe if the iteration is going on now.
+ */
+ ti->flags &= ~TRACEFS_EVENT_INODE;
+}
+
static const struct dentry_operations tracefs_dentry_operations = {
+ .d_iput = tracefs_d_iput,
.d_revalidate = tracefs_d_revalidate,
.d_release = tracefs_d_release,
};
-static int trace_fill_super(struct super_block *sb, void *data, int silent)
+static int tracefs_fill_super(struct super_block *sb, struct fs_context *fc)
{
static const struct tree_descr trace_files[] = {{""}};
- struct tracefs_fs_info *fsi;
int err;
- fsi = kzalloc(sizeof(struct tracefs_fs_info), GFP_KERNEL);
- sb->s_fs_info = fsi;
- if (!fsi) {
- err = -ENOMEM;
- goto fail;
- }
-
- err = tracefs_parse_options(data, &fsi->mount_opts);
+ err = simple_fill_super(sb, TRACEFS_MAGIC, trace_files);
if (err)
- goto fail;
-
- err = simple_fill_super(sb, TRACEFS_MAGIC, trace_files);
- if (err)
- goto fail;
+ return err;
sb->s_op = &tracefs_super_operations;
sb->s_d_op = &tracefs_dentry_operations;
@@ -430,24 +481,45 @@ static int trace_fill_super(struct super_block *sb, void *data, int silent)
tracefs_apply_options(sb, false);
return 0;
+}
-fail:
- kfree(fsi);
- sb->s_fs_info = NULL;
- return err;
+static int tracefs_get_tree(struct fs_context *fc)
+{
+ return get_tree_single(fc, tracefs_fill_super);
+}
+
+static void tracefs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
}
-static struct dentry *trace_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name,
- void *data)
+static const struct fs_context_operations tracefs_context_ops = {
+ .free = tracefs_free_fc,
+ .parse_param = tracefs_parse_param,
+ .get_tree = tracefs_get_tree,
+ .reconfigure = tracefs_reconfigure,
+};
+
+static int tracefs_init_fs_context(struct fs_context *fc)
{
- return mount_single(fs_type, flags, data, trace_fill_super);
+ struct tracefs_fs_info *fsi;
+
+ fsi = kzalloc(sizeof(struct tracefs_fs_info), GFP_KERNEL);
+ if (!fsi)
+ return -ENOMEM;
+
+ fsi->mode = TRACEFS_DEFAULT_MODE;
+
+ fc->s_fs_info = fsi;
+ fc->ops = &tracefs_context_ops;
+ return 0;
}
static struct file_system_type trace_fs_type = {
.owner = THIS_MODULE,
.name = "tracefs",
- .mount = trace_mount,
+ .init_fs_context = tracefs_init_fs_context,
+ .parameters = tracefs_param_specs,
.kill_sb = kill_litter_super,
};
MODULE_ALIAS_FS("tracefs");
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 15c26f9aaad4..f704d8348357 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -4,15 +4,18 @@
enum {
TRACEFS_EVENT_INODE = BIT(1),
- TRACEFS_EVENT_TOP_INODE = BIT(2),
- TRACEFS_GID_PERM_SET = BIT(3),
- TRACEFS_UID_PERM_SET = BIT(4),
- TRACEFS_INSTANCE_INODE = BIT(5),
+ TRACEFS_GID_PERM_SET = BIT(2),
+ TRACEFS_UID_PERM_SET = BIT(3),
+ TRACEFS_INSTANCE_INODE = BIT(4),
};
struct tracefs_inode {
- struct inode vfs_inode;
+ union {
+ struct inode vfs_inode;
+ struct rcu_head rcu;
+ };
/* The below gets initialized with memset_after(ti, 0, vfs_inode) */
+ struct list_head list;
unsigned long flags;
void *private;
};
@@ -73,6 +76,7 @@ struct dentry *tracefs_end_creating(struct dentry *dentry);
struct dentry *tracefs_failed_creating(struct dentry *dentry);
struct inode *tracefs_get_inode(struct super_block *sb);
+void eventfs_remount(struct tracefs_inode *ti, bool update_uid, bool update_gid);
void eventfs_d_release(struct dentry *dentry);
#endif /* _TRACEFS_INTERNAL_H */
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 6d963402c835..2a564f813314 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -896,6 +896,10 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
prev = vma;
continue;
}
+ /* Reset ptes for the whole vma range if wr-protected */
+ if (userfaultfd_wp(vma))
+ uffd_wp_range(vma, vma->vm_start,
+ vma->vm_end - vma->vm_start, false);
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
vma->vm_end, new_flags,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 632653e00906..2ce302b4885f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1230,8 +1230,7 @@ xfs_file_open(
{
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
- file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
- FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
+ file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
return generic_file_open(inode, file);
}
@@ -1244,7 +1243,9 @@ xfs_dir_open(
unsigned int mode;
int error;
- error = xfs_file_open(inode, file);
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
+ error = generic_file_open(inode, file);
if (error)
return error;
@@ -1490,7 +1491,6 @@ const struct file_operations xfs_file_operations = {
.compat_ioctl = xfs_file_compat_ioctl,
#endif
.mmap = xfs_file_mmap,
- .mmap_supported_flags = MAP_SYNC,
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
@@ -1498,6 +1498,8 @@ const struct file_operations xfs_file_operations = {
.fallocate = xfs_file_fallocate,
.fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
+ .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
+ FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE,
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index c6a124e8d565..964fa7f24003 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1048,7 +1048,7 @@ static int zonefs_init_zgroup(struct super_block *sb,
zonefs_info(sb, "Zone group \"%s\" has %u file%s\n",
zonefs_zgroup_name(ztype),
zgroup->g_nr_zones,
- zgroup->g_nr_zones > 1 ? "s" : "");
+ str_plural(zgroup->g_nr_zones));
return 0;
}