summary | refs | log | tree | commit | diff
path: root/fs
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2024-05-13 22:14:03 +0300
committer Linus Torvalds <torvalds@linux-foundation.org> 2024-05-13 22:14:03 +0300
commit ef31ea6c2774c015946d2ffa26795766f7caaa42 (patch)
tree 5dfd244bda421d16625d1a4ded02c41dcae71b19 /fs
parent 103fb219cf57fc3641d92af2f4f438080cea3efc (diff)
parent e2bc9f6cfbd62c72a93a70068daab8886bec32ce (diff)
download linux-ef31ea6c2774c015946d2ffa26795766f7caaa42.tar.xz
Merge tag 'vfs-6.10.netfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull netfs updates from Christian Brauner:
"This reworks the netfslib writeback implementation so that pages read from the cache are written to the cache through ->writepages(), thereby allowing the fscache page flag to be retired.
The reworking also:
- builds on top of the new writeback_iter() infrastructure
- makes it possible to use vectored write RPCs as discontiguous streams of pages can be accommodated
- makes it easier to do simultaneous content crypto and stream division
- provides support for retrying writes and re-dividing a stream
- replaces the ->launder_folio() op, so that ->writepages() is used instead
- uses mempools to allocate the netfs_io_request and netfs_io_subrequest structs to avoid allocation failure in the writeback path
Some code that uses the fscache page flag is retained for compatibility purposes with nfs and ceph. The code is switched to using the synonymous private_2 label instead and marked with deprecation comments.
The merge commit contains additional details on the new algorithm that I've left out of here as it would probably be excessively detailed.
On top of the netfslib infrastructure this contains the work to convert cifs over to netfslib"
* tag 'vfs-6.10.netfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (38 commits)
cifs: Enable large folio support
cifs: Remove some code that's no longer used, part 3
cifs: Remove some code that's no longer used, part 2
cifs: Remove some code that's no longer used, part 1
cifs: Cut over to using netfslib
cifs: Implement netfslib hooks
cifs: Make add_credits_and_wake_if() clear deducted credits
cifs: Add mempools for cifs_io_request and cifs_io_subrequest structs
cifs: Set zero_point in the copy_file_range() and remap_file_range()
cifs: Move cifs_loose_read_iter() and cifs_file_write_iter() to file.c
cifs: Replace the writedata replay bool with a netfs sreq flag
cifs: Make wait_mtu_credits take size_t args
cifs: Use more fields from netfs_io_subrequest
cifs: Replace cifs_writedata with a wrapper around netfs_io_subrequest
cifs: Replace cifs_readdata with a wrapper around netfs_io_subrequest
cifs: Use alternative invalidation to using launder_folio
netfs, afs: Use writeback retry to deal with alternate keys
netfs: Miscellaneous tidy ups
netfs: Remove the old writeback code
netfs: Cut over to using new writeback code
...
Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/vfs_addr.c 60
-rw-r--r-- fs/afs/file.c 8
-rw-r--r-- fs/afs/internal.h 6
-rw-r--r-- fs/afs/validation.c 4
-rw-r--r-- fs/afs/write.c 189
-rw-r--r-- fs/cachefiles/io.c 76
-rw-r--r-- fs/ceph/addr.c 24
-rw-r--r-- fs/ceph/inode.c 2
-rw-r--r-- fs/netfs/Makefile 3
-rw-r--r-- fs/netfs/buffered_read.c 40
-rw-r--r-- fs/netfs/buffered_write.c 829
-rw-r--r-- fs/netfs/direct_write.c 56
-rw-r--r-- fs/netfs/fscache_io.c 14
-rw-r--r-- fs/netfs/internal.h 55
-rw-r--r-- fs/netfs/io.c 162
-rw-r--r-- fs/netfs/main.c 55
-rw-r--r-- fs/netfs/misc.c 10
-rw-r--r-- fs/netfs/objects.c 81
-rw-r--r-- fs/netfs/output.c 478
-rw-r--r-- fs/netfs/stats.c 17
-rw-r--r-- fs/netfs/write_collect.c 808
-rw-r--r-- fs/netfs/write_issue.c 684
-rw-r--r-- fs/nfs/file.c 8
-rw-r--r-- fs/nfs/fscache.h 6
-rw-r--r-- fs/nfs/write.c 4
-rw-r--r-- fs/smb/client/Kconfig 1
-rw-r--r-- fs/smb/client/cifsfs.c 124
-rw-r--r-- fs/smb/client/cifsfs.h 11
-rw-r--r-- fs/smb/client/cifsglob.h 65
-rw-r--r-- fs/smb/client/cifsproto.h 12
-rw-r--r-- fs/smb/client/cifssmb.c 120
-rw-r--r-- fs/smb/client/file.c 2720
-rw-r--r-- fs/smb/client/fscache.c 109
-rw-r--r-- fs/smb/client/fscache.h 54
-rw-r--r-- fs/smb/client/inode.c 45
-rw-r--r-- fs/smb/client/smb2ops.c 10
-rw-r--r-- fs/smb/client/smb2pdu.c 186
-rw-r--r-- fs/smb/client/smb2proto.h 5
-rw-r--r-- fs/smb/client/trace.h 144
-rw-r--r-- fs/smb/client/transport.c 17
40 files changed, 2835 insertions, 4467 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 047855033d32..a97ceb105cd8 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -26,36 +26,38 @@
#include "cache.h"
#include "fid.h"
-static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq)
+/*
+ * Writeback calls this when it finds a folio that needs uploading. This isn't
+ * called if writeback only has copy-to-cache to deal with.
+ */
+static void v9fs_begin_writeback(struct netfs_io_request *wreq)
{
- struct p9_fid *fid = subreq->rreq->netfs_priv;
- int err, len;
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
- len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
- netfs_write_subrequest_terminated(subreq, len ?: err, false);
-}
+ struct p9_fid *fid;
-static void v9fs_upload_to_server_worker(struct work_struct *work)
-{
- struct netfs_io_subrequest *subreq =
- container_of(work, struct netfs_io_subrequest, work);
+ fid = v9fs_fid_find_inode(wreq->inode, true, INVALID_UID, true);
+ if (!fid) {
+ WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n",
+ wreq->inode->i_ino);
+ return;
+ }
- v9fs_upload_to_server(subreq);
+ wreq->wsize = fid->clnt->msize - P9_IOHDRSZ;
+ if (fid->iounit)
+ wreq->wsize = min(wreq->wsize, fid->iounit);
+ wreq->netfs_priv = fid;
+ wreq->io_streams[0].avail = true;
}
/*
- * Set up write requests for a writeback slice. We need to add a write request
- * for each write we want to make.
+ * Issue a subrequest to write to the server.
*/
-static void v9fs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len)
+static void v9fs_issue_write(struct netfs_io_subrequest *subreq)
{
- struct netfs_io_subrequest *subreq;
+ struct p9_fid *fid = subreq->rreq->netfs_priv;
+ int err, len;
- subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER,
- start, len, v9fs_upload_to_server_worker);
- if (subreq)
- netfs_queue_write_request(subreq);
+ len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
+ netfs_write_subrequest_terminated(subreq, len ?: err, false);
}
/**
@@ -87,12 +89,16 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
{
struct p9_fid *fid;
bool writing = (rreq->origin == NETFS_READ_FOR_WRITE ||
- rreq->origin == NETFS_WRITEBACK ||
rreq->origin == NETFS_WRITETHROUGH ||
- rreq->origin == NETFS_LAUNDER_WRITE ||
rreq->origin == NETFS_UNBUFFERED_WRITE ||
rreq->origin == NETFS_DIO_WRITE);
+ if (rreq->origin == NETFS_WRITEBACK)
+ return 0; /* We don't get the write handle until we find we
+ * have actually dirty data and not just
+ * copy-to-cache data.
+ */
+
if (file) {
fid = file->private_data;
if (!fid)
@@ -104,6 +110,10 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
goto no_fid;
}
+ rreq->wsize = fid->clnt->msize - P9_IOHDRSZ;
+ if (fid->iounit)
+ rreq->wsize = min(rreq->wsize, fid->iounit);
+
/* we might need to read from a fid that was opened write-only
* for read-modify-write of page cache, use the writeback fid
* for that */
@@ -132,7 +142,8 @@ const struct netfs_request_ops v9fs_req_ops = {
.init_request = v9fs_init_request,
.free_request = v9fs_free_request,
.issue_read = v9fs_issue_read,
- .create_write_requests = v9fs_create_write_requests,
+ .begin_writeback = v9fs_begin_writeback,
+ .issue_write = v9fs_issue_write,
};
const struct address_space_operations v9fs_addr_operations = {
@@ -141,7 +152,6 @@ const struct address_space_operations v9fs_addr_operations = {
.dirty_folio = netfs_dirty_folio,
.release_folio = netfs_release_folio,
.invalidate_folio = netfs_invalidate_folio,
- .launder_folio = netfs_launder_folio,
.direct_IO = noop_direct_IO,
.writepages = netfs_writepages,
};
diff --git a/fs/afs/file.c b/fs/afs/file.c
index ef2cc8f565d2..c3f0c45ae9a9 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -54,7 +54,6 @@ const struct address_space_operations afs_file_aops = {
.read_folio = netfs_read_folio,
.readahead = netfs_readahead,
.dirty_folio = netfs_dirty_folio,
- .launder_folio = netfs_launder_folio,
.release_folio = netfs_release_folio,
.invalidate_folio = netfs_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
@@ -354,7 +353,7 @@ static int afs_init_request(struct netfs_io_request *rreq, struct file *file)
if (file)
rreq->netfs_priv = key_get(afs_file_key(file));
rreq->rsize = 256 * 1024;
- rreq->wsize = 256 * 1024;
+ rreq->wsize = 256 * 1024 * 1024;
return 0;
}
@@ -369,6 +368,7 @@ static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
static void afs_free_request(struct netfs_io_request *rreq)
{
key_put(rreq->netfs_priv);
+ afs_put_wb_key(rreq->netfs_priv2);
}
static void afs_update_i_size(struct inode *inode, loff_t new_i_size)
@@ -400,7 +400,9 @@ const struct netfs_request_ops afs_req_ops = {
.issue_read = afs_issue_read,
.update_i_size = afs_update_i_size,
.invalidate_cache = afs_netfs_invalidate_cache,
- .create_write_requests = afs_create_write_requests,
+ .begin_writeback = afs_begin_writeback,
+ .prepare_write = afs_prepare_write,
+ .issue_write = afs_issue_write,
};
static void afs_add_open_mmap(struct afs_vnode *vnode)
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6ce5a612937c..6e1d3c4daf72 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -916,7 +916,6 @@ struct afs_operation {
loff_t pos;
loff_t size;
loff_t i_size;
- bool laundering; /* Laundering page, PG_writeback not set */
} store;
struct {
struct iattr *attr;
@@ -1599,11 +1598,14 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
/*
* write.c
*/
+void afs_prepare_write(struct netfs_io_subrequest *subreq);
+void afs_issue_write(struct netfs_io_subrequest *subreq);
+void afs_begin_writeback(struct netfs_io_request *wreq);
+void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream);
extern int afs_writepages(struct address_space *, struct writeback_control *);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf);
extern void afs_prune_wb_keys(struct afs_vnode *);
-void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len);
/*
* xattr.c
diff --git a/fs/afs/validation.c b/fs/afs/validation.c
index 32a53fc8dfb2..bef8af12ebe2 100644
--- a/fs/afs/validation.c
+++ b/fs/afs/validation.c
@@ -365,9 +365,9 @@ static void afs_zap_data(struct afs_vnode *vnode)
* written back in a regular file and completely discard the pages in a
* directory or symlink */
if (S_ISREG(vnode->netfs.inode.i_mode))
- invalidate_remote_inode(&vnode->netfs.inode);
+ filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX);
else
- invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
+ filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX);
}
/*
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 74402d95a884..e959640694c2 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -29,43 +29,39 @@ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsign
/*
* Find a key to use for the writeback. We cached the keys used to author the
- * writes on the vnode. *_wbk will contain the last writeback key used or NULL
- * and we need to start from there if it's set.
+ * writes on the vnode. wreq->netfs_priv2 will contain the last writeback key
+ * record used or NULL and we need to start from there if it's set.
+ * wreq->netfs_priv will be set to the key itself or NULL.
*/
-static int afs_get_writeback_key(struct afs_vnode *vnode,
- struct afs_wb_key **_wbk)
+static void afs_get_writeback_key(struct netfs_io_request *wreq)
{
- struct afs_wb_key *wbk = NULL;
- struct list_head *p;
- int ret = -ENOKEY, ret2;
+ struct afs_wb_key *wbk, *old = wreq->netfs_priv2;
+ struct afs_vnode *vnode = AFS_FS_I(wreq->inode);
+
+ key_put(wreq->netfs_priv);
+ wreq->netfs_priv = NULL;
+ wreq->netfs_priv2 = NULL;
spin_lock(&vnode->wb_lock);
- if (*_wbk)
- p = (*_wbk)->vnode_link.next;
+ if (old)
+ wbk = list_next_entry(old, vnode_link);
else
- p = vnode->wb_keys.next;
+ wbk = list_first_entry(&vnode->wb_keys, struct afs_wb_key, vnode_link);
- while (p != &vnode->wb_keys) {
- wbk = list_entry(p, struct afs_wb_key, vnode_link);
+ list_for_each_entry_from(wbk, &vnode->wb_keys, vnode_link) {
_debug("wbk %u", key_serial(wbk->key));
- ret2 = key_validate(wbk->key);
- if (ret2 == 0) {
+ if (key_validate(wbk->key) == 0) {
refcount_inc(&wbk->usage);
+ wreq->netfs_priv = key_get(wbk->key);
+ wreq->netfs_priv2 = wbk;
_debug("USE WB KEY %u", key_serial(wbk->key));
break;
}
-
- wbk = NULL;
- if (ret == -ENOKEY)
- ret = ret2;
- p = p->next;
}
spin_unlock(&vnode->wb_lock);
- if (*_wbk)
- afs_put_wb_key(*_wbk);
- *_wbk = wbk;
- return 0;
+
+ afs_put_wb_key(old);
}
static void afs_store_data_success(struct afs_operation *op)
@@ -75,8 +71,7 @@ static void afs_store_data_success(struct afs_operation *op)
op->ctime = op->file[0].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[0]);
if (!afs_op_error(op)) {
- if (!op->store.laundering)
- afs_pages_written_back(vnode, op->store.pos, op->store.size);
+ afs_pages_written_back(vnode, op->store.pos, op->store.size);
afs_stat_v(vnode, n_stores);
atomic_long_add(op->store.size, &afs_v2net(vnode)->n_store_bytes);
}
@@ -89,113 +84,125 @@ static const struct afs_operation_ops afs_store_data_operation = {
};
/*
- * write to a file
+ * Prepare a subrequest to write to the server. This sets the max_len
+ * parameter.
+ */
+void afs_prepare_write(struct netfs_io_subrequest *subreq)
+{
+ //if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags))
+ // subreq->max_len = 512 * 1024;
+ //else
+ subreq->max_len = 256 * 1024 * 1024;
+}
+
+/*
+ * Issue a subrequest to write to the server.
*/
-static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos,
- bool laundering)
+static void afs_issue_write_worker(struct work_struct *work)
{
+ struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work);
+ struct netfs_io_request *wreq = subreq->rreq;
struct afs_operation *op;
- struct afs_wb_key *wbk = NULL;
- loff_t size = iov_iter_count(iter);
+ struct afs_vnode *vnode = AFS_FS_I(wreq->inode);
+ unsigned long long pos = subreq->start + subreq->transferred;
+ size_t len = subreq->len - subreq->transferred;
int ret = -ENOKEY;
- _enter("%s{%llx:%llu.%u},%llx,%llx",
+ _enter("R=%x[%x],%s{%llx:%llu.%u},%llx,%zx",
+ wreq->debug_id, subreq->debug_index,
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
vnode->fid.unique,
- size, pos);
+ pos, len);
- ret = afs_get_writeback_key(vnode, &wbk);
- if (ret) {
- _leave(" = %d [no keys]", ret);
- return ret;
- }
+#if 0 // Error injection
+ if (subreq->debug_index == 3)
+ return netfs_write_subrequest_terminated(subreq, -ENOANO, false);
- op = afs_alloc_operation(wbk->key, vnode->volume);
- if (IS_ERR(op)) {
- afs_put_wb_key(wbk);
- return -ENOMEM;
+ if (!test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) {
+ set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
}
+#endif
+
+ op = afs_alloc_operation(wreq->netfs_priv, vnode->volume);
+ if (IS_ERR(op))
+ return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
afs_op_set_vnode(op, 0, vnode);
- op->file[0].dv_delta = 1;
+ op->file[0].dv_delta = 1;
op->file[0].modification = true;
- op->store.pos = pos;
- op->store.size = size;
- op->store.laundering = laundering;
- op->flags |= AFS_OPERATION_UNINTR;
- op->ops = &afs_store_data_operation;
+ op->store.pos = pos;
+ op->store.size = len;
+ op->flags |= AFS_OPERATION_UNINTR;
+ op->ops = &afs_store_data_operation;
-try_next_key:
afs_begin_vnode_operation(op);
- op->store.write_iter = iter;
- op->store.i_size = max(pos + size, vnode->netfs.remote_i_size);
- op->mtime = inode_get_mtime(&vnode->netfs.inode);
+ op->store.write_iter = &subreq->io_iter;
+ op->store.i_size = umax(pos + len, vnode->netfs.remote_i_size);
+ op->mtime = inode_get_mtime(&vnode->netfs.inode);
afs_wait_for_operation(op);
-
- switch (afs_op_error(op)) {
+ ret = afs_put_operation(op);
+ switch (ret) {
case -EACCES:
case -EPERM:
case -ENOKEY:
case -EKEYEXPIRED:
case -EKEYREJECTED:
case -EKEYREVOKED:
- _debug("next");
-
- ret = afs_get_writeback_key(vnode, &wbk);
- if (ret == 0) {
- key_put(op->key);
- op->key = key_get(wbk->key);
- goto try_next_key;
- }
+ /* If there are more keys we can try, use the retry algorithm
+ * to rotate the keys.
+ */
+ if (wreq->netfs_priv2)
+ set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
break;
}
- afs_put_wb_key(wbk);
- _leave(" = %d", afs_op_error(op));
- return afs_put_operation(op);
+ netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, false);
}
-static void afs_upload_to_server(struct netfs_io_subrequest *subreq)
+void afs_issue_write(struct netfs_io_subrequest *subreq)
{
- struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
- ssize_t ret;
-
- _enter("%x[%x],%zx",
- subreq->rreq->debug_id, subreq->debug_index, subreq->io_iter.count);
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
- ret = afs_store_data(vnode, &subreq->io_iter, subreq->start,
- subreq->rreq->origin == NETFS_LAUNDER_WRITE);
- netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len,
- false);
+ subreq->work.func = afs_issue_write_worker;
+ if (!queue_work(system_unbound_wq, &subreq->work))
+ WARN_ON_ONCE(1);
}
-static void afs_upload_to_server_worker(struct work_struct *work)
+/*
+ * Writeback calls this when it finds a folio that needs uploading. This isn't
+ * called if writeback only has copy-to-cache to deal with.
+ */
+void afs_begin_writeback(struct netfs_io_request *wreq)
{
- struct netfs_io_subrequest *subreq =
- container_of(work, struct netfs_io_subrequest, work);
-
- afs_upload_to_server(subreq);
+ afs_get_writeback_key(wreq);
+ wreq->io_streams[0].avail = true;
}
/*
- * Set up write requests for a writeback slice. We need to add a write request
- * for each write we want to make.
+ * Prepare to retry the writes in request. Use this to try rotating the
+ * available writeback keys.
*/
-void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len)
+void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream)
{
- struct netfs_io_subrequest *subreq;
-
- _enter("%x,%llx-%llx", wreq->debug_id, start, start + len);
+ struct netfs_io_subrequest *subreq =
+ list_first_entry(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
- subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER,
- start, len, afs_upload_to_server_worker);
- if (subreq)
- netfs_queue_write_request(subreq);
+ switch (subreq->error) {
+ case -EACCES:
+ case -EPERM:
+ case -ENOKEY:
+ case -EKEYEXPIRED:
+ case -EKEYREJECTED:
+ case -EKEYREVOKED:
+ afs_get_writeback_key(wreq);
+ if (!wreq->netfs_priv)
+ stream->failed = true;
+ break;
+ }
}
/*
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 1d685357e67f..e667dbcd20e8 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -9,6 +9,7 @@
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/uio.h>
+#include <linux/bio.h>
#include <linux/falloc.h>
#include <linux/sched/mm.h>
#include <trace/events/fscache.h>
@@ -493,7 +494,7 @@ out_no_object:
* boundary as appropriate.
*/
static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
- loff_t i_size)
+ unsigned long long i_size)
{
return cachefiles_do_prepare_read(&subreq->rreq->cache_resources,
subreq->start, &subreq->len, i_size,
@@ -622,6 +623,77 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
return ret;
}
+static void cachefiles_prepare_write_subreq(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct netfs_cache_resources *cres = &wreq->cache_resources;
+
+ _enter("W=%x[%x] %llx", wreq->debug_id, subreq->debug_index, subreq->start);
+
+ subreq->max_len = ULONG_MAX;
+ subreq->max_nr_segs = BIO_MAX_VECS;
+
+ if (!cachefiles_cres_file(cres)) {
+ if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))
+ return netfs_prepare_write_failed(subreq);
+ if (!cachefiles_cres_file(cres))
+ return netfs_prepare_write_failed(subreq);
+ }
+}
+
+static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct netfs_cache_resources *cres = &wreq->cache_resources;
+ struct cachefiles_object *object = cachefiles_cres_object(cres);
+ struct cachefiles_cache *cache = object->volume->cache;
+ const struct cred *saved_cred;
+ size_t off, pre, post, len = subreq->len;
+ loff_t start = subreq->start;
+ int ret;
+
+ _enter("W=%x[%x] %llx-%llx",
+ wreq->debug_id, subreq->debug_index, start, start + len - 1);
+
+ /* We need to start on the cache granularity boundary */
+ off = start & (CACHEFILES_DIO_BLOCK_SIZE - 1);
+ if (off) {
+ pre = CACHEFILES_DIO_BLOCK_SIZE - off;
+ if (pre >= len) {
+ netfs_write_subrequest_terminated(subreq, len, false);
+ return;
+ }
+ subreq->transferred += pre;
+ start += pre;
+ len -= pre;
+ iov_iter_advance(&subreq->io_iter, pre);
+ }
+
+ /* We also need to end on the cache granularity boundary */
+ post = len & (CACHEFILES_DIO_BLOCK_SIZE - 1);
+ if (post) {
+ len -= post;
+ if (len == 0) {
+ netfs_write_subrequest_terminated(subreq, post, false);
+ return;
+ }
+ iov_iter_truncate(&subreq->io_iter, len);
+ }
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres),
+ &start, &len, len, true);
+ cachefiles_end_secure(cache, saved_cred);
+ if (ret < 0) {
+ netfs_write_subrequest_terminated(subreq, ret, false);
+ return;
+ }
+
+ cachefiles_write(&subreq->rreq->cache_resources,
+ subreq->start, &subreq->io_iter,
+ netfs_write_subrequest_terminated, subreq);
+}
+
/*
* Clean up an operation.
*/
@@ -638,8 +710,10 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {
.end_operation = cachefiles_end_operation,
.read = cachefiles_read,
.write = cachefiles_write,
+ .issue_write = cachefiles_issue_write,
.prepare_read = cachefiles_prepare_read,
.prepare_write = cachefiles_prepare_write,
+ .prepare_write_subreq = cachefiles_prepare_write_subreq,
.prepare_ondemand_read = cachefiles_prepare_ondemand_read,
.query_occupancy = cachefiles_query_occupancy,
};
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index ee9caf7916fb..8c16bc5250ef 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -193,7 +193,7 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
* block, but do not exceed the file size, unless the original
* request already exceeds it.
*/
- new_end = min(round_up(end, lo->stripe_unit), rreq->i_size);
+ new_end = umin(round_up(end, lo->stripe_unit), rreq->i_size);
if (new_end > end && new_end <= rreq->start + max_len)
rreq->len = new_end - rreq->start;
@@ -498,11 +498,6 @@ const struct netfs_request_ops ceph_netfs_ops = {
};
#ifdef CONFIG_CEPH_FSCACHE
-static void ceph_set_page_fscache(struct page *page)
-{
- set_page_fscache(page);
-}
-
static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async)
{
struct inode *inode = priv;
@@ -517,13 +512,9 @@ static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, b
struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode),
- ceph_fscache_write_terminated, inode, caching);
+ ceph_fscache_write_terminated, inode, true, caching);
}
#else
-static inline void ceph_set_page_fscache(struct page *page)
-{
-}
-
static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
{
}
@@ -715,8 +706,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
len = wlen;
set_page_writeback(page);
- if (caching)
- ceph_set_page_fscache(page);
ceph_fscache_write_to_cache(inode, page_off, len, caching);
if (IS_ENCRYPTED(inode)) {
@@ -800,8 +789,6 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
return AOP_WRITEPAGE_ACTIVATE;
}
- wait_on_page_fscache(page);
-
err = writepage_nounlock(page, wbc);
if (err == -ERESTARTSYS) {
/* direct memory reclaimer was killed by SIGKILL. return 0
@@ -1075,7 +1062,7 @@ get_more_pages:
unlock_page(page);
break;
}
- if (PageWriteback(page) || PageFsCache(page)) {
+ if (PageWriteback(page)) {
if (wbc->sync_mode == WB_SYNC_NONE) {
doutc(cl, "%p under writeback\n", page);
unlock_page(page);
@@ -1083,7 +1070,6 @@ get_more_pages:
}
doutc(cl, "waiting on writeback %p\n", page);
wait_on_page_writeback(page);
- wait_on_page_fscache(page);
}
if (!clear_page_dirty_for_io(page)) {
@@ -1268,8 +1254,6 @@ new_request:
}
set_page_writeback(page);
- if (caching)
- ceph_set_page_fscache(page);
len += thp_size(page);
}
ceph_fscache_write_to_cache(inode, offset, len, caching);
@@ -1513,7 +1497,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
if (r < 0)
return r;
- folio_wait_fscache(folio);
+ folio_wait_private_2(folio); /* [DEPRECATED] */
WARN_ON_ONCE(!folio_test_locked(folio));
*pagep = &folio->page;
return 0;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 7b2e77517f23..99561fddcb38 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -577,6 +577,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
/* Set parameters for the netfs library */
netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false);
+ /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
+ __set_bit(NETFS_ICTX_USE_PGPRIV2, &ci->netfs.flags);
spin_lock_init(&ci->i_ceph_lock);
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index d4d1d799819e..8e6781e0b10b 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -11,7 +11,8 @@ netfs-y := \
main.o \
misc.o \
objects.o \
- output.o
+ write_collect.o \
+ write_issue.o
netfs-$(CONFIG_NETFS_STATS) += stats.o
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 3298c29b5548..a6bb03bea920 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -10,8 +10,11 @@
#include "internal.h"
/*
- * Unlock the folios in a read operation. We need to set PG_fscache on any
+ * Unlock the folios in a read operation. We need to set PG_writeback on any
* folios we're going to write back before we unlock them.
+ *
+ * Note that if the deprecated NETFS_RREQ_USE_PGPRIV2 is set then we use
+ * PG_private_2 and do a direct write to the cache from here instead.
*/
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
@@ -48,14 +51,14 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
xas_for_each(&xas, folio, last_page) {
loff_t pg_end;
bool pg_failed = false;
- bool folio_started;
+ bool wback_to_cache = false;
+ bool folio_started = false;
if (xas_retry(&xas, folio))
continue;
pg_end = folio_pos(folio) + folio_size(folio) - 1;
- folio_started = false;
for (;;) {
loff_t sreq_end;
@@ -63,10 +66,16 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
pg_failed = true;
break;
}
- if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
- trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
- folio_start_fscache(folio);
- folio_started = true;
+ if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
+ if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE,
+ &subreq->flags)) {
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
+ folio_start_private_2(folio);
+ folio_started = true;
+ }
+ } else {
+ wback_to_cache |=
+ test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
}
pg_failed |= subreq_failed;
sreq_end = subreq->start + subreq->len - 1;
@@ -98,6 +107,11 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
kfree(finfo);
}
folio_mark_uptodate(folio);
+ if (wback_to_cache && !WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
+ folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
+ filemap_dirty_folio(folio->mapping, folio);
+ }
}
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
@@ -116,7 +130,9 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
}
static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
- loff_t *_start, size_t *_len, loff_t i_size)
+ unsigned long long *_start,
+ unsigned long long *_len,
+ unsigned long long i_size)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
@@ -266,7 +282,7 @@ int netfs_read_folio(struct file *file, struct folio *folio)
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
goto discard;
- netfs_stat(&netfs_n_rh_readpage);
+ netfs_stat(&netfs_n_rh_read_folio);
trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
/* Set up the output buffer */
@@ -450,7 +466,7 @@ retry:
if (!netfs_is_cache_enabled(ctx) &&
netfs_skip_folio_read(folio, pos, len, false)) {
netfs_stat(&netfs_n_rh_write_zskip);
- goto have_folio_no_wait;
+ goto have_folio;
}
rreq = netfs_alloc_request(mapping, file,
@@ -491,10 +507,6 @@ retry:
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio:
- ret = folio_wait_fscache_killable(folio);
- if (ret < 0)
- goto error;
-have_folio_no_wait:
*_folio = folio;
_leave(" = 0");
return 0;
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 267b622d923b..1121601536d1 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Network filesystem high-level write support.
+/* Network filesystem high-level buffered write support.
*
* Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -26,25 +26,15 @@ enum netfs_how_to_modify {
NETFS_FLUSH_CONTENT, /* Flush incompatible content. */
};
-static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);
-
static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
- if (netfs_group && !folio_get_private(folio))
- folio_attach_private(folio, netfs_get_group(netfs_group));
-}
+ void *priv = folio_get_private(folio);
-#if IS_ENABLED(CONFIG_FSCACHE)
-static void netfs_folio_start_fscache(bool caching, struct folio *folio)
-{
- if (caching)
- folio_start_fscache(folio);
-}
-#else
-static void netfs_folio_start_fscache(bool caching, struct folio *folio)
-{
+ if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
+ folio_attach_private(folio, netfs_get_group(netfs_group));
+ else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
+ folio_detach_private(folio);
}
-#endif
/*
* Decide how we should modify a folio. We might be attempting to do
@@ -63,11 +53,12 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
bool maybe_trouble)
{
struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct netfs_group *group = netfs_folio_group(folio);
loff_t pos = folio_file_pos(folio);
_enter("");
- if (netfs_folio_group(folio) != netfs_group)
+ if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE)
return NETFS_FLUSH_CONTENT;
if (folio_test_uptodate(folio))
@@ -81,16 +72,12 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
if (file->f_mode & FMODE_READ)
goto no_write_streaming;
- if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
- goto no_write_streaming;
if (netfs_is_cache_enabled(ctx)) {
/* We don't want to get a streaming write on a file that loses
* caching service temporarily because the backing store got
* culled.
*/
- if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
- set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags);
goto no_write_streaming;
}
@@ -130,6 +117,37 @@ static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
mapping_gfp_mask(mapping));
}
+/*
+ * Update i_size and estimate the update to i_blocks to reflect the additional
+ * data written into the pagecache until we can find out from the server what
+ * the values actually are.
+ */
+static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
+ loff_t i_size, loff_t pos, size_t copied)
+{
+ blkcnt_t add;
+ size_t gap;
+
+ if (ctx->ops->update_i_size) {
+ ctx->ops->update_i_size(inode, pos);
+ return;
+ }
+
+ i_size_write(inode, pos);
+#if IS_ENABLED(CONFIG_FSCACHE)
+ fscache_update_cookie(ctx->cache, NULL, &pos);
+#endif
+
+ gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
+ if (copied > gap) {
+ add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);
+
+ inode->i_blocks = min_t(blkcnt_t,
+ DIV_ROUND_UP(pos, SECTOR_SIZE),
+ inode->i_blocks + add);
+ }
+}
+
/**
* netfs_perform_write - Copy data into the pagecache.
* @iocb: The operation parameters
@@ -160,7 +178,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
};
struct netfs_io_request *wreq = NULL;
struct netfs_folio *finfo;
- struct folio *folio;
+ struct folio *folio, *writethrough = NULL;
enum netfs_how_to_modify howto;
enum netfs_folio_trace trace;
unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
@@ -189,7 +207,9 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
}
if (!is_sync_kiocb(iocb))
wreq->iocb = iocb;
- wreq->cleanup = netfs_cleanup_buffered_write;
+ netfs_stat(&netfs_n_wh_writethrough);
+ } else {
+ netfs_stat(&netfs_n_wh_buffered_write);
}
do {
@@ -230,6 +250,16 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
offset = pos & (flen - 1);
part = min_t(size_t, flen - offset, part);
+ /* Wait for writeback to complete. The writeback engine owns
+ * the info in folio->private and may change it until it
+ * removes the WB mark.
+ */
+ if (folio_get_private(folio) &&
+ folio_wait_writeback_killable(folio)) {
+ ret = written ? -EINTR : -ERESTARTSYS;
+ goto error_folio_unlock;
+ }
+
if (signal_pending(current)) {
ret = written ? -EINTR : -ERESTARTSYS;
goto error_folio_unlock;
@@ -304,6 +334,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
maybe_trouble = true;
iov_iter_revert(iter, copied);
copied = 0;
+ folio_unlock(folio);
goto retry;
}
netfs_set_group(folio, netfs_group);
@@ -351,41 +382,22 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
trace_netfs_folio(folio, trace);
/* Update the inode size if we moved the EOF marker */
- i_size = i_size_read(inode);
pos += copied;
- if (pos > i_size) {
- if (ctx->ops->update_i_size) {
- ctx->ops->update_i_size(inode, pos);
- } else {
- i_size_write(inode, pos);
-#if IS_ENABLED(CONFIG_FSCACHE)
- fscache_update_cookie(ctx->cache, NULL, &pos);
-#endif
- }
- }
+ i_size = i_size_read(inode);
+ if (pos > i_size)
+ netfs_update_i_size(ctx, inode, i_size, pos, copied);
written += copied;
if (likely(!wreq)) {
folio_mark_dirty(folio);
+ folio_unlock(folio);
} else {
- if (folio_test_dirty(folio))
- /* Sigh. mmap. */
- folio_clear_dirty_for_io(folio);
- /* We make multiple writes to the folio... */
- if (!folio_test_writeback(folio)) {
- folio_wait_fscache(folio);
- folio_start_writeback(folio);
- folio_start_fscache(folio);
- if (wreq->iter.count == 0)
- trace_netfs_folio(folio, netfs_folio_trace_wthru);
- else
- trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
- }
- netfs_advance_writethrough(wreq, copied,
- offset + copied == flen);
+ netfs_advance_writethrough(wreq, &wbc, folio, copied,
+ offset + copied == flen,
+ &writethrough);
+ /* Folio unlocked */
}
retry:
- folio_unlock(folio);
folio_put(folio);
folio = NULL;
@@ -393,8 +405,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
} while (iov_iter_count(iter));
out:
+ if (likely(written) && ctx->ops->post_modify)
+ ctx->ops->post_modify(inode);
+
if (unlikely(wreq)) {
- ret2 = netfs_end_writethrough(wreq, iocb);
+ ret2 = netfs_end_writethrough(wreq, &wbc, writethrough);
wbc_detach_inode(&wbc);
if (ret2 == -EIOCBQUEUED)
return ret2;
@@ -505,9 +520,11 @@ EXPORT_SYMBOL(netfs_file_write_iter);
*/
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
+ struct netfs_group *group;
struct folio *folio = page_folio(vmf->page);
struct file *file = vmf->vma->vm_file;
struct inode *inode = file_inode(file);
+ struct netfs_inode *ictx = netfs_inode(inode);
vm_fault_t ret = VM_FAULT_RETRY;
int err;
@@ -515,11 +532,13 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
sb_start_pagefault(inode->i_sb);
- if (folio_wait_writeback_killable(folio))
+ if (folio_lock_killable(folio) < 0)
goto out;
- if (folio_lock_killable(folio) < 0)
+ if (folio_wait_writeback_killable(folio)) {
+ ret = VM_FAULT_LOCKED;
goto out;
+ }
/* Can we see a streaming write here? */
if (WARN_ON(!folio_test_uptodate(folio))) {
@@ -527,7 +546,8 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
goto out;
}
- if (netfs_folio_group(folio) != netfs_group) {
+ group = netfs_folio_group(folio);
+ if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) {
folio_unlock(folio);
err = filemap_fdatawait_range(inode->i_mapping,
folio_pos(folio),
@@ -551,708 +571,11 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
netfs_set_group(folio, netfs_group);
file_update_time(file);
+ if (ictx->ops->post_modify)
+ ictx->ops->post_modify(inode);
ret = VM_FAULT_LOCKED;
out:
sb_end_pagefault(inode->i_sb);
return ret;
}
EXPORT_SYMBOL(netfs_page_mkwrite);
-
-/*
- * Kill all the pages in the given range
- */
-static void netfs_kill_pages(struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("%llx-%llx", start, start + len - 1);
-
- do {
- _debug("kill %lx (to %lx)", index, last);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = folio_next_index(folio);
-
- trace_netfs_folio(folio, netfs_folio_trace_kill);
- folio_clear_uptodate(folio);
- if (folio_test_fscache(folio))
- folio_end_fscache(folio);
- folio_end_writeback(folio);
- folio_lock(folio);
- generic_error_remove_folio(mapping, folio);
- folio_unlock(folio);
- folio_put(folio);
-
- } while (index = next, index <= last);
-
- _leave("");
-}
-
-/*
- * Redirty all the pages in a given range.
- */
-static void netfs_redirty_pages(struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("%llx-%llx", start, start + len - 1);
-
- do {
- _debug("redirty %llx @%llx", len, start);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = folio_next_index(folio);
- trace_netfs_folio(folio, netfs_folio_trace_redirty);
- filemap_dirty_folio(mapping, folio);
- if (folio_test_fscache(folio))
- folio_end_fscache(folio);
- folio_end_writeback(folio);
- folio_put(folio);
- } while (index = next, index <= last);
-
- balance_dirty_pages_ratelimited(mapping);
-
- _leave("");
-}
-
-/*
- * Completion of write to server
- */
-static void netfs_pages_written_back(struct netfs_io_request *wreq)
-{
- struct address_space *mapping = wreq->mapping;
- struct netfs_folio *finfo;
- struct netfs_group *group = NULL;
- struct folio *folio;
- pgoff_t last;
- int gcount = 0;
-
- XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);
-
- _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
-
- rcu_read_lock();
-
- last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
- xas_for_each(&xas, folio, last) {
- WARN(!folio_test_writeback(folio),
- "bad %zx @%llx page %lx %lx\n",
- wreq->len, wreq->start, folio->index, last);
-
- if ((finfo = netfs_folio_info(folio))) {
- /* Streaming writes cannot be redirtied whilst under
- * writeback, so discard the streaming record.
- */
- folio_detach_private(folio);
- group = finfo->netfs_group;
- gcount++;
- trace_netfs_folio(folio, netfs_folio_trace_clear_s);
- kfree(finfo);
- } else if ((group = netfs_folio_group(folio))) {
- /* Need to detach the group pointer if the page didn't
- * get redirtied. If it has been redirtied, then it
- * must be within the same group.
- */
- if (folio_test_dirty(folio)) {
- trace_netfs_folio(folio, netfs_folio_trace_redirtied);
- goto end_wb;
- }
- if (folio_trylock(folio)) {
- if (!folio_test_dirty(folio)) {
- folio_detach_private(folio);
- gcount++;
- trace_netfs_folio(folio, netfs_folio_trace_clear_g);
- } else {
- trace_netfs_folio(folio, netfs_folio_trace_redirtied);
- }
- folio_unlock(folio);
- goto end_wb;
- }
-
- xas_pause(&xas);
- rcu_read_unlock();
- folio_lock(folio);
- if (!folio_test_dirty(folio)) {
- folio_detach_private(folio);
- gcount++;
- trace_netfs_folio(folio, netfs_folio_trace_clear_g);
- } else {
- trace_netfs_folio(folio, netfs_folio_trace_redirtied);
- }
- folio_unlock(folio);
- rcu_read_lock();
- } else {
- trace_netfs_folio(folio, netfs_folio_trace_clear);
- }
- end_wb:
- if (folio_test_fscache(folio))
- folio_end_fscache(folio);
- xas_advance(&xas, folio_next_index(folio) - 1);
- folio_end_writeback(folio);
- }
-
- rcu_read_unlock();
- netfs_put_group_many(group, gcount);
- _leave("");
-}
-
-/*
- * Deal with the disposition of the folios that are under writeback to close
- * out the operation.
- */
-static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
-{
- struct address_space *mapping = wreq->mapping;
-
- _enter("");
-
- switch (wreq->error) {
- case 0:
- netfs_pages_written_back(wreq);
- break;
-
- default:
- pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
- fallthrough;
- case -EACCES:
- case -EPERM:
- case -ENOKEY:
- case -EKEYEXPIRED:
- case -EKEYREJECTED:
- case -EKEYREVOKED:
- case -ENETRESET:
- case -EDQUOT:
- case -ENOSPC:
- netfs_redirty_pages(mapping, wreq->start, wreq->len);
- break;
-
- case -EROFS:
- case -EIO:
- case -EREMOTEIO:
- case -EFBIG:
- case -ENOENT:
- case -ENOMEDIUM:
- case -ENXIO:
- netfs_kill_pages(mapping, wreq->start, wreq->len);
- break;
- }
-
- if (wreq->error)
- mapping_set_error(mapping, wreq->error);
- if (wreq->netfs_ops->done)
- wreq->netfs_ops->done(wreq);
-}
-
-/*
- * Extend the region to be written back to include subsequent contiguously
- * dirty pages if possible, but don't sleep while doing so.
- *
- * If this page holds new content, then we can include filler zeros in the
- * writeback.
- */
-static void netfs_extend_writeback(struct address_space *mapping,
- struct netfs_group *group,
- struct xa_state *xas,
- long *_count,
- loff_t start,
- loff_t max_len,
- bool caching,
- size_t *_len,
- size_t *_top)
-{
- struct netfs_folio *finfo;
- struct folio_batch fbatch;
- struct folio *folio;
- unsigned int i;
- pgoff_t index = (start + *_len) / PAGE_SIZE;
- size_t len;
- void *priv;
- bool stop = true;
-
- folio_batch_init(&fbatch);
-
- do {
- /* Firstly, we gather up a batch of contiguous dirty pages
- * under the RCU read lock - but we can't clear the dirty flags
- * there if any of those pages are mapped.
- */
- rcu_read_lock();
-
- xas_for_each(xas, folio, ULONG_MAX) {
- stop = true;
- if (xas_retry(xas, folio))
- continue;
- if (xa_is_value(folio))
- break;
- if (folio->index != index) {
- xas_reset(xas);
- break;
- }
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(xas);
- continue;
- }
-
- /* Has the folio moved or been split? */
- if (unlikely(folio != xas_reload(xas))) {
- folio_put(folio);
- xas_reset(xas);
- break;
- }
-
- if (!folio_trylock(folio)) {
- folio_put(folio);
- xas_reset(xas);
- break;
- }
- if (!folio_test_dirty(folio) ||
- folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- xas_reset(xas);
- break;
- }
-
- stop = false;
- len = folio_size(folio);
- priv = folio_get_private(folio);
- if ((const struct netfs_group *)priv != group) {
- stop = true;
- finfo = netfs_folio_info(folio);
- if (finfo->netfs_group != group ||
- finfo->dirty_offset > 0) {
- folio_unlock(folio);
- folio_put(folio);
- xas_reset(xas);
- break;
- }
- len = finfo->dirty_len;
- }
-
- *_top += folio_size(folio);
- index += folio_nr_pages(folio);
- *_count -= folio_nr_pages(folio);
- *_len += len;
- if (*_len >= max_len || *_count <= 0)
- stop = true;
-
- if (!folio_batch_add(&fbatch, folio))
- break;
- if (stop)
- break;
- }
-
- xas_pause(xas);
- rcu_read_unlock();
-
- /* Now, if we obtained any folios, we can shift them to being
- * writable and mark them for caching.
- */
- if (!folio_batch_count(&fbatch))
- break;
-
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- folio = fbatch.folios[i];
- trace_netfs_folio(folio, netfs_folio_trace_store_plus);
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- folio_start_writeback(folio);
- netfs_folio_start_fscache(caching, folio);
- folio_unlock(folio);
- }
-
- folio_batch_release(&fbatch);
- cond_resched();
- } while (!stop);
-}
-
-/*
- * Synchronously write back the locked page and any subsequent non-locked dirty
- * pages.
- */
-static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
- struct writeback_control *wbc,
- struct netfs_group *group,
- struct xa_state *xas,
- struct folio *folio,
- unsigned long long start,
- unsigned long long end)
-{
- struct netfs_io_request *wreq;
- struct netfs_folio *finfo;
- struct netfs_inode *ctx = netfs_inode(mapping->host);
- unsigned long long i_size = i_size_read(&ctx->inode);
- size_t len, max_len;
- bool caching = netfs_is_cache_enabled(ctx);
- long count = wbc->nr_to_write;
- int ret;
-
- _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching);
-
- wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
- NETFS_WRITEBACK);
- if (IS_ERR(wreq)) {
- folio_unlock(folio);
- return PTR_ERR(wreq);
- }
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- folio_start_writeback(folio);
- netfs_folio_start_fscache(caching, folio);
-
- count -= folio_nr_pages(folio);
-
- /* Find all consecutive lockable dirty pages that have contiguous
- * written regions, stopping when we find a page that is not
- * immediately lockable, is not dirty or is missing, or we reach the
- * end of the range.
- */
- trace_netfs_folio(folio, netfs_folio_trace_store);
-
- len = wreq->len;
- finfo = netfs_folio_info(folio);
- if (finfo) {
- start += finfo->dirty_offset;
- if (finfo->dirty_offset + finfo->dirty_len != len) {
- len = finfo->dirty_len;
- goto cant_expand;
- }
- len = finfo->dirty_len;
- }
-
- if (start < i_size) {
- /* Trim the write to the EOF; the extra data is ignored. Also
- * put an upper limit on the size of a single storedata op.
- */
- max_len = 65536 * 4096;
- max_len = min_t(unsigned long long, max_len, end - start + 1);
- max_len = min_t(unsigned long long, max_len, i_size - start);
-
- if (len < max_len)
- netfs_extend_writeback(mapping, group, xas, &count, start,
- max_len, caching, &len, &wreq->upper_len);
- }
-
-cant_expand:
- len = min_t(unsigned long long, len, i_size - start);
-
- /* We now have a contiguous set of dirty pages, each with writeback
- * set; the first page is still locked at this point, but all the rest
- * have been unlocked.
- */
- folio_unlock(folio);
- wreq->start = start;
- wreq->len = len;
-
- if (start < i_size) {
- _debug("write back %zx @%llx [%llx]", len, start, i_size);
-
- /* Speculatively write to the cache. We have to fix this up
- * later if the store fails.
- */
- wreq->cleanup = netfs_cleanup_buffered_write;
-
- iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
- wreq->upper_len);
- __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
- ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
- if (ret == 0 || ret == -EIOCBQUEUED)
- wbc->nr_to_write -= len / PAGE_SIZE;
- } else {
- _debug("write discard %zx @%llx [%llx]", len, start, i_size);
-
- /* The dirty region was entirely beyond the EOF. */
- fscache_clear_page_bits(mapping, start, len, caching);
- netfs_pages_written_back(wreq);
- ret = 0;
- }
-
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
- _leave(" = 1");
- return 1;
-}
-
-/*
- * Write a region of pages back to the server
- */
-static ssize_t netfs_writepages_begin(struct address_space *mapping,
- struct writeback_control *wbc,
- struct netfs_group *group,
- struct xa_state *xas,
- unsigned long long *_start,
- unsigned long long end)
-{
- const struct netfs_folio *finfo;
- struct folio *folio;
- unsigned long long start = *_start;
- ssize_t ret;
- void *priv;
- int skips = 0;
-
- _enter("%llx,%llx,", start, end);
-
-search_again:
- /* Find the first dirty page in the group. */
- rcu_read_lock();
-
- for (;;) {
- folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
- if (xas_retry(xas, folio) || xa_is_value(folio))
- continue;
- if (!folio)
- break;
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(xas);
- continue;
- }
-
- if (unlikely(folio != xas_reload(xas))) {
- folio_put(folio);
- xas_reset(xas);
- continue;
- }
-
- /* Skip any dirty folio that's not in the group of interest. */
- priv = folio_get_private(folio);
- if ((const struct netfs_group *)priv != group) {
- finfo = netfs_folio_info(folio);
- if (finfo->netfs_group != group) {
- folio_put(folio);
- continue;
- }
- }
-
- xas_pause(xas);
- break;
- }
- rcu_read_unlock();
- if (!folio)
- return 0;
-
- start = folio_pos(folio); /* May regress with THPs */
-
- _debug("wback %lx", folio->index);
-
- /* At this point we hold neither the i_pages lock nor the page lock:
- * the page may be truncated or invalidated (changing page->mapping to
- * NULL), or even swizzled back from swapper_space to tmpfs file
- * mapping
- */
-lock_again:
- if (wbc->sync_mode != WB_SYNC_NONE) {
- ret = folio_lock_killable(folio);
- if (ret < 0)
- return ret;
- } else {
- if (!folio_trylock(folio))
- goto search_again;
- }
-
- if (folio->mapping != mapping ||
- !folio_test_dirty(folio)) {
- start += folio_size(folio);
- folio_unlock(folio);
- goto search_again;
- }
-
- if (folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- if (wbc->sync_mode != WB_SYNC_NONE) {
- folio_wait_writeback(folio);
-#ifdef CONFIG_FSCACHE
- folio_wait_fscache(folio);
-#endif
- goto lock_again;
- }
-
- start += folio_size(folio);
- if (wbc->sync_mode == WB_SYNC_NONE) {
- if (skips >= 5 || need_resched()) {
- ret = 0;
- goto out;
- }
- skips++;
- }
- goto search_again;
- }
-
- ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
- folio, start, end);
-out:
- if (ret > 0)
- *_start = start + ret;
- _leave(" = %zd [%llx]", ret, *_start);
- return ret;
-}
-
-/*
- * Write a region of pages back to the server
- */
-static int netfs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- struct netfs_group *group,
- unsigned long long *_start,
- unsigned long long end)
-{
- ssize_t ret;
-
- XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
-
- do {
- ret = netfs_writepages_begin(mapping, wbc, group, &xas,
- _start, end);
- if (ret > 0 && wbc->nr_to_write > 0)
- cond_resched();
- } while (ret > 0 && wbc->nr_to_write > 0);
-
- return ret > 0 ? 0 : ret;
-}
-
-/*
- * write some of the pending data back to the server
- */
-int netfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct netfs_group *group = NULL;
- loff_t start, end;
- int ret;
-
- _enter("");
-
- /* We have to be careful as we can end up racing with setattr()
- * truncating the pagecache since the caller doesn't take a lock here
- * to prevent it.
- */
-
- if (wbc->range_cyclic && mapping->writeback_index) {
- start = mapping->writeback_index * PAGE_SIZE;
- ret = netfs_writepages_region(mapping, wbc, group,
- &start, LLONG_MAX);
- if (ret < 0)
- goto out;
-
- if (wbc->nr_to_write <= 0) {
- mapping->writeback_index = start / PAGE_SIZE;
- goto out;
- }
-
- start = 0;
- end = mapping->writeback_index * PAGE_SIZE;
- mapping->writeback_index = 0;
- ret = netfs_writepages_region(mapping, wbc, group, &start, end);
- if (ret == 0)
- mapping->writeback_index = start / PAGE_SIZE;
- } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- start = 0;
- ret = netfs_writepages_region(mapping, wbc, group,
- &start, LLONG_MAX);
- if (wbc->nr_to_write > 0 && ret == 0)
- mapping->writeback_index = start / PAGE_SIZE;
- } else {
- start = wbc->range_start;
- ret = netfs_writepages_region(mapping, wbc, group,
- &start, wbc->range_end);
- }
-
-out:
- _leave(" = %d", ret);
- return ret;
-}
-EXPORT_SYMBOL(netfs_writepages);
-
-/*
- * Deal with the disposition of a laundered folio.
- */
-static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
-{
- if (wreq->error) {
- pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
- mapping_set_error(wreq->mapping, wreq->error);
- }
-}
-
-/**
- * netfs_launder_folio - Clean up a dirty folio that's being invalidated
- * @folio: The folio to clean
- *
- * This is called to write back a folio that's being invalidated when an inode
- * is getting torn down. Ideally, writepages would be used instead.
- */
-int netfs_launder_folio(struct folio *folio)
-{
- struct netfs_io_request *wreq;
- struct address_space *mapping = folio->mapping;
- struct netfs_folio *finfo = netfs_folio_info(folio);
- struct netfs_group *group = netfs_folio_group(folio);
- struct bio_vec bvec;
- unsigned long long i_size = i_size_read(mapping->host);
- unsigned long long start = folio_pos(folio);
- size_t offset = 0, len;
- int ret = 0;
-
- if (finfo) {
- offset = finfo->dirty_offset;
- start += offset;
- len = finfo->dirty_len;
- } else {
- len = folio_size(folio);
- }
- len = min_t(unsigned long long, len, i_size - start);
-
- wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
- if (IS_ERR(wreq)) {
- ret = PTR_ERR(wreq);
- goto out;
- }
-
- if (!folio_clear_dirty_for_io(folio))
- goto out_put;
-
- trace_netfs_folio(folio, netfs_folio_trace_launder);
-
- _debug("launder %llx-%llx", start, start + len - 1);
-
- /* Speculatively write to the cache. We have to fix this up later if
- * the store fails.
- */
- wreq->cleanup = netfs_cleanup_launder_folio;
-
- bvec_set_folio(&bvec, folio, len, offset);
- iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
- __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
- ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);
-
-out_put:
- folio_detach_private(folio);
- netfs_put_group(group);
- kfree(finfo);
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
-out:
- folio_wait_fscache(folio);
- _leave(" = %d", ret);
- return ret;
-}
-EXPORT_SYMBOL(netfs_launder_folio);
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index bee047e20f5d..608ba6416919 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -34,6 +34,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
unsigned long long start = iocb->ki_pos;
unsigned long long end = start + iov_iter_count(iter);
ssize_t ret, n;
+ size_t len = iov_iter_count(iter);
bool async = !is_sync_kiocb(iocb);
_enter("");
@@ -46,13 +47,17 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
_debug("uw %llx-%llx", start, end);
- wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
- start, end - start,
- iocb->ki_flags & IOCB_DIRECT ?
- NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
+ wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start,
+ iocb->ki_flags & IOCB_DIRECT ?
+ NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
if (IS_ERR(wreq))
return PTR_ERR(wreq);
+ wreq->io_streams[0].avail = true;
+ trace_netfs_write(wreq, (iocb->ki_flags & IOCB_DIRECT ?
+ netfs_write_trace_dio_write :
+ netfs_write_trace_unbuffered_write));
+
{
/* If this is an async op and we're not using a bounce buffer,
* we have to save the source buffer as the iterator is only
@@ -63,7 +68,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
* request.
*/
if (async || user_backed_iter(iter)) {
- n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0);
+ n = netfs_extract_user_iter(iter, len, &wreq->iter, 0);
if (n < 0) {
ret = n;
goto out;
@@ -71,7 +76,6 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec;
wreq->direct_bv_count = n;
wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
- wreq->len = iov_iter_count(&wreq->iter);
} else {
wreq->iter = *iter;
}
@@ -79,6 +83,8 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
wreq->io_iter = wreq->iter;
}
+ __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);
+
/* Copy the data into the bounce buffer and encrypt it. */
// TODO
@@ -87,10 +93,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
if (async)
wreq->iocb = iocb;
wreq->cleanup = netfs_cleanup_dio_write;
- ret = netfs_begin_write(wreq, is_sync_kiocb(iocb),
- iocb->ki_flags & IOCB_DIRECT ?
- netfs_write_trace_dio_write :
- netfs_write_trace_unbuffered_write);
+ ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), iov_iter_count(&wreq->io_iter));
if (ret < 0) {
_debug("begin = %zd", ret);
goto out;
@@ -100,9 +103,8 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
TASK_UNINTERRUPTIBLE);
-
+ smp_rmb(); /* Read error/transferred after RIP flag */
ret = wreq->error;
- _debug("waited = %zd", ret);
if (ret == 0) {
ret = wreq->transferred;
iocb->ki_pos += ret;
@@ -132,18 +134,20 @@ out:
ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
struct netfs_inode *ictx = netfs_inode(inode);
- unsigned long long end;
ssize_t ret;
+ loff_t pos = iocb->ki_pos;
+ unsigned long long end = pos + iov_iter_count(from) - 1;
- _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+ _enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));
if (!iov_iter_count(from))
return 0;
trace_netfs_write_iter(iocb, from);
- netfs_stat(&netfs_n_rh_dio_write);
+ netfs_stat(&netfs_n_wh_dio_write);
ret = netfs_start_io_direct(inode);
if (ret < 0)
@@ -157,7 +161,25 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = file_update_time(file);
if (ret < 0)
goto out;
- ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* We could block if there are any pages in the range. */
+ ret = -EAGAIN;
+ if (filemap_range_has_page(mapping, pos, end))
+ if (filemap_invalidate_inode(inode, true, pos, end))
+ goto out;
+ } else {
+ ret = filemap_write_and_wait_range(mapping, pos, end);
+ if (ret < 0)
+ goto out;
+ }
+
+ /*
+ * After a write we want buffered reads to be sure to go to disk to get
+ * the new data. We invalidate clean cached page from the region we're
+ * about to write. We do this *before* the write so that we can return
+ * without clobbering -EIOCBQUEUED from ->direct_IO().
+ */
+ ret = filemap_invalidate_inode(inode, true, pos, end);
if (ret < 0)
goto out;
end = iocb->ki_pos + iov_iter_count(from);
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index 43a651ed8264..38637e5c9b57 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -166,6 +166,7 @@ struct fscache_write_request {
loff_t start;
size_t len;
bool set_bits;
+ bool using_pgpriv2;
netfs_io_terminated_t term_func;
void *term_func_priv;
};
@@ -182,7 +183,7 @@ void __fscache_clear_page_bits(struct address_space *mapping,
rcu_read_lock();
xas_for_each(&xas, page, last) {
- end_page_fscache(page);
+ folio_end_private_2(page_folio(page));
}
rcu_read_unlock();
}
@@ -197,8 +198,9 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error,
{
struct fscache_write_request *wreq = priv;
- fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len,
- wreq->set_bits);
+ if (wreq->using_pgpriv2)
+ fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len,
+ wreq->set_bits);
if (wreq->term_func)
wreq->term_func(wreq->term_func_priv, transferred_or_error,
@@ -212,7 +214,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
loff_t start, size_t len, loff_t i_size,
netfs_io_terminated_t term_func,
void *term_func_priv,
- bool cond)
+ bool using_pgpriv2, bool cond)
{
struct fscache_write_request *wreq;
struct netfs_cache_resources *cres;
@@ -230,6 +232,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
wreq->mapping = mapping;
wreq->start = start;
wreq->len = len;
+ wreq->using_pgpriv2 = using_pgpriv2;
wreq->set_bits = cond;
wreq->term_func = term_func;
wreq->term_func_priv = term_func_priv;
@@ -257,7 +260,8 @@ abandon_end:
abandon_free:
kfree(wreq);
abandon:
- fscache_clear_page_bits(mapping, start, len, cond);
+ if (using_pgpriv2)
+ fscache_clear_page_bits(mapping, start, len, cond);
if (term_func)
term_func(term_func_priv, ret, false);
}
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index ec7045d24400..95e281a8af78 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -37,6 +37,8 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
extern unsigned int netfs_debug;
extern struct list_head netfs_io_requests;
extern spinlock_t netfs_proc_lock;
+extern mempool_t netfs_request_pool;
+extern mempool_t netfs_subrequest_pool;
#ifdef CONFIG_PROC_FS
static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq)
@@ -91,22 +93,12 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
}
/*
- * output.c
- */
-int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
- enum netfs_write_trace what);
-struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
-int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end);
-int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb);
-
-/*
* stats.c
*/
#ifdef CONFIG_NETFS_STATS
extern atomic_t netfs_n_rh_dio_read;
-extern atomic_t netfs_n_rh_dio_write;
extern atomic_t netfs_n_rh_readahead;
-extern atomic_t netfs_n_rh_readpage;
+extern atomic_t netfs_n_rh_read_folio;
extern atomic_t netfs_n_rh_rreq;
extern atomic_t netfs_n_rh_sreq;
extern atomic_t netfs_n_rh_download;
@@ -123,6 +115,10 @@ extern atomic_t netfs_n_rh_write_begin;
extern atomic_t netfs_n_rh_write_done;
extern atomic_t netfs_n_rh_write_failed;
extern atomic_t netfs_n_rh_write_zskip;
+extern atomic_t netfs_n_wh_buffered_write;
+extern atomic_t netfs_n_wh_writethrough;
+extern atomic_t netfs_n_wh_dio_write;
+extern atomic_t netfs_n_wh_writepages;
extern atomic_t netfs_n_wh_wstream_conflict;
extern atomic_t netfs_n_wh_upload;
extern atomic_t netfs_n_wh_upload_done;
@@ -149,6 +145,33 @@ static inline void netfs_stat_d(atomic_t *stat)
#endif
/*
+ * write_collect.c
+ */
+int netfs_folio_written_back(struct folio *folio);
+void netfs_write_collection_worker(struct work_struct *work);
+void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async);
+
+/*
+ * write_issue.c
+ */
+struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
+ struct file *file,
+ loff_t start,
+ enum netfs_io_origin origin);
+void netfs_reissue_write(struct netfs_io_stream *stream,
+ struct netfs_io_subrequest *subreq);
+int netfs_advance_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream,
+ loff_t start, size_t len, bool to_eof);
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
+int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *folio, size_t copied, bool to_page_end,
+ struct folio **writethrough_cache);
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *writethrough_cache);
+int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len);
+
+/*
* Miscellaneous functions.
*/
static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
@@ -168,7 +191,7 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
*/
static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
{
- if (netfs_group)
+ if (netfs_group && netfs_group != NETFS_FOLIO_COPY_TO_CACHE)
refcount_inc(&netfs_group->ref);
return netfs_group;
}
@@ -178,7 +201,9 @@ static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_grou
*/
static inline void netfs_put_group(struct netfs_group *netfs_group)
{
- if (netfs_group && refcount_dec_and_test(&netfs_group->ref))
+ if (netfs_group &&
+ netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
+ refcount_dec_and_test(&netfs_group->ref))
netfs_group->free(netfs_group);
}
@@ -187,7 +212,9 @@ static inline void netfs_put_group(struct netfs_group *netfs_group)
*/
static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
{
- if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref))
+ if (netfs_group &&
+ netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
+ refcount_sub_and_test(nr, &netfs_group->ref))
netfs_group->free(netfs_group);
}
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 4261ad6c55b6..c93851b98368 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -99,145 +99,6 @@ static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async)
}
/*
- * Deal with the completion of writing the data to the cache. We have to clear
- * the PG_fscache bits on the folios involved and release the caller's ref.
- *
- * May be called in softirq mode and we inherit a ref from the caller.
- */
-static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
- bool was_async)
-{
- struct netfs_io_subrequest *subreq;
- struct folio *folio;
- pgoff_t unlocked = 0;
- bool have_unlocked = false;
-
- rcu_read_lock();
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
-
- xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
- if (xas_retry(&xas, folio))
- continue;
-
- /* We might have multiple writes from the same huge
- * folio, but we mustn't unlock a folio more than once.
- */
- if (have_unlocked && folio->index <= unlocked)
- continue;
- unlocked = folio_next_index(folio) - 1;
- trace_netfs_folio(folio, netfs_folio_trace_end_copy);
- folio_end_fscache(folio);
- have_unlocked = true;
- }
- }
-
- rcu_read_unlock();
- netfs_rreq_completed(rreq, was_async);
-}
-
-static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_io_subrequest *subreq = priv;
- struct netfs_io_request *rreq = subreq->rreq;
-
- if (IS_ERR_VALUE(transferred_or_error)) {
- netfs_stat(&netfs_n_rh_write_failed);
- trace_netfs_failure(rreq, subreq, transferred_or_error,
- netfs_fail_copy_to_cache);
- } else {
- netfs_stat(&netfs_n_rh_write_done);
- }
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_write_term);
-
- /* If we decrement nr_copy_ops to 0, the ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_copy_ops))
- netfs_rreq_unmark_after_write(rreq, was_async);
-
- netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
-}
-
-/*
- * Perform any outstanding writes to the cache. We inherit a ref from the
- * caller.
- */
-static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
-{
- struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct netfs_io_subrequest *subreq, *next, *p;
- struct iov_iter iter;
- int ret;
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_copy);
-
- /* We don't want terminating writes trying to wake us up whilst we're
- * still going through the list.
- */
- atomic_inc(&rreq->nr_copy_ops);
-
- list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) {
- if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
- list_del_init(&subreq->rreq_link);
- netfs_put_subrequest(subreq, false,
- netfs_sreq_trace_put_no_copy);
- }
- }
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- /* Amalgamate adjacent writes */
- while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
- next = list_next_entry(subreq, rreq_link);
- if (next->start != subreq->start + subreq->len)
- break;
- subreq->len += next->len;
- list_del_init(&next->rreq_link);
- netfs_put_subrequest(next, false,
- netfs_sreq_trace_put_merged);
- }
-
- ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
- subreq->len, rreq->i_size, true);
- if (ret < 0) {
- trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
- trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
- continue;
- }
-
- iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages,
- subreq->start, subreq->len);
-
- atomic_inc(&rreq->nr_copy_ops);
- netfs_stat(&netfs_n_rh_write);
- netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache);
- trace_netfs_sreq(subreq, netfs_sreq_trace_write);
- cres->ops->write(cres, subreq->start, &iter,
- netfs_rreq_copy_terminated, subreq);
- }
-
- /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_copy_ops))
- netfs_rreq_unmark_after_write(rreq, false);
-}
-
-static void netfs_rreq_write_to_cache_work(struct work_struct *work)
-{
- struct netfs_io_request *rreq =
- container_of(work, struct netfs_io_request, work);
-
- netfs_rreq_do_write_to_cache(rreq);
-}
-
-static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq)
-{
- rreq->work.func = netfs_rreq_write_to_cache_work;
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
-}
-
-/*
* Handle a short read.
*/
static void netfs_rreq_short_read(struct netfs_io_request *rreq,
@@ -352,8 +213,13 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
unsigned int i;
size_t transferred = 0;
- for (i = 0; i < rreq->direct_bv_count; i++)
+ for (i = 0; i < rreq->direct_bv_count; i++) {
flush_dcache_page(rreq->direct_bv[i].bv_page);
+ // TODO: cifs marks pages in the destination buffer
+ // dirty under some circumstances after a read. Do we
+ // need to do that too?
+ set_page_dirty(rreq->direct_bv[i].bv_page);
+ }
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
if (subreq->error || subreq->transferred == 0)
@@ -409,9 +275,6 @@ again:
clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
- if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags))
- return netfs_rreq_write_to_cache(rreq);
-
netfs_rreq_completed(rreq, was_async);
}
@@ -618,7 +481,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
set:
if (subreq->len > rreq->len)
- pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n",
+ pr_warn("R=%08x[%u] SREQ>RREQ %zx > %llx\n",
rreq->debug_id, subreq->debug_index,
subreq->len, rreq->len);
@@ -643,8 +506,7 @@ out:
* Slice off a piece of a read request and submit an I/O request for it.
*/
static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
- struct iov_iter *io_iter,
- unsigned int *_debug_index)
+ struct iov_iter *io_iter)
{
struct netfs_io_subrequest *subreq;
enum netfs_io_source source;
@@ -653,11 +515,10 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
if (!subreq)
return false;
- subreq->debug_index = (*_debug_index)++;
subreq->start = rreq->start + rreq->submitted;
subreq->len = io_iter->count;
- _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
+ _debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
/* Call out to the cache to find out what it can do with the remaining
@@ -707,7 +568,6 @@ subreq_failed:
int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
{
struct iov_iter io_iter;
- unsigned int debug_index = 0;
int ret;
_enter("R=%x %llx-%llx",
@@ -733,12 +593,12 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
atomic_set(&rreq->nr_outstanding, 1);
io_iter = rreq->io_iter;
do {
- _debug("submit %llx + %zx >= %llx",
+ _debug("submit %llx + %llx >= %llx",
rreq->start, rreq->submitted, rreq->i_size);
if (rreq->origin == NETFS_DIO_READ &&
rreq->start + rreq->submitted >= rreq->i_size)
break;
- if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index))
+ if (!netfs_rreq_submit_slice(rreq, &io_iter))
break;
if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 5e77618a7940..5f0f438e5d21 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -7,6 +7,7 @@
#include <linux/module.h>
#include <linux/export.h>
+#include <linux/mempool.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "internal.h"
@@ -23,6 +24,11 @@ unsigned netfs_debug;
module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
+static struct kmem_cache *netfs_request_slab;
+static struct kmem_cache *netfs_subrequest_slab;
+mempool_t netfs_request_pool;
+mempool_t netfs_subrequest_pool;
+
#ifdef CONFIG_PROC_FS
LIST_HEAD(netfs_io_requests);
DEFINE_SPINLOCK(netfs_proc_lock);
@@ -31,9 +37,9 @@ static const char *netfs_origins[nr__netfs_io_origin] = {
[NETFS_READAHEAD] = "RA",
[NETFS_READPAGE] = "RP",
[NETFS_READ_FOR_WRITE] = "RW",
+ [NETFS_COPY_TO_CACHE] = "CC",
[NETFS_WRITEBACK] = "WB",
[NETFS_WRITETHROUGH] = "WT",
- [NETFS_LAUNDER_WRITE] = "LW",
[NETFS_UNBUFFERED_WRITE] = "UW",
[NETFS_DIO_READ] = "DR",
[NETFS_DIO_WRITE] = "DW",
@@ -56,7 +62,7 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v)
rreq = list_entry(v, struct netfs_io_request, proc_link);
seq_printf(m,
- "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx",
+ "%08x %s %3d %2lx %4d %3d @%04llx %llx/%llx",
rreq->debug_id,
netfs_origins[rreq->origin],
refcount_read(&rreq->ref),
@@ -98,25 +104,54 @@ static int __init netfs_init(void)
{
int ret = -ENOMEM;
+ netfs_request_slab = kmem_cache_create("netfs_request",
+ sizeof(struct netfs_io_request), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
+ NULL);
+ if (!netfs_request_slab)
+ goto error_req;
+
+ if (mempool_init_slab_pool(&netfs_request_pool, 100, netfs_request_slab) < 0)
+ goto error_reqpool;
+
+ netfs_subrequest_slab = kmem_cache_create("netfs_subrequest",
+ sizeof(struct netfs_io_subrequest), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
+ NULL);
+ if (!netfs_subrequest_slab)
+ goto error_subreq;
+
+ if (mempool_init_slab_pool(&netfs_subrequest_pool, 100, netfs_subrequest_slab) < 0)
+ goto error_subreqpool;
+
if (!proc_mkdir("fs/netfs", NULL))
- goto error;
+ goto error_proc;
if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,
&netfs_requests_seq_ops))
- goto error_proc;
+ goto error_procfile;
#ifdef CONFIG_FSCACHE_STATS
if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
netfs_stats_show))
- goto error_proc;
+ goto error_procfile;
#endif
ret = fscache_init();
if (ret < 0)
- goto error_proc;
+ goto error_fscache;
return 0;
-error_proc:
+error_fscache:
+error_procfile:
remove_proc_entry("fs/netfs", NULL);
-error:
+error_proc:
+ mempool_exit(&netfs_subrequest_pool);
+error_subreqpool:
+ kmem_cache_destroy(netfs_subrequest_slab);
+error_subreq:
+ mempool_exit(&netfs_request_pool);
+error_reqpool:
+ kmem_cache_destroy(netfs_request_slab);
+error_req:
return ret;
}
fs_initcall(netfs_init);
@@ -125,5 +160,9 @@ static void __exit netfs_exit(void)
{
fscache_exit();
remove_proc_entry("fs/netfs", NULL);
+ mempool_exit(&netfs_subrequest_pool);
+ kmem_cache_destroy(netfs_subrequest_slab);
+ mempool_exit(&netfs_request_pool);
+ kmem_cache_destroy(netfs_request_slab);
}
module_exit(netfs_exit);
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 90051ced8e2a..bc1fc54fb724 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -177,13 +177,11 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback);
*/
void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
- struct netfs_folio *finfo = NULL;
+ struct netfs_folio *finfo;
size_t flen = folio_size(folio);
_enter("{%lx},%zx,%zx", folio->index, offset, length);
- folio_wait_fscache(folio);
-
if (!folio_test_private(folio))
return;
@@ -248,12 +246,6 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp)
if (folio_test_private(folio))
return false;
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
-
fscache_note_page_release(netfs_i_cookie(ctx));
return true;
}
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index 610ceb5bd86c..c90d482b1650 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -6,6 +6,8 @@
*/
#include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/delay.h>
#include "internal.h"
/*
@@ -20,17 +22,22 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
struct inode *inode = file ? file_inode(file) : mapping->host;
struct netfs_inode *ctx = netfs_inode(inode);
struct netfs_io_request *rreq;
+ mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool;
+ struct kmem_cache *cache = mempool->pool_data;
bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE ||
origin == NETFS_DIO_READ ||
origin == NETFS_DIO_WRITE);
bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx);
int ret;
- rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
- GFP_KERNEL);
- if (!rreq)
- return ERR_PTR(-ENOMEM);
+ for (;;) {
+ rreq = mempool_alloc(mempool, GFP_KERNEL);
+ if (rreq)
+ break;
+ msleep(10);
+ }
+ memset(rreq, 0, kmem_cache_size(cache));
rreq->start = start;
rreq->len = len;
rreq->upper_len = len;
@@ -40,19 +47,27 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
rreq->inode = inode;
rreq->i_size = i_size_read(inode);
rreq->debug_id = atomic_inc_return(&debug_ids);
+ rreq->wsize = INT_MAX;
+ spin_lock_init(&rreq->lock);
+ INIT_LIST_HEAD(&rreq->io_streams[0].subrequests);
+ INIT_LIST_HEAD(&rreq->io_streams[1].subrequests);
INIT_LIST_HEAD(&rreq->subrequests);
INIT_WORK(&rreq->work, NULL);
refcount_set(&rreq->ref, 1);
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
- if (cached)
+ if (cached) {
__set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+ if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags))
+ /* Filesystem uses deprecated PG_private_2 marking. */
+ __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
+ }
if (file && file->f_flags & O_NONBLOCK)
__set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
if (rreq->netfs_ops->init_request) {
ret = rreq->netfs_ops->init_request(rreq, file);
if (ret < 0) {
- kfree(rreq);
+ mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool);
return ERR_PTR(ret);
}
}
@@ -74,6 +89,8 @@ void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace
void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
{
struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream;
+ int s;
while (!list_empty(&rreq->subrequests)) {
subreq = list_first_entry(&rreq->subrequests,
@@ -82,6 +99,25 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
netfs_put_subrequest(subreq, was_async,
netfs_sreq_trace_put_clear);
}
+
+ for (s = 0; s < ARRAY_SIZE(rreq->io_streams); s++) {
+ stream = &rreq->io_streams[s];
+ while (!list_empty(&stream->subrequests)) {
+ subreq = list_first_entry(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ list_del(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, was_async,
+ netfs_sreq_trace_put_clear);
+ }
+ }
+}
+
+static void netfs_free_request_rcu(struct rcu_head *rcu)
+{
+ struct netfs_io_request *rreq = container_of(rcu, struct netfs_io_request, rcu);
+
+ mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool);
+ netfs_stat_d(&netfs_n_rh_rreq);
}
static void netfs_free_request(struct work_struct *work)
@@ -106,8 +142,7 @@ static void netfs_free_request(struct work_struct *work)
}
kvfree(rreq->direct_bv);
}
- kfree_rcu(rreq, rcu);
- netfs_stat_d(&netfs_n_rh_rreq);
+ call_rcu(&rreq->rcu, netfs_free_request_rcu);
}
void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
@@ -139,19 +174,25 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
-
- subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?:
- sizeof(struct netfs_io_subrequest),
- GFP_KERNEL);
- if (subreq) {
- INIT_WORK(&subreq->work, NULL);
- INIT_LIST_HEAD(&subreq->rreq_link);
- refcount_set(&subreq->ref, 2);
- subreq->rreq = rreq;
- netfs_get_request(rreq, netfs_rreq_trace_get_subreq);
- netfs_stat(&netfs_n_rh_sreq);
+ mempool_t *mempool = rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool;
+ struct kmem_cache *cache = mempool->pool_data;
+
+ for (;;) {
+ subreq = mempool_alloc(rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool,
+ GFP_KERNEL);
+ if (subreq)
+ break;
+ msleep(10);
}
+ memset(subreq, 0, kmem_cache_size(cache));
+ INIT_WORK(&subreq->work, NULL);
+ INIT_LIST_HEAD(&subreq->rreq_link);
+ refcount_set(&subreq->ref, 2);
+ subreq->rreq = rreq;
+ subreq->debug_index = atomic_inc_return(&rreq->subreq_counter);
+ netfs_get_request(rreq, netfs_rreq_trace_get_subreq);
+ netfs_stat(&netfs_n_rh_sreq);
return subreq;
}
@@ -173,7 +214,7 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
trace_netfs_sreq(subreq, netfs_sreq_trace_free);
if (rreq->netfs_ops->free_subrequest)
rreq->netfs_ops->free_subrequest(subreq);
- kfree(subreq);
+ mempool_free(subreq, rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool);
netfs_stat_d(&netfs_n_rh_sreq);
netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
}
diff --git a/fs/netfs/output.c b/fs/netfs/output.c
deleted file mode 100644
index 625eb68f3e5a..000000000000
--- a/fs/netfs/output.c
+++ /dev/null
@@ -1,478 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* Network filesystem high-level write support.
- *
- * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/writeback.h>
-#include <linux/pagevec.h>
-#include "internal.h"
-
-/**
- * netfs_create_write_request - Create a write operation.
- * @wreq: The write request this is storing from.
- * @dest: The destination type
- * @start: Start of the region this write will modify
- * @len: Length of the modification
- * @worker: The worker function to handle the write(s)
- *
- * Allocate a write operation, set it up and add it to the list on a write
- * request.
- */
-struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq,
- enum netfs_io_source dest,
- loff_t start, size_t len,
- work_func_t worker)
-{
- struct netfs_io_subrequest *subreq;
-
- subreq = netfs_alloc_subrequest(wreq);
- if (subreq) {
- INIT_WORK(&subreq->work, worker);
- subreq->source = dest;
- subreq->start = start;
- subreq->len = len;
- subreq->debug_index = wreq->subreq_counter++;
-
- switch (subreq->source) {
- case NETFS_UPLOAD_TO_SERVER:
- netfs_stat(&netfs_n_wh_upload);
- break;
- case NETFS_WRITE_TO_CACHE:
- netfs_stat(&netfs_n_wh_write);
- break;
- default:
- BUG();
- }
-
- subreq->io_iter = wreq->io_iter;
- iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start);
- iov_iter_truncate(&subreq->io_iter, subreq->len);
-
- trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
- refcount_read(&subreq->ref),
- netfs_sreq_trace_new);
- atomic_inc(&wreq->nr_outstanding);
- list_add_tail(&subreq->rreq_link, &wreq->subrequests);
- trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
- }
-
- return subreq;
-}
-EXPORT_SYMBOL(netfs_create_write_request);
-
-/*
- * Process a completed write request once all the component operations have
- * been completed.
- */
-static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async)
-{
- struct netfs_io_subrequest *subreq;
- struct netfs_inode *ctx = netfs_inode(wreq->inode);
- size_t transferred = 0;
-
- _enter("R=%x[]", wreq->debug_id);
-
- trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
-
- list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
- if (subreq->error || subreq->transferred == 0)
- break;
- transferred += subreq->transferred;
- if (subreq->transferred < subreq->len)
- break;
- }
- wreq->transferred = transferred;
-
- list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
- if (!subreq->error)
- continue;
- switch (subreq->source) {
- case NETFS_UPLOAD_TO_SERVER:
- /* Depending on the type of failure, this may prevent
- * writeback completion unless we're in disconnected
- * mode.
- */
- if (!wreq->error)
- wreq->error = subreq->error;
- break;
-
- case NETFS_WRITE_TO_CACHE:
- /* Failure doesn't prevent writeback completion unless
- * we're in disconnected mode.
- */
- if (subreq->error != -ENOBUFS)
- ctx->ops->invalidate_cache(wreq);
- break;
-
- default:
- WARN_ON_ONCE(1);
- if (!wreq->error)
- wreq->error = -EIO;
- return;
- }
- }
-
- wreq->cleanup(wreq);
-
- if (wreq->origin == NETFS_DIO_WRITE &&
- wreq->mapping->nrpages) {
- pgoff_t first = wreq->start >> PAGE_SHIFT;
- pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
- invalidate_inode_pages2_range(wreq->mapping, first, last);
- }
-
- if (wreq->origin == NETFS_DIO_WRITE)
- inode_dio_end(wreq->inode);
-
- _debug("finished");
- trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
- clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
- wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
-
- if (wreq->iocb) {
- wreq->iocb->ki_pos += transferred;
- if (wreq->iocb->ki_complete)
- wreq->iocb->ki_complete(
- wreq->iocb, wreq->error ? wreq->error : transferred);
- }
-
- netfs_clear_subrequests(wreq, was_async);
- netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete);
-}
-
-/*
- * Deal with the completion of writing the data to the cache.
- */
-void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_io_subrequest *subreq = _op;
- struct netfs_io_request *wreq = subreq->rreq;
- unsigned int u;
-
- _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
-
- switch (subreq->source) {
- case NETFS_UPLOAD_TO_SERVER:
- netfs_stat(&netfs_n_wh_upload_done);
- break;
- case NETFS_WRITE_TO_CACHE:
- netfs_stat(&netfs_n_wh_write_done);
- break;
- case NETFS_INVALID_WRITE:
- break;
- default:
- BUG();
- }
-
- if (IS_ERR_VALUE(transferred_or_error)) {
- subreq->error = transferred_or_error;
- trace_netfs_failure(wreq, subreq, transferred_or_error,
- netfs_fail_write);
- goto failed;
- }
-
- if (WARN(transferred_or_error > subreq->len - subreq->transferred,
- "Subreq excess write: R%x[%x] %zd > %zu - %zu",
- wreq->debug_id, subreq->debug_index,
- transferred_or_error, subreq->len, subreq->transferred))
- transferred_or_error = subreq->len - subreq->transferred;
-
- subreq->error = 0;
- subreq->transferred += transferred_or_error;
-
- if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
- pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n",
- wreq->debug_id, subreq->debug_index,
- iov_iter_count(&subreq->io_iter), subreq->len,
- subreq->transferred, subreq->io_iter.iter_type);
-
- if (subreq->transferred < subreq->len)
- goto incomplete;
-
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
-out:
- trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
-
- /* If we decrement nr_outstanding to 0, the ref belongs to us. */
- u = atomic_dec_return(&wreq->nr_outstanding);
- if (u == 0)
- netfs_write_terminated(wreq, was_async);
- else if (u == 1)
- wake_up_var(&wreq->nr_outstanding);
-
- netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
- return;
-
-incomplete:
- if (transferred_or_error == 0) {
- if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
- subreq->error = -ENODATA;
- goto failed;
- }
- } else {
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
- }
-
- __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
- goto out;
-
-failed:
- switch (subreq->source) {
- case NETFS_WRITE_TO_CACHE:
- netfs_stat(&netfs_n_wh_write_failed);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
- break;
- case NETFS_UPLOAD_TO_SERVER:
- netfs_stat(&netfs_n_wh_upload_failed);
- set_bit(NETFS_RREQ_FAILED, &wreq->flags);
- wreq->error = subreq->error;
- break;
- default:
- break;
- }
- goto out;
-}
-EXPORT_SYMBOL(netfs_write_subrequest_terminated);
-
-static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq)
-{
- struct netfs_io_request *wreq = subreq->rreq;
- struct netfs_cache_resources *cres = &wreq->cache_resources;
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
-
- cres->ops->write(cres, subreq->start, &subreq->io_iter,
- netfs_write_subrequest_terminated, subreq);
-}
-
-static void netfs_write_to_cache_op_worker(struct work_struct *work)
-{
- struct netfs_io_subrequest *subreq =
- container_of(work, struct netfs_io_subrequest, work);
-
- netfs_write_to_cache_op(subreq);
-}
-
-/**
- * netfs_queue_write_request - Queue a write request for attention
- * @subreq: The write request to be queued
- *
- * Queue the specified write request for processing by a worker thread. We
- * pass the caller's ref on the request to the worker thread.
- */
-void netfs_queue_write_request(struct netfs_io_subrequest *subreq)
-{
- if (!queue_work(system_unbound_wq, &subreq->work))
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip);
-}
-EXPORT_SYMBOL(netfs_queue_write_request);
-
-/*
- * Set up a op for writing to the cache.
- */
-static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq)
-{
- struct netfs_cache_resources *cres = &wreq->cache_resources;
- struct netfs_io_subrequest *subreq;
- struct netfs_inode *ctx = netfs_inode(wreq->inode);
- struct fscache_cookie *cookie = netfs_i_cookie(ctx);
- loff_t start = wreq->start;
- size_t len = wreq->len;
- int ret;
-
- if (!fscache_cookie_enabled(cookie)) {
- clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags);
- return;
- }
-
- _debug("write to cache");
- ret = fscache_begin_write_operation(cres, cookie);
- if (ret < 0)
- return;
-
- ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len,
- i_size_read(wreq->inode), true);
- if (ret < 0)
- return;
-
- subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len,
- netfs_write_to_cache_op_worker);
- if (!subreq)
- return;
-
- netfs_write_to_cache_op(subreq);
-}
-
-/*
- * Begin the process of writing out a chunk of data.
- *
- * We are given a write request that holds a series of dirty regions and
- * (partially) covers a sequence of folios, all of which are present. The
- * pages must have been marked as writeback as appropriate.
- *
- * We need to perform the following steps:
- *
- * (1) If encrypting, create an output buffer and encrypt each block of the
- * data into it, otherwise the output buffer will point to the original
- * folios.
- *
- * (2) If the data is to be cached, set up a write op for the entire output
- * buffer to the cache, if the cache wants to accept it.
- *
- * (3) If the data is to be uploaded (ie. not merely cached):
- *
- * (a) If the data is to be compressed, create a compression buffer and
- * compress the data into it.
- *
- * (b) For each destination we want to upload to, set up write ops to write
- * to that destination. We may need multiple writes if the data is not
- * contiguous or the span exceeds wsize for a server.
- */
-int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
- enum netfs_write_trace what)
-{
- struct netfs_inode *ctx = netfs_inode(wreq->inode);
-
- _enter("R=%x %llx-%llx f=%lx",
- wreq->debug_id, wreq->start, wreq->start + wreq->len - 1,
- wreq->flags);
-
- trace_netfs_write(wreq, what);
- if (wreq->len == 0 || wreq->iter.count == 0) {
- pr_err("Zero-sized write [R=%x]\n", wreq->debug_id);
- return -EIO;
- }
-
- if (wreq->origin == NETFS_DIO_WRITE)
- inode_dio_begin(wreq->inode);
-
- wreq->io_iter = wreq->iter;
-
- /* ->outstanding > 0 carries a ref */
- netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
- atomic_set(&wreq->nr_outstanding, 1);
-
- /* Start the encryption/compression going. We can do that in the
- * background whilst we generate a list of write ops that we want to
- * perform.
- */
- // TODO: Encrypt or compress the region as appropriate
-
- /* We need to write all of the region to the cache */
- if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
- netfs_set_up_write_to_cache(wreq);
-
- /* However, we don't necessarily write all of the region to the server.
- * Caching of reads is being managed this way also.
- */
- if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
- ctx->ops->create_write_requests(wreq, wreq->start, wreq->len);
-
- if (atomic_dec_and_test(&wreq->nr_outstanding))
- netfs_write_terminated(wreq, false);
-
- if (!may_wait)
- return -EIOCBQUEUED;
-
- wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
- return wreq->error;
-}
-
-/*
- * Begin a write operation for writing through the pagecache.
- */
-struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
-{
- struct netfs_io_request *wreq;
- struct file *file = iocb->ki_filp;
-
- wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len,
- NETFS_WRITETHROUGH);
- if (IS_ERR(wreq))
- return wreq;
-
- trace_netfs_write(wreq, netfs_write_trace_writethrough);
-
- __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
- iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0);
- wreq->io_iter = wreq->iter;
-
- /* ->outstanding > 0 carries a ref */
- netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
- atomic_set(&wreq->nr_outstanding, 1);
- return wreq;
-}
-
-static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final)
-{
- struct netfs_inode *ictx = netfs_inode(wreq->inode);
- unsigned long long start;
- size_t len;
-
- if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
- return;
-
- start = wreq->start + wreq->submitted;
- len = wreq->iter.count - wreq->submitted;
- if (!final) {
- len /= wreq->wsize; /* Round to number of maximum packets */
- len *= wreq->wsize;
- }
-
- ictx->ops->create_write_requests(wreq, start, len);
- wreq->submitted += len;
-}
-
-/*
- * Advance the state of the write operation used when writing through the
- * pagecache. Data has been copied into the pagecache that we need to append
- * to the request. If we've added more than wsize then we need to create a new
- * subrequest.
- */
-int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end)
-{
- _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u",
- wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end);
-
- wreq->iter.count += copied;
- wreq->io_iter.count += copied;
- if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize)
- netfs_submit_writethrough(wreq, false);
-
- return wreq->error;
-}
-
-/*
- * End a write operation used when writing through the pagecache.
- */
-int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb)
-{
- int ret = -EIOCBQUEUED;
-
- _enter("ic=%zu sb=%zu ws=%u",
- wreq->iter.count, wreq->submitted, wreq->wsize);
-
- if (wreq->submitted < wreq->io_iter.count)
- netfs_submit_writethrough(wreq, true);
-
- if (atomic_dec_and_test(&wreq->nr_outstanding))
- netfs_write_terminated(wreq, false);
-
- if (is_sync_kiocb(iocb)) {
- wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
- ret = wreq->error;
- }
-
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
- return ret;
-}
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index deeba9f9dcf5..0892768eea32 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -10,9 +10,8 @@
#include "internal.h"
atomic_t netfs_n_rh_dio_read;
-atomic_t netfs_n_rh_dio_write;
atomic_t netfs_n_rh_readahead;
-atomic_t netfs_n_rh_readpage;
+atomic_t netfs_n_rh_read_folio;
atomic_t netfs_n_rh_rreq;
atomic_t netfs_n_rh_sreq;
atomic_t netfs_n_rh_download;
@@ -29,6 +28,10 @@ atomic_t netfs_n_rh_write_begin;
atomic_t netfs_n_rh_write_done;
atomic_t netfs_n_rh_write_failed;
atomic_t netfs_n_rh_write_zskip;
+atomic_t netfs_n_wh_buffered_write;
+atomic_t netfs_n_wh_writethrough;
+atomic_t netfs_n_wh_dio_write;
+atomic_t netfs_n_wh_writepages;
atomic_t netfs_n_wh_wstream_conflict;
atomic_t netfs_n_wh_upload;
atomic_t netfs_n_wh_upload_done;
@@ -39,13 +42,17 @@ atomic_t netfs_n_wh_write_failed;
int netfs_stats_show(struct seq_file *m, void *v)
{
- seq_printf(m, "Netfs : DR=%u DW=%u RA=%u RP=%u WB=%u WBZ=%u\n",
+ seq_printf(m, "Netfs : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n",
atomic_read(&netfs_n_rh_dio_read),
- atomic_read(&netfs_n_rh_dio_write),
atomic_read(&netfs_n_rh_readahead),
- atomic_read(&netfs_n_rh_readpage),
+ atomic_read(&netfs_n_rh_read_folio),
atomic_read(&netfs_n_rh_write_begin),
atomic_read(&netfs_n_rh_write_zskip));
+ seq_printf(m, "Netfs : BW=%u WT=%u DW=%u WP=%u\n",
+ atomic_read(&netfs_n_wh_buffered_write),
+ atomic_read(&netfs_n_wh_writethrough),
+ atomic_read(&netfs_n_wh_dio_write),
+ atomic_read(&netfs_n_wh_writepages));
seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n",
atomic_read(&netfs_n_rh_zero),
atomic_read(&netfs_n_rh_short_read),
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
new file mode 100644
index 000000000000..60112e4b2c5e
--- /dev/null
+++ b/fs/netfs/write_collect.c
@@ -0,0 +1,808 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem write subrequest result collection, assessment
+ * and retrying.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+/* Notes made in the collector */
+#define HIT_PENDING 0x01 /* A front op was still pending */
+#define SOME_EMPTY 0x02 /* One or more streams are empty */
+#define ALL_EMPTY 0x04 /* All streams are empty */
+#define MAYBE_DISCONTIG 0x08 /* A front op may be discontiguous (rounded to PAGE_SIZE) */
+#define NEED_REASSESS 0x10 /* Need to loop round and reassess */
+#define REASSESS_DISCONTIG 0x20 /* Reassess discontiguity if contiguity advances */
+#define MADE_PROGRESS 0x40 /* Made progress cleaning up a stream or the folio set */
+#define BUFFERED 0x80 /* The pagecache needs cleaning up */
+#define NEED_RETRY 0x100 /* A front op requests retrying */
+#define SAW_FAILURE 0x200 /* One stream or other hit a permanent failure */
+
+/*
+ * Successful completion of write of a folio to the server and/or cache. Note
+ * that we are not allowed to lock the folio here on pain of deadlocking with
+ * truncate.
+ *
+ * Any private data attached to the folio is dealt with first: a streaming
+ * write record is detached and freed, the copy-to-cache marker is detached,
+ * and a write group pointer is detached only if the folio was not redirtied.
+ * Writeback on the folio is then ended.
+ *
+ * Returns the number of group references (0 or 1) that the caller should
+ * release on behalf of this folio.
+ */
+int netfs_folio_written_back(struct folio *folio)
+{
+ enum netfs_folio_trace why = netfs_folio_trace_clear;
+ struct netfs_folio *finfo;
+ struct netfs_group *group = NULL;
+ int gcount = 0;
+
+ if ((finfo = netfs_folio_info(folio))) {
+ /* Streaming writes cannot be redirtied whilst under writeback,
+ * so discard the streaming record.
+ */
+ folio_detach_private(folio);
+ group = finfo->netfs_group;
+ gcount++;
+ kfree(finfo);
+ why = netfs_folio_trace_clear_s;
+ goto end_wb;
+ }
+
+ if ((group = netfs_folio_group(folio))) {
+ if (group == NETFS_FOLIO_COPY_TO_CACHE) {
+ why = netfs_folio_trace_clear_cc;
+ folio_detach_private(folio);
+ goto end_wb;
+ }
+
+ /* Need to detach the group pointer if the page didn't get
+ * redirtied. If it has been redirtied, then it must be within
+ * the same group.
+ */
+ why = netfs_folio_trace_redirtied;
+ if (!folio_test_dirty(folio)) {
+ folio_detach_private(folio);
+ gcount++;
+ why = netfs_folio_trace_clear_g;
+ }
+ }
+
+end_wb:
+ trace_netfs_folio(folio, why);
+ folio_end_writeback(folio);
+ return gcount;
+}
+
+/*
+ * Get hold of a folio we have under writeback. We don't want to get the
+ * refcount on it.
+ *
+ * The folio covering file position @pos is looked up in the request's mapping
+ * under the RCU read lock, repeating the load until the same entry is seen
+ * on reload. It is a BUG if the slot is empty or holds a value entry, and a
+ * one-off warning is emitted (plus a trace) if the folio found is not marked
+ * as being under writeback. The folio is returned in either of the latter
+ * two cases.
+ */
+static struct folio *netfs_writeback_lookup_folio(struct netfs_io_request *wreq, loff_t pos)
+{
+ XA_STATE(xas, &wreq->mapping->i_pages, pos / PAGE_SIZE);
+ struct folio *folio;
+
+ rcu_read_lock();
+
+ for (;;) {
+ xas_reset(&xas);
+ folio = xas_load(&xas);
+ if (xas_retry(&xas, folio))
+ continue;
+
+ if (!folio || xa_is_value(folio))
+ kdebug("R=%08x: folio %lx (%llx) not present",
+ wreq->debug_id, xas.xa_index, pos / PAGE_SIZE);
+ BUG_ON(!folio || xa_is_value(folio));
+
+ if (folio == xas_reload(&xas))
+ break;
+ }
+
+ rcu_read_unlock();
+
+ if (WARN_ONCE(!folio_test_writeback(folio),
+ "R=%08x: folio %lx is not under writeback\n",
+ wreq->debug_id, folio->index)) {
+ trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);
+ }
+ return folio;
+}
+
+/*
+ * Unlock any folios we've finished with.
+ *
+ * Starting at wreq->cleaned_to, each folio is looked up in turn and, if the
+ * end of its data (clamped to wreq->i_size) lies at or below @collected_to,
+ * writeback is ended on it and wreq->cleaned_to is moved to the end of the
+ * folio. wreq->contiguity is pushed out to the end of every folio examined.
+ * MADE_PROGRESS is set in *@notes for each folio cleaned.
+ */
+static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
+ unsigned long long collected_to,
+ unsigned int *notes)
+{
+ for (;;) {
+ struct folio *folio;
+ struct netfs_folio *finfo;
+ unsigned long long fpos, fend;
+ size_t fsize, flen;
+
+ folio = netfs_writeback_lookup_folio(wreq, wreq->cleaned_to);
+
+ fpos = folio_pos(folio);
+ fsize = folio_size(folio);
+ finfo = netfs_folio_info(folio);
+ flen = finfo ? finfo->dirty_offset + finfo->dirty_len : fsize; /* End of the dirty data for a streaming write, else the whole folio */
+
+ fend = min_t(unsigned long long, fpos + flen, wreq->i_size);
+
+ trace_netfs_collect_folio(wreq, folio, fend, collected_to);
+
+ if (fpos + fsize > wreq->contiguity) {
+ trace_netfs_collect_contig(wreq, fpos + fsize,
+ netfs_contig_trace_unlock);
+ wreq->contiguity = fpos + fsize;
+ }
+
+ /* Unlock any folio we've transferred all of. */
+ if (collected_to < fend)
+ break;
+
+ wreq->nr_group_rel += netfs_folio_written_back(folio);
+ wreq->cleaned_to = fpos + fsize;
+ *notes |= MADE_PROGRESS;
+
+ if (fpos + fsize >= collected_to)
+ break;
+ }
+}
+
+/*
+ * Retry the subrequests on a single stream that have asked to be retried.
+ *
+ * For the upload stream, the filesystem's ->retry_request() op is called
+ * first if it provides one. Nothing further is done if the stream is marked
+ * failed.
+ *
+ * If the stream has no ->prepare_write() op, each subrequest flagged
+ * NETFS_SREQ_NEED_RETRY is reissued as it stands, stopping at the first
+ * subrequest flagged NETFS_SREQ_FAILED.
+ *
+ * Otherwise, each span of contiguous retryable subrequests is re-divided:
+ * the existing subrequests are reused with their sizes renegotiated through
+ * ->prepare_write(), any surplus subrequests are discarded and extra ones
+ * are allocated if the span needs more than it had.
+ */
+static void netfs_retry_write_stream(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream)
+{
+ struct list_head *next;
+
+ _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr);
+
+ if (list_empty(&stream->subrequests))
+ return;
+
+ if (stream->source == NETFS_UPLOAD_TO_SERVER &&
+ wreq->netfs_ops->retry_request)
+ wreq->netfs_ops->retry_request(wreq, stream);
+
+ if (unlikely(stream->failed))
+ return;
+
+ /* If there's no renegotiation to do, just resend each failed subreq. */
+ if (!stream->prepare_write) {
+ struct netfs_io_subrequest *subreq;
+
+ list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+ if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
+ break;
+ if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
+ __set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ netfs_reissue_write(stream, subreq);
+ }
+ }
+ return;
+ }
+
+ next = stream->subrequests.next;
+
+ do {
+ struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp;
+ unsigned long long start, len;
+ size_t part;
+ bool boundary = false;
+
+ /* Go through the stream and find the next span of contiguous
+ * data that we then rejig (cifs, for example, needs the wsize
+ * renegotiating) and reissue.
+ */
+ from = list_entry(next, struct netfs_io_subrequest, rreq_link);
+ to = from;
+ start = from->start + from->transferred;
+ len = from->len - from->transferred;
+
+ if (test_bit(NETFS_SREQ_FAILED, &from->flags) ||
+ !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags))
+ return;
+
+ list_for_each_continue(next, &stream->subrequests) {
+ subreq = list_entry(next, struct netfs_io_subrequest, rreq_link);
+ if (subreq->start + subreq->transferred != start + len ||
+ test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) ||
+ !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))
+ break;
+ to = subreq;
+ len += to->len;
+ }
+
+ /* Work through the sublist. */
+ subreq = from;
+ list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
+ if (!len)
+ break;
+ /* Renegotiate max_len (wsize) */
+ trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+ __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ __set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+ stream->prepare_write(subreq);
+
+ part = min(len, subreq->max_len);
+ subreq->len = part;
+ subreq->start = start;
+ subreq->transferred = 0;
+ len -= part;
+ start += part;
+ if (len && subreq == to &&
+ __test_and_clear_bit(NETFS_SREQ_BOUNDARY, &to->flags))
+ boundary = true; /* Remember to put the mark on the last new subreq */
+
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ netfs_reissue_write(stream, subreq);
+ if (subreq == to)
+ break;
+ }
+
+ /* If we managed to use fewer subreqs, we can discard the
+ * excess; if we used the same number, then we're done.
+ */
+ if (!len) {
+ if (subreq == to)
+ continue;
+ list_for_each_entry_safe_from(subreq, tmp,
+ &stream->subrequests, rreq_link) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
+ list_del(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
+ if (subreq == to)
+ break;
+ }
+ continue;
+ }
+
+ /* We ran out of subrequests, so we need to allocate some more
+ * and insert them after.
+ */
+ do {
+ subreq = netfs_alloc_subrequest(wreq);
+ subreq->source = to->source;
+ subreq->start = start;
+ subreq->max_len = len;
+ subreq->max_nr_segs = INT_MAX;
+ subreq->debug_index = atomic_inc_return(&wreq->subreq_counter);
+ subreq->stream_nr = to->stream_nr;
+ __set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+
+ trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+ refcount_read(&subreq->ref),
+ netfs_sreq_trace_new);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+
+ list_add(&subreq->rreq_link, &to->rreq_link);
+ to = list_next_entry(to, rreq_link); /* to now refers to the subreq just inserted */
+ trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+
+ switch (stream->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload);
+ subreq->max_len = min(len, wreq->wsize);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ stream->prepare_write(subreq);
+
+ part = min(len, subreq->max_len);
+ subreq->len = subreq->transferred + part;
+ len -= part;
+ start += part;
+ if (!len && boundary) {
+ __set_bit(NETFS_SREQ_BOUNDARY, &to->flags);
+ boundary = false;
+ }
+
+ netfs_reissue_write(stream, subreq);
+ if (!len)
+ break;
+
+ } while (len);
+
+ } while (!list_is_head(next, &stream->subrequests));
+}
+
+/*
+ * Perform retries on the streams that need it. If we're doing content
+ * encryption and the server copy changed due to a third-party write, we may
+ * need to do an RMW cycle and also rewrite the data to the cache.
+ *
+ * Every subrequest on every active stream is first waited upon until its
+ * NETFS_SREQ_IN_PROGRESS flag clears; then each stream that has need_retry
+ * set has the flag cleared and its subrequests retried. The encryption
+ * handling mentioned above is not implemented yet (see the TODOs below).
+ */
+static void netfs_retry_writes(struct netfs_io_request *wreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream;
+ int s;
+
+ /* Wait for all outstanding I/O to quiesce before performing retries as
+ * we may need to renegotiate the I/O sizes.
+ */
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (!stream->active)
+ continue;
+
+ list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+ wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+ }
+ }
+
+ // TODO: Enc: Fetch changed partial pages
+ // TODO: Enc: Reencrypt content if needed.
+ // TODO: Enc: Wind back transferred point.
+ // TODO: Enc: Mark cache pages for retry.
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->need_retry) {
+ stream->need_retry = false;
+ netfs_retry_write_stream(wreq, stream);
+ }
+ }
+}
+
+/*
+ * Collect and assess the results of various write subrequests. We may need to
+ * retry some of the results - or even do an RMW cycle for content crypto.
+ *
+ * Note that we have a number of parallel, overlapping lists of subrequests,
+ * one to the server and one to the local cache for example, which may not be
+ * the same size or starting position and may not even correspond in boundary
+ * alignment.
+ *
+ * Each pass over the streams accumulates flags in a local "notes" word (see
+ * the definitions at the top of the file) and the function loops back to
+ * reassess_streams for as long as a pass made progress or asked for a
+ * reassessment. If a front subrequest needs retrying, the retry is kicked
+ * off and the function returns. Group refs counted in wreq->nr_group_rel
+ * are released on the way out.
+ */
+static void netfs_collect_write_results(struct netfs_io_request *wreq)
+{
+ struct netfs_io_subrequest *front, *remove;
+ struct netfs_io_stream *stream;
+ unsigned long long collected_to;
+ unsigned int notes;
+ int s;
+
+ _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
+ trace_netfs_collect(wreq);
+ trace_netfs_rreq(wreq, netfs_rreq_trace_collect);
+
+reassess_streams:
+ smp_rmb();
+ collected_to = ULLONG_MAX;
+ if (wreq->origin == NETFS_WRITEBACK)
+ notes = ALL_EMPTY | BUFFERED | MAYBE_DISCONTIG;
+ else if (wreq->origin == NETFS_WRITETHROUGH)
+ notes = ALL_EMPTY | BUFFERED;
+ else
+ notes = ALL_EMPTY;
+
+ /* Remove completed subrequests from the front of the streams and
+ * advance the completion point on each stream. We stop when we hit
+ * something that's in progress. The issuer thread may be adding stuff
+ * to the tail whilst we're doing this.
+ *
+ * We must not, however, merge in discontiguities that span whole
+ * folios that aren't under writeback. This is made more complicated
+ * by the folios in the gap being of unpredictable sizes - if they even
+ * exist - but we don't want to look them up.
+ */
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ loff_t rstart, rend;
+
+ stream = &wreq->io_streams[s];
+ /* Read active flag before list pointers */
+ if (!smp_load_acquire(&stream->active))
+ continue;
+
+ front = stream->front;
+ while (front) {
+ trace_netfs_collect_sreq(wreq, front);
+ //_debug("sreq [%x] %llx %zx/%zx",
+ // front->debug_index, front->start, front->transferred, front->len);
+
+ /* Stall if there may be a discontinuity. */
+ rstart = round_down(front->start, PAGE_SIZE);
+ if (rstart > wreq->contiguity) {
+ if (wreq->contiguity > stream->collected_to) {
+ trace_netfs_collect_gap(wreq, stream,
+ wreq->contiguity, 'D');
+ stream->collected_to = wreq->contiguity;
+ }
+ notes |= REASSESS_DISCONTIG;
+ break;
+ }
+ rend = round_up(front->start + front->len, PAGE_SIZE);
+ if (rend > wreq->contiguity) {
+ trace_netfs_collect_contig(wreq, rend,
+ netfs_contig_trace_collect);
+ wreq->contiguity = rend;
+ if (notes & REASSESS_DISCONTIG)
+ notes |= NEED_REASSESS;
+ }
+ notes &= ~MAYBE_DISCONTIG;
+
+ /* Stall if the front is still undergoing I/O. */
+ if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) {
+ notes |= HIT_PENDING;
+ break;
+ }
+ smp_rmb(); /* Read counters after I-P flag. */
+
+ if (stream->failed) {
+ stream->collected_to = front->start + front->len;
+ notes |= MADE_PROGRESS | SAW_FAILURE;
+ goto cancel;
+ }
+ if (front->start + front->transferred > stream->collected_to) {
+ stream->collected_to = front->start + front->transferred;
+ stream->transferred = stream->collected_to - wreq->start;
+ notes |= MADE_PROGRESS;
+ }
+ if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
+ stream->failed = true;
+ stream->error = front->error;
+ if (stream->source == NETFS_UPLOAD_TO_SERVER)
+ mapping_set_error(wreq->mapping, front->error);
+ notes |= NEED_REASSESS | SAW_FAILURE;
+ break;
+ }
+ if (front->transferred < front->len) {
+ stream->need_retry = true;
+ notes |= NEED_RETRY | MADE_PROGRESS;
+ break;
+ }
+
+ cancel:
+ /* Remove if completely consumed. */
+ spin_lock(&wreq->lock);
+
+ remove = front;
+ list_del_init(&front->rreq_link);
+ front = list_first_entry_or_null(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ stream->front = front;
+ if (!front) {
+ unsigned long long jump_to = atomic64_read(&wreq->issued_to);
+
+ if (stream->collected_to < jump_to) {
+ trace_netfs_collect_gap(wreq, stream, jump_to, 'A');
+ stream->collected_to = jump_to;
+ }
+ }
+
+ spin_unlock(&wreq->lock);
+ netfs_put_subrequest(remove, false,
+ notes & SAW_FAILURE ?
+ netfs_sreq_trace_put_cancel :
+ netfs_sreq_trace_put_done);
+ }
+
+ if (front)
+ notes &= ~ALL_EMPTY;
+ else
+ notes |= SOME_EMPTY;
+
+ if (stream->collected_to < collected_to)
+ collected_to = stream->collected_to;
+ }
+
+ if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to)
+ wreq->collected_to = collected_to;
+
+ /* If we have an empty stream, we need to jump it forward over any gap
+ * otherwise the collection point will never advance.
+ *
+ * Note that the issuer always adds to the stream with the lowest
+ * so-far submitted start, so if we see two consecutive subreqs in one
+ * stream with nothing between them in another stream, then the second
+ * stream has a gap that can be jumped.
+ */
+ if (notes & SOME_EMPTY) {
+ unsigned long long jump_to = wreq->start + wreq->len;
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->active &&
+ stream->front &&
+ stream->front->start < jump_to)
+ jump_to = stream->front->start;
+ }
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->active &&
+ !stream->front &&
+ stream->collected_to < jump_to) {
+ trace_netfs_collect_gap(wreq, stream, jump_to, 'B');
+ stream->collected_to = jump_to;
+ }
+ }
+ }
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->active)
+ trace_netfs_collect_stream(wreq, stream);
+ }
+
+ trace_netfs_collect_state(wreq, wreq->collected_to, notes);
+
+ /* Unlock any folios that we have now finished with. */
+ if (notes & BUFFERED) {
+ unsigned long long clean_to = min(wreq->collected_to, wreq->contiguity);
+
+ if (wreq->cleaned_to < clean_to)
+ netfs_writeback_unlock_folios(wreq, clean_to, &notes);
+ } else {
+ wreq->cleaned_to = wreq->collected_to;
+ }
+
+ // TODO: Discard encryption buffers
+
+ /* If all streams are discontiguous with the last folio we cleared, we
+ * may need to skip a set of folios.
+ */
+ if ((notes & (MAYBE_DISCONTIG | ALL_EMPTY)) == MAYBE_DISCONTIG) {
+ unsigned long long jump_to = ULLONG_MAX;
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->active && stream->front &&
+ stream->front->start < jump_to)
+ jump_to = stream->front->start;
+ }
+
+ trace_netfs_collect_contig(wreq, jump_to, netfs_contig_trace_jump);
+ wreq->contiguity = jump_to;
+ wreq->cleaned_to = jump_to;
+ wreq->collected_to = jump_to;
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->collected_to < jump_to)
+ stream->collected_to = jump_to;
+ }
+ //cond_resched();
+ notes |= MADE_PROGRESS;
+ goto reassess_streams;
+ }
+
+ if (notes & NEED_RETRY)
+ goto need_retry;
+ if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_unpause);
+ clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags);
+ wake_up_bit(&wreq->flags, NETFS_RREQ_PAUSE);
+ }
+
+ if (notes & NEED_REASSESS) {
+ //cond_resched();
+ goto reassess_streams;
+ }
+ if (notes & MADE_PROGRESS) {
+ //cond_resched();
+ goto reassess_streams;
+ }
+
+out:
+ netfs_put_group_many(wreq->group, wreq->nr_group_rel);
+ wreq->nr_group_rel = 0;
+ _leave(" = %x", notes);
+ return;
+
+need_retry:
+ /* Okay... We're going to have to retry one or both streams. Note
+ * that any partially completed op will have had any wholly transferred
+ * folios removed from it.
+ */
+ _debug("retry");
+ netfs_retry_writes(wreq);
+ goto out;
+}
+
+/*
+ * Perform the collection of subrequests, folios and encryption buffers.
+ *
+ * This is the work item function for a write request. It collects whatever
+ * results are available and then, provided the issuer has flagged that
+ * everything has been queued (NETFS_RREQ_ALL_QUEUED) and the subrequest list
+ * of every active stream is empty, it completes the request: the cache is
+ * invalidated if the cache stream failed, the ->cleanup() hook is called,
+ * pagecache overlapping a DIO write is invalidated, waiters on
+ * NETFS_RREQ_IN_PROGRESS are woken and any kiocb is completed.
+ *
+ * The ref held for the work item is dropped on every exit path.
+ */
+void netfs_write_collection_worker(struct work_struct *work)
+{
+ struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);
+ struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ size_t transferred;
+ int s;
+
+ _enter("R=%x", wreq->debug_id);
+
+ netfs_see_request(wreq, netfs_rreq_trace_see_work);
+ if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) {
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+ return;
+ }
+
+ netfs_collect_write_results(wreq);
+
+ /* We're done when the app thread has finished posting subreqs and all
+ * the queues in all the streams are empty.
+ */
+ if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) {
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+ return;
+ }
+ smp_rmb(); /* Read ALL_QUEUED before lists. */
+
+ /* The amount transferred is the least that any active stream managed. */
+ transferred = LONG_MAX;
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ struct netfs_io_stream *stream = &wreq->io_streams[s];
+ if (!stream->active)
+ continue;
+ if (!list_empty(&stream->subrequests)) {
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+ return;
+ }
+ if (stream->transferred < transferred)
+ transferred = stream->transferred;
+ }
+
+ /* Okay, declare that all I/O is complete. */
+ wreq->transferred = transferred;
+ trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+ if (wreq->io_streams[1].active &&
+ wreq->io_streams[1].failed &&
+ ictx->ops->invalidate_cache) {
+ /* Cache write failure doesn't prevent writeback completion
+ * unless we're in disconnected mode. The op is only called
+ * if the filesystem actually provides it.
+ */
+ ictx->ops->invalidate_cache(wreq);
+ }
+
+ if (wreq->cleanup)
+ wreq->cleanup(wreq);
+
+ if (wreq->origin == NETFS_DIO_WRITE &&
+ wreq->mapping->nrpages) {
+ /* mmap may have got underfoot and we may now have folios
+ * locally covering the region we just wrote. Attempt to
+ * discard the folios, but leave in place any modified locally.
+ * ->write_iter() is prevented from interfering by the DIO
+ * counter.
+ */
+ pgoff_t first = wreq->start >> PAGE_SHIFT;
+ pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+ invalidate_inode_pages2_range(wreq->mapping, first, last);
+ }
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_end(wreq->inode);
+
+ _debug("finished");
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
+ clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+ wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+ if (wreq->iocb) {
+ /* wreq->transferred may still be the LONG_MAX initialiser if
+ * no stream recorded any progress, so never report more than
+ * the length of the request.
+ */
+ size_t written = min_t(unsigned long long,
+ wreq->transferred, wreq->len);
+
+ wreq->iocb->ki_pos += written;
+ if (wreq->iocb->ki_complete)
+ wreq->iocb->ki_complete(
+ wreq->iocb, wreq->error ? wreq->error : written);
+ wreq->iocb = VFS_PTR_POISON;
+ }
+
+ netfs_clear_subrequests(wreq, false);
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work_complete);
+}
+
+/*
+ * Wake the collection work item.
+ *
+ * If the request's work item is not already pending, a ref is taken on the
+ * request for the work item and the item is queued on system_unbound_wq. The
+ * ref is dropped again if queue_work() reports that it did not queue the
+ * item; @was_async is passed on to that put.
+ */
+void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
+{
+ if (!work_pending(&wreq->work)) {
+ netfs_get_request(wreq, netfs_rreq_trace_get_work);
+ if (!queue_work(system_unbound_wq, &wreq->work))
+ netfs_put_request(wreq, was_async, netfs_rreq_trace_put_work_nq);
+ }
+}
+
+/**
+ * netfs_write_subrequest_terminated - Note the termination of a write operation.
+ * @_op: The I/O subrequest that has terminated.
+ * @transferred_or_error: The amount of data transferred or an error code.
+ * @was_async: The termination was asynchronous
+ *
+ * This tells the library that a contributory write I/O operation has
+ * terminated, one way or another, and that it should collect the results.
+ *
+ * The caller indicates in @transferred_or_error the outcome of the operation,
+ * supplying a positive value to indicate the number of bytes transferred or a
+ * negative error code. The library will look after retrying I/O operations
+ * as appropriate.
+ *
+ * An error of -EAGAIN marks the subrequest as needing a retry; any other
+ * error marks it as failed. Either way, the request is flagged to pause. A
+ * transfer that leaves the subrequest short of its length also marks it as
+ * needing a retry. A transfer count larger than the amount outstanding
+ * triggers a warning and is clamped.
+ *
+ * If @was_async is true, the caller might be running in softirq or interrupt
+ * context and we can't sleep.
+ *
+ * When this is called, ownership of the subrequest is transferred back to the
+ * library, along with a ref.
+ *
+ * Note that @_op is a void* so that the function can be passed to
+ * kiocb::term_func without the need for a casting wrapper.
+ */
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_io_subrequest *subreq = _op;
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];
+
+ _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
+
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_done);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_done);
+ break;
+ case NETFS_INVALID_WRITE:
+ break;
+ default:
+ BUG();
+ }
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ subreq->error = transferred_or_error;
+ if (subreq->error == -EAGAIN)
+ set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ else
+ set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ trace_netfs_failure(wreq, subreq, transferred_or_error, netfs_fail_write);
+
+ switch (subreq->source) {
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_failed);
+ break;
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_failed);
+ break;
+ default:
+ break;
+ }
+ trace_netfs_rreq(wreq, netfs_rreq_trace_set_pause);
+ set_bit(NETFS_RREQ_PAUSE, &wreq->flags);
+ } else {
+ if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+ "Subreq excess write: R=%x[%x] %zd > %zu - %zu",
+ wreq->debug_id, subreq->debug_index,
+ transferred_or_error, subreq->len, subreq->transferred))
+ transferred_or_error = subreq->len - subreq->transferred;
+
+ subreq->error = 0;
+ subreq->transferred += transferred_or_error;
+
+ if (subreq->transferred < subreq->len)
+ set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ }
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ wake_up_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS);
+
+ /* If we are at the head of the queue, wake up the collector,
+ * transferring a ref to it if we were the ones to do so.
+ */
+ if (list_is_first(&subreq->rreq_link, &stream->subrequests))
+ netfs_wake_write_collector(wreq, was_async);
+
+ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+}
+EXPORT_SYMBOL(netfs_write_subrequest_terminated);
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
new file mode 100644
index 000000000000..e190043bc0da
--- /dev/null
+++ b/fs/netfs/write_issue.c
@@ -0,0 +1,684 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level (buffered) writeback.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ *
+ * To support network filesystems with local caching, we manage a situation
+ * that can be envisioned like the following:
+ *
+ * +---+---+-----+-----+---+----------+
+ * Folios: | | | | | | |
+ * +---+---+-----+-----+---+----------+
+ *
+ * +------+------+ +----+----+
+ * Upload: | | |.....| | |
+ * (Stream 0) +------+------+ +----+----+
+ *
+ * +------+------+------+------+------+
+ * Cache: | | | | | |
+ * (Stream 1) +------+------+------+------+------+
+ *
+ * Where we have a sequence of folios of varying sizes that we need to overlay
+ * with multiple parallel streams of I/O requests, where the I/O requests in a
+ * stream may also be of various sizes (in cifs, for example, the sizes are
+ * negotiated with the server; in something like ceph, they may represent the
+ * sizes of storage objects).
+ *
+ * The sequence in each stream may contain gaps and noncontiguous subrequests
+ * may be glued together into single vectored write RPCs.
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include "internal.h"
+
+/*
+ * Kill all dirty folios in the event of an unrecoverable error, starting with
+ * a locked folio we've already obtained from writeback_iter().
+ *
+ * For each folio, any private data is detached and classified (streaming
+ * write record, copy-to-cache marker or write group), the folio is passed
+ * through a start-writeback/unlock/end-writeback cycle without any I/O being
+ * issued, and then the group ref is put and the streaming record freed.
+ * Further folios are obtained from writeback_iter() until it returns NULL.
+ */
+static void netfs_kill_dirty_pages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct folio *folio)
+{
+ int error = 0;
+
+ do {
+ enum netfs_folio_trace why = netfs_folio_trace_kill;
+ struct netfs_group *group = NULL;
+ struct netfs_folio *finfo = NULL;
+ void *priv;
+
+ priv = folio_detach_private(folio);
+ if (priv) {
+ finfo = __netfs_folio_info(priv);
+ if (finfo) {
+ /* Kill folio from streaming write. */
+ group = finfo->netfs_group;
+ why = netfs_folio_trace_kill_s;
+ } else {
+ group = priv;
+ if (group == NETFS_FOLIO_COPY_TO_CACHE) {
+ /* Kill copy-to-cache folio */
+ why = netfs_folio_trace_kill_cc;
+ group = NULL;
+ } else {
+ /* Kill folio with group */
+ why = netfs_folio_trace_kill_g;
+ }
+ }
+ }
+
+ trace_netfs_folio(folio, why);
+
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+ folio_end_writeback(folio);
+
+ netfs_put_group(group);
+ kfree(finfo);
+
+ } while ((folio = writeback_iter(mapping, wbc, folio, &error)));
+}
+
+/*
+ * Create a write request and set it up appropriately for the origin type.
+ *
+ * The request is allocated with a length of zero, starting at @start. If the
+ * request is flagged NETFS_RREQ_WRITE_TO_CACHE, a cache write operation is
+ * begun. Stream 0 is set up as the upload-to-server stream using the
+ * filesystem's ->prepare_write() and ->issue_write() ops; stream 1 is set up
+ * as the write-to-cache stream and is only marked available if the cache
+ * resources are valid.
+ *
+ * NOTE(review): io_streams[0].avail is not set here - presumably it is set
+ * by the caller or by the request allocator; confirm.
+ *
+ * Returns the new request, or the error pointer from netfs_alloc_request().
+ */
+struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
+ struct file *file,
+ loff_t start,
+ enum netfs_io_origin origin)
+{
+ struct netfs_io_request *wreq;
+ struct netfs_inode *ictx;
+
+ wreq = netfs_alloc_request(mapping, file, start, 0, origin);
+ if (IS_ERR(wreq))
+ return wreq;
+
+ _enter("R=%x", wreq->debug_id);
+
+ ictx = netfs_inode(wreq->inode);
+ if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
+ fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));
+
+ wreq->contiguity = wreq->start;
+ wreq->cleaned_to = wreq->start;
+ INIT_WORK(&wreq->work, netfs_write_collection_worker);
+
+ wreq->io_streams[0].stream_nr = 0;
+ wreq->io_streams[0].source = NETFS_UPLOAD_TO_SERVER;
+ wreq->io_streams[0].prepare_write = ictx->ops->prepare_write;
+ wreq->io_streams[0].issue_write = ictx->ops->issue_write;
+ wreq->io_streams[0].collected_to = start;
+ wreq->io_streams[0].transferred = LONG_MAX; /* Stays at this until the collector records progress */
+
+ wreq->io_streams[1].stream_nr = 1;
+ wreq->io_streams[1].source = NETFS_WRITE_TO_CACHE;
+ wreq->io_streams[1].collected_to = start;
+ wreq->io_streams[1].transferred = LONG_MAX;
+ if (fscache_resources_valid(&wreq->cache_resources)) {
+ wreq->io_streams[1].avail = true;
+ wreq->io_streams[1].prepare_write = wreq->cache_resources.ops->prepare_write_subreq;
+ wreq->io_streams[1].issue_write = wreq->cache_resources.ops->issue_write;
+ }
+
+ return wreq;
+}
+
+/**
+ * netfs_prepare_write_failed - Note write preparation failed
+ * @subreq: The subrequest to mark
+ *
+ * Mark a subrequest to note that preparation for write failed. This sets
+ * NETFS_SREQ_FAILED on the subrequest using a non-atomic bit operation and
+ * logs a trace point; it does not set subreq->error.
+ */
+void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq)
+{
+ __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prep_failed);
+}
+EXPORT_SYMBOL(netfs_prepare_write_failed);
+
+/*
+ * Prepare a write subrequest. We need to allocate a new subrequest
+ * if we don't have one.
+ *
+ * A subrequest is allocated for @stream starting at @start. Its maximum
+ * length defaults to ULONG_MAX, but is set to wreq->wsize for the upload
+ * stream, and the stream's ->prepare_write() op, if there is one, is then
+ * given the chance to adjust the subrequest. The subrequest is marked
+ * in-progress, appended to the stream's list under wreq->lock and left in
+ * stream->construct for data to be added to it.
+ *
+ * If the new subrequest is the first on the list, it becomes the stream's
+ * front and, if the stream was not yet active, the stream's collection point
+ * is set to the subrequest's start and the stream is marked active.
+ */
+static void netfs_prepare_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream,
+ loff_t start)
+{
+ struct netfs_io_subrequest *subreq;
+
+ subreq = netfs_alloc_subrequest(wreq);
+ subreq->source = stream->source;
+ subreq->start = start;
+ subreq->max_len = ULONG_MAX;
+ subreq->max_nr_segs = INT_MAX;
+ subreq->stream_nr = stream->stream_nr;
+
+ _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
+
+ trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+ refcount_read(&subreq->ref),
+ netfs_sreq_trace_new);
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
+ switch (stream->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload);
+ subreq->max_len = wreq->wsize;
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ if (stream->prepare_write)
+ stream->prepare_write(subreq);
+
+ __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+
+ /* We add to the end of the list whilst the collector may be walking
+ * the list. The collector only goes nextwards and uses the lock to
+ * remove entries off of the front.
+ */
+ spin_lock(&wreq->lock);
+ list_add_tail(&subreq->rreq_link, &stream->subrequests);
+ if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
+ stream->front = subreq;
+ if (!stream->active) {
+ stream->collected_to = stream->front->start;
+ /* Write list pointers before active flag */
+ smp_store_release(&stream->active, true);
+ }
+ }
+
+ spin_unlock(&wreq->lock);
+
+ stream->construct = subreq;
+}
+
+/*
+ * Set the I/O iterator for the filesystem/cache to use and dispatch the I/O
+ * operation. The operation may be asynchronous and should call
+ * netfs_write_subrequest_terminated() when complete.
+ *
+ * A subrequest that is already flagged NETFS_SREQ_FAILED is not issued, but
+ * is terminated immediately with subreq->error instead.
+ *
+ * The iterator covers the part of the subrequest not yet transferred. It is
+ * taken from wreq->io_iter if NETFS_RREQ_USE_IO_ITER is set on the request
+ * and is otherwise constructed directly over the mapping's pagecache.
+ */
+static void netfs_do_issue_write(struct netfs_io_stream *stream,
+ struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+
+ _enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);
+
+ if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
+ return netfs_write_subrequest_terminated(subreq, subreq->error, false);
+
+ // TODO: Use encrypted buffer
+ if (test_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags)) {
+ subreq->io_iter = wreq->io_iter;
+ iov_iter_advance(&subreq->io_iter,
+ subreq->start + subreq->transferred - wreq->start);
+ iov_iter_truncate(&subreq->io_iter,
+ subreq->len - subreq->transferred);
+ } else {
+ iov_iter_xarray(&subreq->io_iter, ITER_SOURCE, &wreq->mapping->i_pages,
+ subreq->start + subreq->transferred,
+ subreq->len - subreq->transferred);
+ }
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ stream->issue_write(subreq);
+}
+/*
+ * Issue an existing write subrequest on a stream: set NETFS_SREQ_IN_PROGRESS
+ * on @subreq and then dispatch it through netfs_do_issue_write(), which sets
+ * up the I/O iterator for whatever part has not yet been transferred.
+ */
+void netfs_reissue_write(struct netfs_io_stream *stream,
+ struct netfs_io_subrequest *subreq)
+{
+ __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ netfs_do_issue_write(stream, subreq);
+}
+/*
+ * Dispatch the subrequest currently under construction on @stream, if there
+ * is one, and clear stream->construct so that a new one gets prepared next
+ * time. If the subrequest ends beyond what @wreq has recorded as submitted,
+ * wreq->len and wreq->submitted are first extended to cover it.
+ */
+static void netfs_issue_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream)
+{
+ struct netfs_io_subrequest *subreq = stream->construct;
+
+ if (!subreq)
+ return; /* Nothing under construction on this stream */
+ stream->construct = NULL;
+
+ if (subreq->start + subreq->len > wreq->start + wreq->submitted)
+ wreq->len = wreq->submitted = subreq->start + subreq->len - wreq->start;
+ netfs_do_issue_write(stream, subreq);
+}
+
+/*
+ * Add data to the write subrequest, dispatching each as we fill it up or if it
+ * is discontiguous with the previous. We only fill one part at a time so that
+ * we can avoid overrunning the credits obtained (cifs) and try to parallelise
+ * content-crypto preparation with network writes.
+ *
+ * @wreq: The write request being built.
+ * @stream: The stream to add the data to.
+ * @start: File position of the data being added.
+ * @len: Amount of data on offer.
+ * @to_eof: If true, the subrequest is issued immediately after this addition.
+ *
+ * Returns the number of bytes accepted, which may be less than @len if the
+ * subrequest under construction hits its max_len; the caller is expected to
+ * call again for the remainder. If the stream is not available, @len is
+ * returned without anything being queued.
+ *
+ * NOTE(review): the accepted amount is a size_t but is returned through an
+ * int return type.
+ */
+int netfs_advance_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream,
+ loff_t start, size_t len, bool to_eof)
+{
+ struct netfs_io_subrequest *subreq = stream->construct;
+ size_t part;
+
+ if (!stream->avail) {
+ _leave("no write");
+ return len;
+ }
+
+ _enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0);
+
+ /* Data that doesn't abut the end of the current subrequest can't be
+ * appended to it, so send that one off first.
+ */
+ if (subreq && start != subreq->start + subreq->len) {
+ netfs_issue_write(wreq, stream);
+ subreq = NULL;
+ }
+
+ if (!stream->construct)
+ netfs_prepare_write(wreq, stream, start);
+ subreq = stream->construct;
+
+ part = min(subreq->max_len - subreq->len, len); /* Take only what still fits */
+ _debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len);
+ subreq->len += part;
+ subreq->nr_segs++;
+
+ /* Issue once the subrequest is full, by size or segment count, or if
+ * the caller indicated this addition runs to the EOF.
+ */
+ if (subreq->len >= subreq->max_len ||
+ subreq->nr_segs >= subreq->max_nr_segs ||
+ to_eof) {
+ netfs_issue_write(wreq, stream);
+ subreq = NULL;
+ }
+
+ return part;
+}
+
+/*
+ * Write some of a pending folio data back to the server.
+ *
+ * @wreq: The write request to attach the folio's data to.
+ * @wbc: Writeback control, used here only to redirty a folio that belongs to
+ * a different writeback group.
+ * @folio: The locked folio to process; it is unlocked on every path.
+ *
+ * The dirty region of the folio is worked out (the whole folio, or the
+ * dirty_offset/dirty_len span if the folio carries a netfs_folio record) and
+ * trimmed to i_size. Subrequests under construction are flushed wherever the
+ * new data would not be contiguous with them, and the region is then fed to
+ * each stream in turn through netfs_advance_write().
+ *
+ * Always returns 0.
+ */
+static int netfs_write_folio(struct netfs_io_request *wreq,
+ struct writeback_control *wbc,
+ struct folio *folio)
+{
+ struct netfs_io_stream *upload = &wreq->io_streams[0];
+ struct netfs_io_stream *cache = &wreq->io_streams[1];
+ struct netfs_io_stream *stream;
+ struct netfs_group *fgroup; /* TODO: Use this with ceph */
+ struct netfs_folio *finfo;
+ size_t fsize = folio_size(folio), flen = fsize, foff = 0;
+ loff_t fpos = folio_pos(folio), i_size;
+ bool to_eof = false, streamw = false;
+ bool debug = false; /* Set once any data has been accepted by a stream */
+
+ _enter("");
+
+ /* netfs_perform_write() may shift i_size around the page or from out
+ * of the page to beyond it, but cannot move i_size into or through the
+ * page since we have it locked.
+ */
+ i_size = i_size_read(wreq->inode);
+
+ if (fpos >= i_size) {
+ /* mmap beyond eof. */
+ _debug("beyond eof");
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+ wreq->nr_group_rel += netfs_folio_written_back(folio);
+ netfs_put_group_many(wreq->group, wreq->nr_group_rel);
+ wreq->nr_group_rel = 0;
+ return 0;
+ }
+
+ if (fpos + fsize > wreq->i_size)
+ wreq->i_size = i_size;
+
+ fgroup = netfs_folio_group(folio);
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ /* Only part of the folio is dirty; flen is the end offset for now. */
+ foff = finfo->dirty_offset;
+ flen = foff + finfo->dirty_len;
+ streamw = true;
+ }
+
+ if (wreq->origin == NETFS_WRITETHROUGH) {
+ to_eof = false;
+ if (flen > i_size - fpos)
+ flen = i_size - fpos;
+ } else if (flen > i_size - fpos) {
+ flen = i_size - fpos;
+ if (!streamw)
+ folio_zero_segment(folio, flen, fsize);
+ to_eof = true;
+ } else if (flen == i_size - fpos) {
+ to_eof = true;
+ }
+ flen -= foff; /* Convert the end offset into a length */
+
+ _debug("folio %zx %zx %zx", foff, flen, fsize);
+
+ /* Deal with discontinuities in the stream of dirty pages. These can
+ * arise from a number of sources:
+ *
+ * (1) Intervening non-dirty pages from random-access writes, multiple
+ * flushers writing back different parts simultaneously and manual
+ * syncing.
+ *
+ * (2) Partially-written pages from write-streaming.
+ *
+ * (3) Pages that belong to a different write-back group (eg. Ceph
+ * snapshots).
+ *
+ * (4) Actually-clean pages that were marked for write to the cache
+ * when they were read. Note that these appear as a special
+ * write-back group.
+ */
+ if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
+ netfs_issue_write(wreq, upload);
+ } else if (fgroup != wreq->group) {
+ /* We can't write this page to the server yet. */
+ kdebug("wrong group");
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
+ netfs_issue_write(wreq, upload);
+ netfs_issue_write(wreq, cache);
+ return 0;
+ }
+
+ if (foff > 0)
+ netfs_issue_write(wreq, upload);
+ if (streamw)
+ netfs_issue_write(wreq, cache);
+
+ /* Flip the page to the writeback state and unlock. If we're called
+ * from write-through, then the page has already been put into the wb
+ * state.
+ */
+ if (wreq->origin == NETFS_WRITEBACK)
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+
+ if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
+ if (!fscache_resources_valid(&wreq->cache_resources)) {
+ trace_netfs_folio(folio, netfs_folio_trace_cancel_copy);
+ netfs_issue_write(wreq, upload);
+ netfs_folio_written_back(folio);
+ return 0;
+ }
+ trace_netfs_folio(folio, netfs_folio_trace_store_copy);
+ } else if (!upload->construct) {
+ trace_netfs_folio(folio, netfs_folio_trace_store);
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_store_plus);
+ }
+
+ /* Move the submission point forward to allow for write-streaming data
+ * not starting at the front of the page. We don't do write-streaming
+ * with the cache as the cache requires DIO alignment.
+ *
+ * Also skip uploading for data that's been read and just needs copying
+ * to the cache.
+ */
+ for (int s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ stream->submit_max_len = fsize;
+ stream->submit_off = foff;
+ stream->submit_len = flen;
+ if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) ||
+ (stream->source == NETFS_UPLOAD_TO_SERVER &&
+ fgroup == NETFS_FOLIO_COPY_TO_CACHE)) {
+ /* Nothing to submit on this stream for this folio. */
+ stream->submit_off = UINT_MAX;
+ stream->submit_len = 0;
+ stream->submit_max_len = 0;
+ }
+ }
+
+ /* Attach the folio to one or more subrequests. For a big folio, we
+ * could end up with thousands of subrequests if the wsize is small -
+ * but we might need to wait during the creation of subrequests for
+ * network resources (eg. SMB credits).
+ */
+ for (;;) {
+ ssize_t part;
+ size_t lowest_off = ULONG_MAX;
+ int choose_s = -1;
+
+ /* Always add to the lowest-submitted stream first. */
+ for (int s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->submit_len > 0 &&
+ stream->submit_off < lowest_off) {
+ lowest_off = stream->submit_off;
+ choose_s = s;
+ }
+ }
+
+ if (choose_s < 0)
+ break; /* Every stream has consumed its share of the folio */
+ stream = &wreq->io_streams[choose_s];
+
+ part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
+ stream->submit_len, to_eof);
+ atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
+ stream->submit_off += part;
+ stream->submit_max_len -= part;
+ if (part > stream->submit_len)
+ stream->submit_len = 0;
+ else
+ stream->submit_len -= part;
+ if (part > 0)
+ debug = true;
+ }
+
+ atomic64_set(&wreq->issued_to, fpos + fsize);
+
+ if (!debug)
+ kdebug("R=%x: No submit", wreq->debug_id);
+
+ /* A partial folio leaves a gap before the next one, so flush now. */
+ if (flen < fsize)
+ for (int s = 0; s < NR_IO_STREAMS; s++)
+ netfs_issue_write(wreq, &wreq->io_streams[s]);
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * Write some of the pending data back to the server
+ *
+ * @mapping: The address space whose dirty folios are to be written.
+ * @wbc: Writeback control passed through to writeback_iter().
+ *
+ * The inode's wb_lock is held across the construction of the request: a
+ * WB_SYNC_ALL caller waits for it, any other caller gives up and returns 0 if
+ * it is contended. A single write request is created at the position of the
+ * first folio returned by writeback_iter() and each folio is then fed to
+ * netfs_write_folio(). Once iteration ends, any subrequests still under
+ * construction are issued and NETFS_RREQ_ALL_QUEUED is set.
+ *
+ * Returns 0 or the error left in 'error' by request creation,
+ * netfs_write_folio() or writeback_iter().
+ */
+int netfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct netfs_inode *ictx = netfs_inode(mapping->host);
+ struct netfs_io_request *wreq = NULL;
+ struct folio *folio;
+ int error = 0;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ mutex_lock(&ictx->wb_lock);
+ else if (!mutex_trylock(&ictx->wb_lock))
+ return 0;
+
+ /* Need the first folio to be able to set up the op. */
+ folio = writeback_iter(mapping, wbc, NULL, &error);
+ if (!folio)
+ goto out;
+
+ wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK);
+ if (IS_ERR(wreq)) {
+ error = PTR_ERR(wreq);
+ goto couldnt_start;
+ }
+
+ trace_netfs_write(wreq, netfs_write_trace_writeback);
+ netfs_stat(&netfs_n_wh_writepages);
+
+ do {
+ _debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted);
+
+ /* It appears we don't have to handle cyclic writeback wrapping. */
+ WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted);
+
+ /* Begin writeback to the server on first meeting a folio that
+ * isn't merely being copied to the cache.
+ */
+ if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE &&
+ unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) {
+ set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ wreq->netfs_ops->begin_writeback(wreq);
+ }
+
+ error = netfs_write_folio(wreq, wbc, folio);
+ if (error < 0)
+ break;
+ } while ((folio = writeback_iter(mapping, wbc, folio, &error)));
+
+ /* Flush whatever is still under construction on each stream. */
+ for (int s = 0; s < NR_IO_STREAMS; s++)
+ netfs_issue_write(wreq, &wreq->io_streams[s]);
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+
+ mutex_unlock(&ictx->wb_lock);
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ _leave(" = %d", error);
+ return error;
+
+couldnt_start:
+ netfs_kill_dirty_pages(mapping, wbc, folio);
+out:
+ mutex_unlock(&ictx->wb_lock);
+ _leave(" = %d", error);
+ return error;
+}
+EXPORT_SYMBOL(netfs_writepages);
+
+/*
+ * Begin a write operation for writing through the pagecache.
+ *
+ * @iocb: The kiocb supplying the file and the starting position.
+ * @len: Not referenced in this function.
+ *
+ * Takes the inode's wb_lock and creates a NETFS_WRITETHROUGH request at
+ * iocb->ki_pos with the upload stream (io_streams[0]) marked available.
+ *
+ * Returns the new request with wb_lock still held, or an ERR_PTR with the
+ * lock released if the request could not be created.
+ */
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
+{
+ struct netfs_io_request *wreq = NULL;
+ struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp));
+
+ mutex_lock(&ictx->wb_lock);
+
+ wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp,
+ iocb->ki_pos, NETFS_WRITETHROUGH);
+ if (IS_ERR(wreq)) {
+ mutex_unlock(&ictx->wb_lock);
+ return wreq;
+ }
+
+ wreq->io_streams[0].avail = true;
+ trace_netfs_write(wreq, netfs_write_trace_writethrough);
+ return wreq; /* wb_lock remains held on success */
+}
+
+/*
+ * Advance the state of the write operation used when writing through the
+ * pagecache. Data has been copied into the pagecache that we need to append
+ * to the request. If we've added more than wsize then we need to create a new
+ * subrequest.
+ *
+ * @wreq: The write-through request.
+ * @wbc: Writeback control, passed on to netfs_write_folio().
+ * @folio: The folio that data was just copied into.
+ * @copied: The amount of data copied; added to wreq->len.
+ * @to_page_end: True if the folio is finished with and should be submitted.
+ * @writethrough_cache: Remembers the folio that has been put into the
+ * writeback state but not yet submitted; cleared when it is submitted.
+ *
+ * Returns 0 if the folio is not yet being submitted, otherwise the result of
+ * netfs_write_folio().
+ */
+int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *folio, size_t copied, bool to_page_end,
+ struct folio **writethrough_cache)
+{
+ _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
+ wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end);
+
+ if (!*writethrough_cache) {
+ if (folio_test_dirty(folio))
+ /* Sigh. mmap. */
+ folio_clear_dirty_for_io(folio);
+
+ /* We can make multiple writes to the folio... */
+ folio_start_writeback(folio);
+ if (wreq->len == 0)
+ trace_netfs_folio(folio, netfs_folio_trace_wthru);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
+ *writethrough_cache = folio;
+ }
+
+ wreq->len += copied;
+ if (!to_page_end)
+ return 0; /* Keep accumulating writes into this folio */
+
+ *writethrough_cache = NULL;
+ return netfs_write_folio(wreq, wbc, folio);
+}
+
+/*
+ * End a write operation used when writing through the pagecache.
+ *
+ * @wreq: The write-through request to finish.
+ * @wbc: Writeback control, passed on to netfs_write_folio().
+ * @writethrough_cache: A folio still awaiting submission, or NULL.
+ *
+ * Submits the outstanding folio, if any, issues whatever is still under
+ * construction on both streams, sets NETFS_RREQ_ALL_QUEUED and drops the
+ * inode's wb_lock (presumably the one taken in netfs_begin_writethrough()).
+ *
+ * Returns wreq->error as sampled before the caller's ref on the request is
+ * put. The return value of netfs_write_folio() is not checked here.
+ */
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *writethrough_cache)
+{
+ struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ int ret;
+
+ _enter("R=%x", wreq->debug_id);
+
+ if (writethrough_cache)
+ netfs_write_folio(wreq, wbc, writethrough_cache);
+
+ netfs_issue_write(wreq, &wreq->io_streams[0]);
+ netfs_issue_write(wreq, &wreq->io_streams[1]);
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+
+ mutex_unlock(&ictx->wb_lock);
+
+ ret = wreq->error; /* Read before the put below may release wreq */
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ return ret;
+}
+
+/*
+ * Write data to the server without going through the pagecache and without
+ * writing it to the local cache.
+ *
+ * @wreq: The write request; data is taken starting at wreq->start.
+ * @may_wait: Not referenced in this function.
+ * @len: The amount of data to write.
+ *
+ * The data is fed to the upload stream (io_streams[0]) in pieces sized by
+ * netfs_advance_write(). After each piece, if NETFS_RREQ_PAUSE is set the
+ * function waits uninterruptibly for it to clear, and if NETFS_RREQ_FAILED is
+ * set it stops adding data. The last subrequest is then issued and
+ * NETFS_RREQ_ALL_QUEUED is set; if no subrequests are queued on the upload
+ * stream at that point, the write collector is woken directly.
+ *
+ * Returns 0; 'error' is never changed from its initial value here.
+ */
+int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len)
+{
+ struct netfs_io_stream *upload = &wreq->io_streams[0];
+ ssize_t part;
+ loff_t start = wreq->start;
+ int error = 0;
+
+ _enter("%zx", len);
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_begin(wreq->inode);
+
+ while (len) {
+ // TODO: Prepare content encryption
+
+ _debug("unbuffered %zx", len);
+ part = netfs_advance_write(wreq, upload, start, len, false);
+ start += part;
+ len -= part;
+ if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
+ wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE);
+ }
+ if (test_bit(NETFS_RREQ_FAILED, &wreq->flags))
+ break;
+ }
+
+ netfs_issue_write(wreq, upload); /* Send off any partly-filled subrequest */
+
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+ if (list_empty(&upload->subrequests))
+ netfs_wake_write_collector(wreq, false);
+
+ _leave(" = %d", error);
+ return error;
+}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 407c6e15afe2..6bd127e6683d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -433,7 +433,7 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset,
return;
/* Cancel any unstarted writes on this page */
nfs_wb_folio_cancel(inode, folio);
- folio_wait_fscache(folio);
+ folio_wait_private_2(folio); /* [DEPRECATED] */
trace_nfs_invalidate_folio(inode, folio);
}
@@ -500,7 +500,7 @@ static int nfs_launder_folio(struct folio *folio)
dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n",
inode->i_ino, folio_pos(folio));
- folio_wait_fscache(folio);
+ folio_wait_private_2(folio); /* [DEPRECATED] */
ret = nfs_wb_folio(inode, folio);
trace_nfs_launder_folio_done(inode, folio, ret);
return ret;
@@ -593,8 +593,8 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
/* make sure the cache has finished storing the page */
- if (folio_test_fscache(folio) &&
- folio_wait_fscache_killable(folio) < 0) {
+ if (folio_test_private_2(folio) && /* [DEPRECATED] */
+ folio_wait_private_2_killable(folio) < 0) {
ret = VM_FAULT_RETRY;
goto out;
}
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index e3cb4923316b..fbed0027996f 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -81,6 +81,8 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
{
netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false);
+ /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
+ __set_bit(NETFS_ICTX_USE_PGPRIV2, &nfsi->netfs.flags);
}
extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
@@ -101,10 +103,10 @@ extern int nfs_netfs_read_folio(struct file *file, struct folio *folio);
static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
{
- if (folio_test_fscache(folio)) {
+ if (folio_test_private_2(folio)) { /* [DEPRECATED] */
if (current_is_kswapd() || !(gfp & __GFP_FS))
return false;
- folio_wait_fscache(folio);
+ folio_wait_private_2(folio);
}
fscache_note_page_release(netfs_i_cookie(netfs_inode(folio->mapping->host)));
return true;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5de85d725fb9..2329cbb0e446 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -2120,10 +2120,10 @@ int nfs_migrate_folio(struct address_space *mapping, struct folio *dst,
if (folio_test_private(src))
return -EBUSY;
- if (folio_test_fscache(src)) {
+ if (folio_test_private_2(src)) { /* [DEPRECATED] */
if (mode == MIGRATE_ASYNC)
return -EBUSY;
- folio_wait_fscache(src);
+ folio_wait_private_2(src);
}
return migrate_folio(mapping, dst, src, mode);
diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig
index 2927bd174a88..2517dc242386 100644
--- a/fs/smb/client/Kconfig
+++ b/fs/smb/client/Kconfig
@@ -2,6 +2,7 @@
config CIFS
tristate "SMB3 and CIFS support (advanced network filesystem)"
depends on INET
+ select NETFS_SUPPORT
select NLS
select NLS_UCS2_UTILS
select CRYPTO
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 39277c37185c..6e1698614745 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -371,9 +371,13 @@ static struct kmem_cache *cifs_inode_cachep;
static struct kmem_cache *cifs_req_cachep;
static struct kmem_cache *cifs_mid_cachep;
static struct kmem_cache *cifs_sm_req_cachep;
+static struct kmem_cache *cifs_io_request_cachep;
+static struct kmem_cache *cifs_io_subrequest_cachep;
mempool_t *cifs_sm_req_poolp;
mempool_t *cifs_req_poolp;
mempool_t *cifs_mid_poolp;
+mempool_t cifs_io_request_pool;
+mempool_t cifs_io_subrequest_pool;
static struct inode *
cifs_alloc_inode(struct super_block *sb)
@@ -986,61 +990,6 @@ out:
return root;
}
-
-static ssize_t
-cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
-{
- ssize_t rc;
- struct inode *inode = file_inode(iocb->ki_filp);
-
- if (iocb->ki_flags & IOCB_DIRECT)
- return cifs_user_readv(iocb, iter);
-
- rc = cifs_revalidate_mapping(inode);
- if (rc)
- return rc;
-
- return generic_file_read_iter(iocb, iter);
-}
-
-static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct cifsInodeInfo *cinode = CIFS_I(inode);
- ssize_t written;
- int rc;
-
- if (iocb->ki_filp->f_flags & O_DIRECT) {
- written = cifs_user_writev(iocb, from);
- if (written > 0 && CIFS_CACHE_READ(cinode)) {
- cifs_zap_mapping(inode);
- cifs_dbg(FYI,
- "Set no oplock for inode=%p after a write operation\n",
- inode);
- cinode->oplock = 0;
- }
- return written;
- }
-
- written = cifs_get_writer(cinode);
- if (written)
- return written;
-
- written = generic_file_write_iter(iocb, from);
-
- if (CIFS_CACHE_WRITE(CIFS_I(inode)))
- goto out;
-
- rc = filemap_fdatawrite(inode->i_mapping);
- if (rc)
- cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n",
- rc, inode);
-
-out:
- cifs_put_writer(cinode);
- return written;
-}
-
static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
{
struct cifsFileInfo *cfile = file->private_data;
@@ -1342,6 +1291,8 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false);
if (rc)
goto unlock;
+ if (fend > target_cifsi->netfs.zero_point)
+ target_cifsi->netfs.zero_point = fend + 1;
/* Discard all the folios that overlap the destination region. */
cifs_dbg(FYI, "about to discard pages %llx-%llx\n", fstart, fend);
@@ -1360,6 +1311,8 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
fscache_resize_cookie(cifs_inode_cookie(target_inode),
new_size);
}
+ if (rc == 0 && new_size > target_cifsi->netfs.zero_point)
+ target_cifsi->netfs.zero_point = new_size;
}
/* force revalidate of size and timestamps of target file now
@@ -1451,6 +1404,8 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false);
if (rc)
goto unlock;
+ if (fend > target_cifsi->netfs.zero_point)
+ target_cifsi->netfs.zero_point = fend + 1;
/* Discard all the folios that overlap the destination region. */
truncate_inode_pages_range(&target_inode->i_data, fstart, fend);
@@ -1567,8 +1522,8 @@ const struct file_operations cifs_file_strict_ops = {
};
const struct file_operations cifs_file_direct_ops = {
- .read_iter = cifs_direct_readv,
- .write_iter = cifs_direct_writev,
+ .read_iter = netfs_unbuffered_read_iter,
+ .write_iter = netfs_file_write_iter,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -1623,8 +1578,8 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
};
const struct file_operations cifs_file_direct_nobrl_ops = {
- .read_iter = cifs_direct_readv,
- .write_iter = cifs_direct_writev,
+ .read_iter = netfs_unbuffered_read_iter,
+ .write_iter = netfs_file_write_iter,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_fsync,
@@ -1799,6 +1754,48 @@ static void destroy_mids(void)
kmem_cache_destroy(cifs_mid_cachep);
}
+/*
+ * Create the slab caches and the mempools from which cifs_io_request and
+ * cifs_io_subrequest structs are allocated.
+ *
+ * Returns 0 on success or -ENOMEM if a cache or pool could not be set up, in
+ * which case everything created so far is torn down again.
+ */
+static int cifs_init_netfs(void)
+{
+ cifs_io_request_cachep =
+ kmem_cache_create("cifs_io_request",
+ sizeof(struct cifs_io_request), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!cifs_io_request_cachep)
+ goto nomem_req;
+
+ if (mempool_init_slab_pool(&cifs_io_request_pool, 100, cifs_io_request_cachep) < 0)
+ goto nomem_reqpool;
+
+ cifs_io_subrequest_cachep =
+ kmem_cache_create("cifs_io_subrequest",
+ sizeof(struct cifs_io_subrequest), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!cifs_io_subrequest_cachep)
+ goto nomem_subreq;
+
+ if (mempool_init_slab_pool(&cifs_io_subrequest_pool, 100, cifs_io_subrequest_cachep) < 0)
+ goto nomem_subreqpool;
+
+ return 0;
+
+nomem_subreqpool:
+ kmem_cache_destroy(cifs_io_subrequest_cachep);
+nomem_subreq:
+ /* The pool is an embedded mempool_t set up with mempool_init*(), so
+ * it must be torn down with mempool_exit(); mempool_destroy() would
+ * also kfree() the pool object itself.
+ */
+ mempool_exit(&cifs_io_request_pool);
+nomem_reqpool:
+ kmem_cache_destroy(cifs_io_request_cachep);
+nomem_req:
+ return -ENOMEM;
+}
+
+/*
+ * Tear down the mempools and slab caches used for cifs_io_request and
+ * cifs_io_subrequest structs, emptying each pool before the cache that backs
+ * it is destroyed.
+ *
+ * The pools are embedded mempool_t objects, so they are released with
+ * mempool_exit(); mempool_destroy() would also kfree() the pool object itself.
+ */
+static void cifs_destroy_netfs(void)
+{
+ mempool_exit(&cifs_io_subrequest_pool);
+ kmem_cache_destroy(cifs_io_subrequest_cachep);
+ mempool_exit(&cifs_io_request_pool);
+ kmem_cache_destroy(cifs_io_request_cachep);
+}
+
static int __init
init_cifs(void)
{
@@ -1903,10 +1900,14 @@ init_cifs(void)
if (rc)
goto out_destroy_deferredclose_wq;
- rc = init_mids();
+ rc = cifs_init_netfs();
if (rc)
goto out_destroy_inodecache;
+ rc = init_mids();
+ if (rc)
+ goto out_destroy_netfs;
+
rc = cifs_init_request_bufs();
if (rc)
goto out_destroy_mids;
@@ -1961,6 +1962,8 @@ out_destroy_request_bufs:
cifs_destroy_request_bufs();
out_destroy_mids:
destroy_mids();
+out_destroy_netfs:
+ cifs_destroy_netfs();
out_destroy_inodecache:
cifs_destroy_inodecache();
out_destroy_deferredclose_wq:
@@ -1999,6 +2002,7 @@ exit_cifs(void)
#endif
cifs_destroy_request_bufs();
destroy_mids();
+ cifs_destroy_netfs();
cifs_destroy_inodecache();
destroy_workqueue(deferredclose_wq);
destroy_workqueue(cifsoplockd_wq);
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index ca55d01117c8..87310f05d397 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -69,7 +69,6 @@ extern int cifs_revalidate_file_attr(struct file *filp);
extern int cifs_revalidate_dentry_attr(struct dentry *);
extern int cifs_revalidate_file(struct file *filp);
extern int cifs_revalidate_dentry(struct dentry *);
-extern int cifs_invalidate_mapping(struct inode *inode);
extern int cifs_revalidate_mapping(struct inode *inode);
extern int cifs_zap_mapping(struct inode *inode);
extern int cifs_getattr(struct mnt_idmap *, const struct path *,
@@ -85,6 +84,7 @@ extern const struct inode_operations cifs_namespace_inode_operations;
/* Functions related to files and directories */
+extern const struct netfs_request_ops cifs_req_ops;
extern const struct file_operations cifs_file_ops;
extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */
extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */
@@ -94,12 +94,10 @@ extern const struct file_operations cifs_file_strict_nobrl_ops;
extern int cifs_open(struct inode *inode, struct file *file);
extern int cifs_close(struct inode *inode, struct file *file);
extern int cifs_closedir(struct inode *inode, struct file *file);
-extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
-extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
-extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
-extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from);
extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
+ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from);
+ssize_t cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter);
extern int cifs_flock(struct file *pfile, int cmd, struct file_lock *plock);
extern int cifs_lock(struct file *, int, struct file_lock *);
extern int cifs_fsync(struct file *, loff_t, loff_t, int);
@@ -110,9 +108,6 @@ extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma);
extern const struct file_operations cifs_dir_ops;
extern int cifs_dir_open(struct inode *inode, struct file *file);
extern int cifs_readdir(struct file *file, struct dir_context *ctx);
-extern void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len);
-extern void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len);
-extern void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int len);
/* Functions related to dir entries */
extern const struct dentry_operations cifs_dentry_ops;
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 6ff35570db81..65574e69ba4f 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -268,8 +268,7 @@ struct dfs_info3_param;
struct cifs_fattr;
struct smb3_fs_context;
struct cifs_fid;
-struct cifs_readdata;
-struct cifs_writedata;
+struct cifs_io_subrequest;
struct cifs_io_parms;
struct cifs_search_info;
struct cifsInodeInfo;
@@ -450,10 +449,9 @@ struct smb_version_operations {
/* send a flush request to the server */
int (*flush)(const unsigned int, struct cifs_tcon *, struct cifs_fid *);
/* async read from the server */
- int (*async_readv)(struct cifs_readdata *);
+ int (*async_readv)(struct cifs_io_subrequest *);
/* async write to the server */
- int (*async_writev)(struct cifs_writedata *,
- void (*release)(struct kref *));
+ void (*async_writev)(struct cifs_io_subrequest *);
/* sync read from the server */
int (*sync_read)(const unsigned int, struct cifs_fid *,
struct cifs_io_parms *, unsigned int *, char **,
@@ -548,8 +546,8 @@ struct smb_version_operations {
/* writepages retry size */
unsigned int (*wp_retry_size)(struct inode *);
/* get mtu credits */
- int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int,
- unsigned int *, struct cifs_credits *);
+ int (*wait_mtu_credits)(struct TCP_Server_Info *, size_t,
+ size_t *, struct cifs_credits *);
/* adjust previously taken mtu credits to request size */
int (*adjust_credits)(struct TCP_Server_Info *server,
struct cifs_credits *credits,
@@ -883,11 +881,12 @@ add_credits(struct TCP_Server_Info *server, const struct cifs_credits *credits,
static inline void
add_credits_and_wake_if(struct TCP_Server_Info *server,
- const struct cifs_credits *credits, const int optype)
+ struct cifs_credits *credits, const int optype)
{
if (credits->value) {
server->ops->add_credits(server, credits, optype);
wake_up(&server->request_q);
+ credits->value = 0;
}
}
@@ -1492,50 +1491,30 @@ struct cifs_aio_ctx {
bool direct_io;
};
-/* asynchronous read support */
-struct cifs_readdata {
- struct kref refcount;
- struct list_head list;
- struct completion done;
+struct cifs_io_request {
+ struct netfs_io_request rreq;
struct cifsFileInfo *cfile;
- struct address_space *mapping;
- struct cifs_aio_ctx *ctx;
- __u64 offset;
- ssize_t got_bytes;
- unsigned int bytes;
- pid_t pid;
- int result;
- struct work_struct work;
- struct iov_iter iter;
- struct kvec iov[2];
- struct TCP_Server_Info *server;
-#ifdef CONFIG_CIFS_SMB_DIRECT
- struct smbd_mr *mr;
-#endif
- struct cifs_credits credits;
};
-/* asynchronous write support */
-struct cifs_writedata {
- struct kref refcount;
- struct list_head list;
- struct completion done;
- enum writeback_sync_modes sync_mode;
- struct work_struct work;
- struct cifsFileInfo *cfile;
- struct cifs_aio_ctx *ctx;
- struct iov_iter iter;
- struct bio_vec *bv;
- __u64 offset;
+/* asynchronous read support */
+struct cifs_io_subrequest {
+ union {
+ struct netfs_io_subrequest subreq;
+ struct netfs_io_request *rreq;
+ struct cifs_io_request *req;
+ };
+ ssize_t got_bytes;
pid_t pid;
- unsigned int bytes;
+ unsigned int xid;
int result;
+ bool have_xid;
+ bool replay;
+ struct kvec iov[2];
struct TCP_Server_Info *server;
#ifdef CONFIG_CIFS_SMB_DIRECT
struct smbd_mr *mr;
#endif
struct cifs_credits credits;
- bool replay;
};
/*
@@ -2115,6 +2094,8 @@ extern __u32 cifs_lock_secret;
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
+extern mempool_t cifs_io_request_pool;
+extern mempool_t cifs_io_subrequest_pool;
/* Operations for different SMB versions */
#define SMB1_VERSION_STRING "1.0"
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index fbc358c09da3..c15bb5ee7eb7 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -121,7 +121,7 @@ extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *,
extern int cifs_check_receive(struct mid_q_entry *mid,
struct TCP_Server_Info *server, bool log_error);
extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server,
- unsigned int size, unsigned int *num,
+ size_t size, size_t *num,
struct cifs_credits *credits);
extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
struct kvec *, int /* nvec to send */,
@@ -148,6 +148,8 @@ extern bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 eof,
bool from_readdir);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
+void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result,
+ bool was_async);
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, int);
extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode,
int flags,
@@ -599,15 +601,11 @@ void __cifs_put_smb_ses(struct cifs_ses *ses);
extern struct cifs_ses *
cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx);
-void cifs_readdata_release(struct kref *refcount);
-int cifs_async_readv(struct cifs_readdata *rdata);
+int cifs_async_readv(struct cifs_io_subrequest *rdata);
int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid);
-int cifs_async_writev(struct cifs_writedata *wdata,
- void (*release)(struct kref *kref));
+void cifs_async_writev(struct cifs_io_subrequest *wdata);
void cifs_writev_complete(struct work_struct *work);
-struct cifs_writedata *cifs_writedata_alloc(work_func_t complete);
-void cifs_writedata_release(struct kref *refcount);
int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const unsigned char *path, char *pbuf,
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 23b5709ddc31..25e9ab947c17 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -24,6 +24,8 @@
#include <linux/swap.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uaccess.h>
+#include <linux/netfs.h>
+#include <trace/events/netfs.h>
#include "cifspdu.h"
#include "cifsfs.h"
#include "cifsglob.h"
@@ -1262,18 +1264,17 @@ openRetry:
static void
cifs_readv_callback(struct mid_q_entry *mid)
{
- struct cifs_readdata *rdata = mid->callback_data;
- struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+ struct cifs_io_subrequest *rdata = mid->callback_data;
+ struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2,
- .rq_iter_size = iov_iter_count(&rdata->iter),
- .rq_iter = rdata->iter };
+ .rq_iter = rdata->subreq.io_iter };
struct cifs_credits credits = { .value = 1, .instance = 0 };
- cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
+ cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n",
__func__, mid->mid, mid->mid_state, rdata->result,
- rdata->bytes);
+ rdata->subreq.len);
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
@@ -1305,30 +1306,36 @@ cifs_readv_callback(struct mid_q_entry *mid)
rdata->result = -EIO;
}
- queue_work(cifsiod_wq, &rdata->work);
+ if (rdata->result == 0 || rdata->result == -EAGAIN)
+ iov_iter_advance(&rdata->subreq.io_iter, rdata->got_bytes);
+ rdata->credits.value = 0;
+ netfs_subreq_terminated(&rdata->subreq,
+ (rdata->result == 0 || rdata->result == -EAGAIN) ?
+ rdata->got_bytes : rdata->result,
+ false);
release_mid(mid);
add_credits(server, &credits, 0);
}
/* cifs_async_readv - send an async write, and set up mid to handle result */
int
-cifs_async_readv(struct cifs_readdata *rdata)
+cifs_async_readv(struct cifs_io_subrequest *rdata)
{
int rc;
READ_REQ *smb = NULL;
int wct;
- struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+ struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink);
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2 };
- cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
- __func__, rdata->offset, rdata->bytes);
+ cifs_dbg(FYI, "%s: offset=%llu bytes=%zu\n",
+ __func__, rdata->subreq.start, rdata->subreq.len);
if (tcon->ses->capabilities & CAP_LARGE_FILES)
wct = 12;
else {
wct = 10; /* old style read */
- if ((rdata->offset >> 32) > 0) {
+ if ((rdata->subreq.start >> 32) > 0) {
/* can not handle this big offset for old */
return -EIO;
}
@@ -1342,13 +1349,13 @@ cifs_async_readv(struct cifs_readdata *rdata)
smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16));
smb->AndXCommand = 0xFF; /* none */
- smb->Fid = rdata->cfile->fid.netfid;
- smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF);
+ smb->Fid = rdata->req->cfile->fid.netfid;
+ smb->OffsetLow = cpu_to_le32(rdata->subreq.start & 0xFFFFFFFF);
if (wct == 12)
- smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32);
+ smb->OffsetHigh = cpu_to_le32(rdata->subreq.start >> 32);
smb->Remaining = 0;
- smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF);
- smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16);
+ smb->MaxCount = cpu_to_le16(rdata->subreq.len & 0xFFFF);
+ smb->MaxCountHigh = cpu_to_le32(rdata->subreq.len >> 16);
if (wct == 12)
smb->ByteCount = 0;
else {
@@ -1364,15 +1371,11 @@ cifs_async_readv(struct cifs_readdata *rdata)
rdata->iov[1].iov_base = (char *)smb + 4;
rdata->iov[1].iov_len = get_rfc1002_length(smb);
- kref_get(&rdata->refcount);
rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive,
cifs_readv_callback, NULL, rdata, 0, NULL);
if (rc == 0)
cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
- else
- kref_put(&rdata->refcount, cifs_readdata_release);
-
cifs_small_buf_release(smb);
return rc;
}
@@ -1615,16 +1618,17 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
static void
cifs_writev_callback(struct mid_q_entry *mid)
{
- struct cifs_writedata *wdata = mid->callback_data;
- struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
- unsigned int written;
+ struct cifs_io_subrequest *wdata = mid->callback_data;
+ struct cifs_tcon *tcon = tlink_tcon(wdata->req->cfile->tlink);
WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
struct cifs_credits credits = { .value = 1, .instance = 0 };
+ ssize_t result;
+ size_t written;
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- wdata->result = cifs_check_receive(mid, tcon->ses->server, 0);
- if (wdata->result != 0)
+ result = cifs_check_receive(mid, tcon->ses->server, 0);
+ if (result != 0)
break;
written = le16_to_cpu(smb->CountHigh);
@@ -1636,37 +1640,37 @@ cifs_writev_callback(struct mid_q_entry *mid)
* client. OS/2 servers are known to set incorrect
* CountHigh values.
*/
- if (written > wdata->bytes)
+ if (written > wdata->subreq.len)
written &= 0xFFFF;
- if (written < wdata->bytes)
- wdata->result = -ENOSPC;
+ if (written < wdata->subreq.len)
+ result = -ENOSPC;
else
- wdata->bytes = written;
+ result = written;
break;
case MID_REQUEST_SUBMITTED:
case MID_RETRY_NEEDED:
- wdata->result = -EAGAIN;
+ result = -EAGAIN;
break;
default:
- wdata->result = -EIO;
+ result = -EIO;
break;
}
- queue_work(cifsiod_wq, &wdata->work);
+ wdata->credits.value = 0;
+ cifs_write_subrequest_terminated(wdata, result, true);
release_mid(mid);
add_credits(tcon->ses->server, &credits, 0);
}
/* cifs_async_writev - send an async write, and set up mid to handle result */
-int
-cifs_async_writev(struct cifs_writedata *wdata,
- void (*release)(struct kref *kref))
+void
+cifs_async_writev(struct cifs_io_subrequest *wdata)
{
int rc = -EACCES;
WRITE_REQ *smb = NULL;
int wct;
- struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct cifs_tcon *tcon = tlink_tcon(wdata->req->cfile->tlink);
struct kvec iov[2];
struct smb_rqst rqst = { };
@@ -1674,9 +1678,10 @@ cifs_async_writev(struct cifs_writedata *wdata,
wct = 14;
} else {
wct = 12;
- if (wdata->offset >> 32 > 0) {
+ if (wdata->subreq.start >> 32 > 0) {
/* can not handle big offset for old srv */
- return -EIO;
+ rc = -EIO;
+ goto out;
}
}
@@ -1688,10 +1693,10 @@ cifs_async_writev(struct cifs_writedata *wdata,
smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16));
smb->AndXCommand = 0xFF; /* none */
- smb->Fid = wdata->cfile->fid.netfid;
- smb->OffsetLow = cpu_to_le32(wdata->offset & 0xFFFFFFFF);
+ smb->Fid = wdata->req->cfile->fid.netfid;
+ smb->OffsetLow = cpu_to_le32(wdata->subreq.start & 0xFFFFFFFF);
if (wct == 14)
- smb->OffsetHigh = cpu_to_le32(wdata->offset >> 32);
+ smb->OffsetHigh = cpu_to_le32(wdata->subreq.start >> 32);
smb->Reserved = 0xFFFFFFFF;
smb->WriteMode = 0;
smb->Remaining = 0;
@@ -1707,39 +1712,40 @@ cifs_async_writev(struct cifs_writedata *wdata,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
- rqst.rq_iter = wdata->iter;
- rqst.rq_iter_size = iov_iter_count(&wdata->iter);
+ rqst.rq_iter = wdata->subreq.io_iter;
+ rqst.rq_iter_size = iov_iter_count(&wdata->subreq.io_iter);
- cifs_dbg(FYI, "async write at %llu %u bytes\n",
- wdata->offset, wdata->bytes);
+ cifs_dbg(FYI, "async write at %llu %zu bytes\n",
+ wdata->subreq.start, wdata->subreq.len);
- smb->DataLengthLow = cpu_to_le16(wdata->bytes & 0xFFFF);
- smb->DataLengthHigh = cpu_to_le16(wdata->bytes >> 16);
+ smb->DataLengthLow = cpu_to_le16(wdata->subreq.len & 0xFFFF);
+ smb->DataLengthHigh = cpu_to_le16(wdata->subreq.len >> 16);
if (wct == 14) {
- inc_rfc1001_len(&smb->hdr, wdata->bytes + 1);
- put_bcc(wdata->bytes + 1, &smb->hdr);
+ inc_rfc1001_len(&smb->hdr, wdata->subreq.len + 1);
+ put_bcc(wdata->subreq.len + 1, &smb->hdr);
} else {
/* wct == 12 */
struct smb_com_writex_req *smbw =
(struct smb_com_writex_req *)smb;
- inc_rfc1001_len(&smbw->hdr, wdata->bytes + 5);
- put_bcc(wdata->bytes + 5, &smbw->hdr);
+ inc_rfc1001_len(&smbw->hdr, wdata->subreq.len + 5);
+ put_bcc(wdata->subreq.len + 5, &smbw->hdr);
iov[1].iov_len += 4; /* pad bigger by four bytes */
}
- kref_get(&wdata->refcount);
rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
cifs_writev_callback, NULL, wdata, 0, NULL);
-
+ /* Can't touch wdata if rc == 0 */
if (rc == 0)
cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
- else
- kref_put(&wdata->refcount, release);
async_writev_out:
cifs_small_buf_release(smb);
- return rc;
+out:
+ if (rc) {
+ add_credits_and_wake_if(wdata->server, &wdata->credits, 0);
+ cifs_write_subrequest_terminated(wdata, rc, false);
+ }
}
int
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 9be37d0fe724..4c981ce89f8a 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -36,133 +36,323 @@
#include "fs_context.h"
#include "cifs_ioctl.h"
#include "cached_dir.h"
+#include <trace/events/netfs.h>
+
+static int cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush);
/*
- * Remove the dirty flags from a span of pages.
+ * Prepare a subrequest to upload to the server. We need to allocate credits
+ * so that we know the maximum amount of data that we can include in it.
*/
-static void cifs_undirty_folios(struct inode *inode, loff_t start, unsigned int len)
+static void cifs_prepare_write(struct netfs_io_subrequest *subreq)
{
- struct address_space *mapping = inode->i_mapping;
- struct folio *folio;
- pgoff_t end;
+ struct cifs_io_subrequest *wdata =
+ container_of(subreq, struct cifs_io_subrequest, subreq);
+ struct cifs_io_request *req = wdata->req;
+ struct TCP_Server_Info *server;
+ struct cifsFileInfo *open_file = req->cfile;
+ size_t wsize = req->rreq.wsize;
+ int rc;
- XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
+ if (!wdata->have_xid) {
+ wdata->xid = get_xid();
+ wdata->have_xid = true;
+ }
- rcu_read_lock();
+ server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
+ wdata->server = server;
- end = (start + len - 1) / PAGE_SIZE;
- xas_for_each_marked(&xas, folio, end, PAGECACHE_TAG_DIRTY) {
- if (xas_retry(&xas, folio))
- continue;
- xas_pause(&xas);
- rcu_read_unlock();
- folio_lock(folio);
- folio_clear_dirty_for_io(folio);
- folio_unlock(folio);
- rcu_read_lock();
+retry:
+ if (open_file->invalidHandle) {
+ rc = cifs_reopen_file(open_file, false);
+ if (rc < 0) {
+ if (rc == -EAGAIN)
+ goto retry;
+ subreq->error = rc;
+ return netfs_prepare_write_failed(subreq);
+ }
+ }
+
+ rc = server->ops->wait_mtu_credits(server, wsize, &wdata->subreq.max_len,
+ &wdata->credits);
+ if (rc < 0) {
+ subreq->error = rc;
+ return netfs_prepare_write_failed(subreq);
}
- rcu_read_unlock();
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (server->smbd_conn)
+ subreq->max_nr_segs = server->smbd_conn->max_frmr_depth;
+#endif
}
/*
- * Completion of write to server.
+ * Issue a subrequest to upload to the server.
*/
-void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len)
+static void cifs_issue_write(struct netfs_io_subrequest *subreq)
{
- struct address_space *mapping = inode->i_mapping;
- struct folio *folio;
- pgoff_t end;
+ struct cifs_io_subrequest *wdata =
+ container_of(subreq, struct cifs_io_subrequest, subreq);
+ struct cifs_sb_info *sbi = CIFS_SB(subreq->rreq->inode->i_sb);
+ int rc;
- XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
+ if (cifs_forced_shutdown(sbi)) {
+ rc = -EIO;
+ goto fail;
+ }
- if (!len)
- return;
+ rc = adjust_credits(wdata->server, &wdata->credits, wdata->subreq.len);
+ if (rc)
+ goto fail;
- rcu_read_lock();
+ rc = -EAGAIN;
+ if (wdata->req->cfile->invalidHandle)
+ goto fail;
- end = (start + len - 1) / PAGE_SIZE;
- xas_for_each(&xas, folio, end) {
- if (xas_retry(&xas, folio))
- continue;
- if (!folio_test_writeback(folio)) {
- WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio->index, end);
- continue;
- }
+ wdata->server->ops->async_writev(wdata);
+out:
+ return;
+
+fail:
+ if (rc == -EAGAIN)
+ trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+ else
+ trace_netfs_sreq(subreq, netfs_sreq_trace_fail);
+ add_credits_and_wake_if(wdata->server, &wdata->credits, 0);
+ cifs_write_subrequest_terminated(wdata, rc, false);
+ goto out;
+}
+
+/*
+ * Split the read up according to how many credits we can get for each piece.
+ * It's okay to sleep here if we need to wait for more credit to become
+ * available.
+ *
+ * We also choose the server and allocate an operation ID to be cleaned up
+ * later.
+ */
+static bool cifs_clamp_length(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct TCP_Server_Info *server;
+ struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
+ struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
+ size_t rsize = 0;
+ int rc;
+
+ rdata->xid = get_xid();
+ rdata->have_xid = true;
+
+ server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses);
+ rdata->server = server;
+
+ if (cifs_sb->ctx->rsize == 0)
+ cifs_sb->ctx->rsize =
+ server->ops->negotiate_rsize(tlink_tcon(req->cfile->tlink),
+ cifs_sb->ctx);
- folio_detach_private(folio);
- folio_end_writeback(folio);
+
+ rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize,
+ &rdata->credits);
+ if (rc) {
+ subreq->error = rc;
+ return false;
}
- rcu_read_unlock();
+ subreq->len = min_t(size_t, subreq->len, rsize);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (server->smbd_conn)
+ subreq->max_nr_segs = server->smbd_conn->max_frmr_depth;
+#endif
+ return true;
}
/*
- * Failure of write to server.
+ * Issue a read operation on behalf of the netfs helper functions. We're asked
+ * to make a read of a certain size at a point in the file. We are permitted
+ * to only read a portion of that, but as long as we read something, the netfs
+ * helper will call us again so that we can issue another read.
*/
-void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len)
+static void cifs_req_issue_read(struct netfs_io_subrequest *subreq)
{
- struct address_space *mapping = inode->i_mapping;
- struct folio *folio;
- pgoff_t end;
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
+ struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
+ pid_t pid;
+ int rc = 0;
- XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+ pid = req->cfile->pid;
+ else
+ pid = current->tgid; // Ummm... This may be a workqueue
- if (!len)
- return;
+ cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n",
+ __func__, rreq->debug_id, subreq->debug_index, rreq->mapping,
+ subreq->transferred, subreq->len);
- rcu_read_lock();
+ if (req->cfile->invalidHandle) {
+ do {
+ rc = cifs_reopen_file(req->cfile, true);
+ } while (rc == -EAGAIN);
+ if (rc)
+ goto out;
+ }
- end = (start + len - 1) / PAGE_SIZE;
- xas_for_each(&xas, folio, end) {
- if (xas_retry(&xas, folio))
- continue;
- if (!folio_test_writeback(folio)) {
- WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio->index, end);
- continue;
- }
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ rdata->pid = pid;
- folio_set_error(folio);
- folio_end_writeback(folio);
+ rc = adjust_credits(rdata->server, &rdata->credits, rdata->subreq.len);
+ if (!rc) {
+ if (rdata->req->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = rdata->server->ops->async_readv(rdata);
}
- rcu_read_unlock();
+out:
+ if (rc)
+ netfs_subreq_terminated(subreq, rc, false);
}
/*
- * Redirty pages after a temporary failure.
+ * Writeback calls this when it finds a folio that needs uploading. This isn't
+ * called if writeback only has copy-to-cache to deal with.
*/
-void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int len)
+static void cifs_begin_writeback(struct netfs_io_request *wreq)
{
- struct address_space *mapping = inode->i_mapping;
- struct folio *folio;
- pgoff_t end;
-
- XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
+ struct cifs_io_request *req = container_of(wreq, struct cifs_io_request, rreq);
+ int ret;
- if (!len)
+ ret = cifs_get_writable_file(CIFS_I(wreq->inode), FIND_WR_ANY, &req->cfile);
+ if (ret) {
+ cifs_dbg(VFS, "No writable handle in writepages ret=%d\n", ret);
return;
+ }
- rcu_read_lock();
+ wreq->io_streams[0].avail = true;
+}
- end = (start + len - 1) / PAGE_SIZE;
- xas_for_each(&xas, folio, end) {
- if (!folio_test_writeback(folio)) {
- WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio->index, end);
- continue;
- }
+/*
+ * Initialise a request.
+ */
+static int cifs_init_request(struct netfs_io_request *rreq, struct file *file)
+{
+ struct cifs_io_request *req = container_of(rreq, struct cifs_io_request, rreq);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
+ struct cifsFileInfo *open_file = NULL;
- filemap_dirty_folio(folio->mapping, folio);
- folio_end_writeback(folio);
+ rreq->rsize = cifs_sb->ctx->rsize;
+ rreq->wsize = cifs_sb->ctx->wsize;
+
+ if (file) {
+ open_file = file->private_data;
+ rreq->netfs_priv = file->private_data;
+ req->cfile = cifsFileInfo_get(open_file);
+ } else if (rreq->origin != NETFS_WRITEBACK) {
+ WARN_ON_ONCE(1);
+ return -EIO;
}
- rcu_read_unlock();
+ return 0;
}
/*
+ * Expand the size of a readahead to the size of the rsize, if at least as
+ * large as a page, allowing for the possibility that rsize is not pow-2
+ * aligned.
+ */
+static void cifs_expand_readahead(struct netfs_io_request *rreq)
+{
+	loff_t eof = i_size_read(rreq->inode);
+	unsigned int unit = rreq->rsize;
+	loff_t lead;
+
+	/* An rsize smaller than a page gives us nothing useful to align to. */
+	if (unit < PAGE_SIZE)
+		return;
+
+	/*
+	 * Work in power-of-two units so that the mask arithmetic below is
+	 * valid.  An rsize at or above INT_MAX is pinned to 2^30 instead of
+	 * being rounded up.
+	 */
+	if (unit >= INT_MAX)
+		unit = ((unsigned int)INT_MAX + 1) / 2;
+	else
+		unit = roundup_pow_of_two(unit);
+
+	/* Pull the start back to a unit boundary, growing the length to match. */
+	lead = rreq->start & (unit - 1);
+	if (lead) {
+		rreq->start -= lead;
+		rreq->len += lead;
+	}
+
+	/*
+	 * Round the length up to whole units, then, if the request begins
+	 * before the EOF, clip it so that it does not extend past the EOF.
+	 */
+	rreq->len = round_up(rreq->len, unit);
+	if (rreq->start < eof) {
+		if (rreq->len > eof - rreq->start)
+			rreq->len = eof - rreq->start;
+	}
+}
+
+/*
+ * Completion of a request operation.
+ */
+static void cifs_rreq_done(struct netfs_io_request *rreq)
+{
+	struct timespec64 atime, mtime;
+	struct inode *inode = rreq->inode;
+
+	/*
+	 * Stamp the access time with the current time, but we do not want
+	 * atime to be less than mtime, it broke some apps.  Fall back to
+	 * mtime only when the new atime is strictly earlier: testing the
+	 * comparison result for non-zero would also overwrite an atime that
+	 * is later than mtime.
+	 */
+	atime = inode_set_atime_to_ts(inode, current_time(inode));
+	mtime = inode_get_mtime(inode);
+	if (timespec64_compare(&atime, &mtime) < 0)
+		inode_set_atime_to_ts(inode, mtime);
+}
+
+static void cifs_post_modify(struct inode *inode)
+{
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
+
+	/*
+	 * Close is deferred, so flag the inode to indicate that ctime and
+	 * mtime need to be updated.
+	 */
+	set_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags);
+}
+
+static void cifs_free_request(struct netfs_io_request *rreq)
+{
+	struct cifs_io_request *req = container_of(rreq, struct cifs_io_request, rreq);
+	struct cifsFileInfo *cfile = req->cfile;
+
+	/* No file handle was attached to this request, so nothing to put. */
+	if (!cfile)
+		return;
+
+	cifsFileInfo_put(cfile);
+}
+
+static void cifs_free_subrequest(struct netfs_io_subrequest *subreq)
+{
+	struct cifs_io_subrequest *rdata =
+		container_of(subreq, struct cifs_io_subrequest, subreq);
+	/*
+	 * NOTE(review): rc is not read directly in this function; it is
+	 * presumably picked up by the free_xid() macro - confirm before
+	 * removing it.
+	 */
+	int rc = subreq->error;
+
+#ifdef CONFIG_CIFS_SMB_DIRECT
+	/* Only a download from the server has its mr deregistered here. */
+	if (rdata->subreq.source == NETFS_DOWNLOAD_FROM_SERVER && rdata->mr) {
+		smbd_deregister_mr(rdata->mr);
+		rdata->mr = NULL;
+	}
+#endif
+
+	/*
+	 * Pass the credits recorded on the subrequest back to the server
+	 * and free the xid if one was allocated for it.
+	 */
+	add_credits_and_wake_if(rdata->server, &rdata->credits, 0);
+	if (rdata->have_xid)
+		free_xid(rdata->xid);
+}
+
+/* The netfs operations table for cifs, wiring up the hooks defined above. */
+const struct netfs_request_ops cifs_req_ops = {
+	/* Pools from which the request and subrequest structs are drawn */
+	.request_pool		= &cifs_io_request_pool,
+	.subrequest_pool	= &cifs_io_subrequest_pool,
+
+	/* Request and subrequest setup, completion and teardown */
+	.init_request		= cifs_init_request,
+	.done			= cifs_rreq_done,
+	.free_request		= cifs_free_request,
+	.free_subrequest	= cifs_free_subrequest,
+
+	/* Read hooks */
+	.expand_readahead	= cifs_expand_readahead,
+	.clamp_length		= cifs_clamp_length,
+	.issue_read		= cifs_req_issue_read,
+
+	/* Write and writeback hooks */
+	.post_modify		= cifs_post_modify,
+	.begin_writeback	= cifs_begin_writeback,
+	.prepare_write		= cifs_prepare_write,
+	.issue_write		= cifs_issue_write,
+};
+
+/*
* Mark as invalid, all open files on tree connections since they
* were closed when session to server was lost.
*/
@@ -2207,102 +2397,20 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
return rc;
}
-/*
- * update the file size (if needed) after a write. Should be called with
- * the inode->i_lock held
- */
-void
-cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
- unsigned int bytes_written)
-{
- loff_t end_of_write = offset + bytes_written;
-
- if (end_of_write > cifsi->netfs.remote_i_size)
- netfs_resize_file(&cifsi->netfs, end_of_write, true);
-}
-
-static ssize_t
-cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
- size_t write_size, loff_t *offset)
+void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result,
+ bool was_async)
{
- int rc = 0;
- unsigned int bytes_written = 0;
- unsigned int total_written;
- struct cifs_tcon *tcon;
- struct TCP_Server_Info *server;
- unsigned int xid;
- struct dentry *dentry = open_file->dentry;
- struct cifsInodeInfo *cifsi = CIFS_I(d_inode(dentry));
- struct cifs_io_parms io_parms = {0};
+ struct netfs_io_request *wreq = wdata->rreq;
+ loff_t new_server_eof;
- cifs_dbg(FYI, "write %zd bytes to offset %lld of %pd\n",
- write_size, *offset, dentry);
+ if (result > 0) {
+ new_server_eof = wdata->subreq.start + wdata->subreq.transferred + result;
- tcon = tlink_tcon(open_file->tlink);
- server = tcon->ses->server;
-
- if (!server->ops->sync_write)
- return -ENOSYS;
-
- xid = get_xid();
-
- for (total_written = 0; write_size > total_written;
- total_written += bytes_written) {
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
- struct kvec iov[2];
- unsigned int len;
-
- if (open_file->invalidHandle) {
- /* we could deadlock if we called
- filemap_fdatawait from here so tell
- reopen_file not to flush data to
- server now */
- rc = cifs_reopen_file(open_file, false);
- if (rc != 0)
- break;
- }
-
- len = min(server->ops->wp_retry_size(d_inode(dentry)),
- (unsigned int)write_size - total_written);
- /* iov[0] is reserved for smb header */
- iov[1].iov_base = (char *)write_data + total_written;
- iov[1].iov_len = len;
- io_parms.pid = pid;
- io_parms.tcon = tcon;
- io_parms.offset = *offset;
- io_parms.length = len;
- rc = server->ops->sync_write(xid, &open_file->fid,
- &io_parms, &bytes_written, iov, 1);
- }
- if (rc || (bytes_written == 0)) {
- if (total_written)
- break;
- else {
- free_xid(xid);
- return rc;
- }
- } else {
- spin_lock(&d_inode(dentry)->i_lock);
- cifs_update_eof(cifsi, *offset, bytes_written);
- spin_unlock(&d_inode(dentry)->i_lock);
- *offset += bytes_written;
- }
+ if (new_server_eof > netfs_inode(wreq->inode)->remote_i_size)
+ netfs_resize_file(netfs_inode(wreq->inode), new_server_eof, true);
}
- cifs_stats_bytes_written(tcon, total_written);
-
- if (total_written > 0) {
- spin_lock(&d_inode(dentry)->i_lock);
- if (*offset > d_inode(dentry)->i_size) {
- i_size_write(d_inode(dentry), *offset);
- d_inode(dentry)->i_blocks = (512 - 1 + *offset) >> 9;
- }
- spin_unlock(&d_inode(dentry)->i_lock);
- }
- mark_inode_dirty_sync(d_inode(dentry));
- free_xid(xid);
- return total_written;
+ netfs_write_subrequest_terminated(&wdata->subreq, result, was_async);
}
struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
@@ -2509,737 +2617,9 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
return -ENOENT;
}
-void
-cifs_writedata_release(struct kref *refcount)
-{
- struct cifs_writedata *wdata = container_of(refcount,
- struct cifs_writedata, refcount);
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (wdata->mr) {
- smbd_deregister_mr(wdata->mr);
- wdata->mr = NULL;
- }
-#endif
-
- if (wdata->cfile)
- cifsFileInfo_put(wdata->cfile);
-
- kfree(wdata);
-}
-
-/*
- * Write failed with a retryable error. Resend the write request. It's also
- * possible that the page was redirtied so re-clean the page.
- */
-static void
-cifs_writev_requeue(struct cifs_writedata *wdata)
-{
- int rc = 0;
- struct inode *inode = d_inode(wdata->cfile->dentry);
- struct TCP_Server_Info *server;
- unsigned int rest_len = wdata->bytes;
- loff_t fpos = wdata->offset;
-
- server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- do {
- struct cifs_writedata *wdata2;
- unsigned int wsize, cur_len;
-
- wsize = server->ops->wp_retry_size(inode);
- if (wsize < rest_len) {
- if (wsize < PAGE_SIZE) {
- rc = -EOPNOTSUPP;
- break;
- }
- cur_len = min(round_down(wsize, PAGE_SIZE), rest_len);
- } else {
- cur_len = rest_len;
- }
-
- wdata2 = cifs_writedata_alloc(cifs_writev_complete);
- if (!wdata2) {
- rc = -ENOMEM;
- break;
- }
-
- wdata2->sync_mode = wdata->sync_mode;
- wdata2->offset = fpos;
- wdata2->bytes = cur_len;
- wdata2->iter = wdata->iter;
-
- iov_iter_advance(&wdata2->iter, fpos - wdata->offset);
- iov_iter_truncate(&wdata2->iter, wdata2->bytes);
-
- if (iov_iter_is_xarray(&wdata2->iter))
- /* Check for pages having been redirtied and clean
- * them. We can do this by walking the xarray. If
- * it's not an xarray, then it's a DIO and we shouldn't
- * be mucking around with the page bits.
- */
- cifs_undirty_folios(inode, fpos, cur_len);
-
- rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY,
- &wdata2->cfile);
- if (!wdata2->cfile) {
- cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n",
- rc);
- if (!is_retryable_error(rc))
- rc = -EBADF;
- } else {
- wdata2->pid = wdata2->cfile->pid;
- rc = server->ops->async_writev(wdata2,
- cifs_writedata_release);
- }
-
- kref_put(&wdata2->refcount, cifs_writedata_release);
- if (rc) {
- if (is_retryable_error(rc))
- continue;
- fpos += cur_len;
- rest_len -= cur_len;
- break;
- }
-
- fpos += cur_len;
- rest_len -= cur_len;
- } while (rest_len > 0);
-
- /* Clean up remaining pages from the original wdata */
- if (iov_iter_is_xarray(&wdata->iter))
- cifs_pages_write_failed(inode, fpos, rest_len);
-
- if (rc != 0 && !is_retryable_error(rc))
- mapping_set_error(inode->i_mapping, rc);
- kref_put(&wdata->refcount, cifs_writedata_release);
-}
-
-void
-cifs_writev_complete(struct work_struct *work)
-{
- struct cifs_writedata *wdata = container_of(work,
- struct cifs_writedata, work);
- struct inode *inode = d_inode(wdata->cfile->dentry);
-
- if (wdata->result == 0) {
- spin_lock(&inode->i_lock);
- cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes);
- spin_unlock(&inode->i_lock);
- cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink),
- wdata->bytes);
- } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN)
- return cifs_writev_requeue(wdata);
-
- if (wdata->result == -EAGAIN)
- cifs_pages_write_redirty(inode, wdata->offset, wdata->bytes);
- else if (wdata->result < 0)
- cifs_pages_write_failed(inode, wdata->offset, wdata->bytes);
- else
- cifs_pages_written_back(inode, wdata->offset, wdata->bytes);
-
- if (wdata->result != -EAGAIN)
- mapping_set_error(inode->i_mapping, wdata->result);
- kref_put(&wdata->refcount, cifs_writedata_release);
-}
-
-struct cifs_writedata *cifs_writedata_alloc(work_func_t complete)
-{
- struct cifs_writedata *wdata;
-
- wdata = kzalloc(sizeof(*wdata), GFP_NOFS);
- if (wdata != NULL) {
- kref_init(&wdata->refcount);
- INIT_LIST_HEAD(&wdata->list);
- init_completion(&wdata->done);
- INIT_WORK(&wdata->work, complete);
- }
- return wdata;
-}
-
-static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
-{
- struct address_space *mapping = page->mapping;
- loff_t offset = (loff_t)page->index << PAGE_SHIFT;
- char *write_data;
- int rc = -EFAULT;
- int bytes_written = 0;
- struct inode *inode;
- struct cifsFileInfo *open_file;
-
- if (!mapping || !mapping->host)
- return -EFAULT;
-
- inode = page->mapping->host;
-
- offset += (loff_t)from;
- write_data = kmap(page);
- write_data += from;
-
- if ((to > PAGE_SIZE) || (from > to)) {
- kunmap(page);
- return -EIO;
- }
-
- /* racing with truncate? */
- if (offset > mapping->host->i_size) {
- kunmap(page);
- return 0; /* don't care */
- }
-
- /* check to make sure that we are not extending the file */
- if (mapping->host->i_size - offset < (loff_t)to)
- to = (unsigned)(mapping->host->i_size - offset);
-
- rc = cifs_get_writable_file(CIFS_I(mapping->host), FIND_WR_ANY,
- &open_file);
- if (!rc) {
- bytes_written = cifs_write(open_file, open_file->pid,
- write_data, to - from, &offset);
- cifsFileInfo_put(open_file);
- /* Does mm or vfs already set times? */
- simple_inode_init_ts(inode);
- if ((bytes_written > 0) && (offset))
- rc = 0;
- else if (bytes_written < 0)
- rc = bytes_written;
- else
- rc = -EFAULT;
- } else {
- cifs_dbg(FYI, "No writable handle for write page rc=%d\n", rc);
- if (!is_retryable_error(rc))
- rc = -EIO;
- }
-
- kunmap(page);
- return rc;
-}
-
/*
- * Extend the region to be written back to include subsequent contiguously
- * dirty pages if possible, but don't sleep while doing so.
+ * Flush data on a strict file.
*/
-static void cifs_extend_writeback(struct address_space *mapping,
- struct xa_state *xas,
- long *_count,
- loff_t start,
- int max_pages,
- loff_t max_len,
- size_t *_len)
-{
- struct folio_batch batch;
- struct folio *folio;
- unsigned int nr_pages;
- pgoff_t index = (start + *_len) / PAGE_SIZE;
- size_t len;
- bool stop = true;
- unsigned int i;
-
- folio_batch_init(&batch);
-
- do {
- /* Firstly, we gather up a batch of contiguous dirty pages
- * under the RCU read lock - but we can't clear the dirty flags
- * there if any of those pages are mapped.
- */
- rcu_read_lock();
-
- xas_for_each(xas, folio, ULONG_MAX) {
- stop = true;
- if (xas_retry(xas, folio))
- continue;
- if (xa_is_value(folio))
- break;
- if (folio->index != index) {
- xas_reset(xas);
- break;
- }
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(xas);
- continue;
- }
- nr_pages = folio_nr_pages(folio);
- if (nr_pages > max_pages) {
- xas_reset(xas);
- break;
- }
-
- /* Has the page moved or been split? */
- if (unlikely(folio != xas_reload(xas))) {
- folio_put(folio);
- xas_reset(xas);
- break;
- }
-
- if (!folio_trylock(folio)) {
- folio_put(folio);
- xas_reset(xas);
- break;
- }
- if (!folio_test_dirty(folio) ||
- folio_test_writeback(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- xas_reset(xas);
- break;
- }
-
- max_pages -= nr_pages;
- len = folio_size(folio);
- stop = false;
-
- index += nr_pages;
- *_count -= nr_pages;
- *_len += len;
- if (max_pages <= 0 || *_len >= max_len || *_count <= 0)
- stop = true;
-
- if (!folio_batch_add(&batch, folio))
- break;
- if (stop)
- break;
- }
-
- xas_pause(xas);
- rcu_read_unlock();
-
- /* Now, if we obtained any pages, we can shift them to being
- * writable and mark them for caching.
- */
- if (!folio_batch_count(&batch))
- break;
-
- for (i = 0; i < folio_batch_count(&batch); i++) {
- folio = batch.folios[i];
- /* The folio should be locked, dirty and not undergoing
- * writeback from the loop above.
- */
- if (!folio_clear_dirty_for_io(folio))
- WARN_ON(1);
- folio_start_writeback(folio);
- folio_unlock(folio);
- }
-
- folio_batch_release(&batch);
- cond_resched();
- } while (!stop);
-}
-
-/*
- * Write back the locked page and any subsequent non-locked dirty pages.
- */
-static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
- struct writeback_control *wbc,
- struct xa_state *xas,
- struct folio *folio,
- unsigned long long start,
- unsigned long long end)
-{
- struct inode *inode = mapping->host;
- struct TCP_Server_Info *server;
- struct cifs_writedata *wdata;
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- struct cifs_credits credits_on_stack;
- struct cifs_credits *credits = &credits_on_stack;
- struct cifsFileInfo *cfile = NULL;
- unsigned long long i_size = i_size_read(inode), max_len;
- unsigned int xid, wsize;
- size_t len = folio_size(folio);
- long count = wbc->nr_to_write;
- int rc;
-
- /* The folio should be locked, dirty and not undergoing writeback. */
- if (!folio_clear_dirty_for_io(folio))
- WARN_ON_ONCE(1);
- folio_start_writeback(folio);
-
- count -= folio_nr_pages(folio);
-
- xid = get_xid();
- server = cifs_pick_channel(cifs_sb_master_tcon(cifs_sb)->ses);
-
- rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile);
- if (rc) {
- cifs_dbg(VFS, "No writable handle in writepages rc=%d\n", rc);
- goto err_xid;
- }
-
- rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize,
- &wsize, credits);
- if (rc != 0)
- goto err_close;
-
- wdata = cifs_writedata_alloc(cifs_writev_complete);
- if (!wdata) {
- rc = -ENOMEM;
- goto err_uncredit;
- }
-
- wdata->sync_mode = wbc->sync_mode;
- wdata->offset = folio_pos(folio);
- wdata->pid = cfile->pid;
- wdata->credits = credits_on_stack;
- wdata->cfile = cfile;
- wdata->server = server;
- cfile = NULL;
-
- /* Find all consecutive lockable dirty pages that have contiguous
- * written regions, stopping when we find a page that is not
- * immediately lockable, is not dirty or is missing, or we reach the
- * end of the range.
- */
- if (start < i_size) {
- /* Trim the write to the EOF; the extra data is ignored. Also
- * put an upper limit on the size of a single storedata op.
- */
- max_len = wsize;
- max_len = min_t(unsigned long long, max_len, end - start + 1);
- max_len = min_t(unsigned long long, max_len, i_size - start);
-
- if (len < max_len) {
- int max_pages = INT_MAX;
-
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->smbd_conn)
- max_pages = server->smbd_conn->max_frmr_depth;
-#endif
- max_pages -= folio_nr_pages(folio);
-
- if (max_pages > 0)
- cifs_extend_writeback(mapping, xas, &count, start,
- max_pages, max_len, &len);
- }
- }
- len = min_t(unsigned long long, len, i_size - start);
-
- /* We now have a contiguous set of dirty pages, each with writeback
- * set; the first page is still locked at this point, but all the rest
- * have been unlocked.
- */
- folio_unlock(folio);
- wdata->bytes = len;
-
- if (start < i_size) {
- iov_iter_xarray(&wdata->iter, ITER_SOURCE, &mapping->i_pages,
- start, len);
-
- rc = adjust_credits(wdata->server, &wdata->credits, wdata->bytes);
- if (rc)
- goto err_wdata;
-
- if (wdata->cfile->invalidHandle)
- rc = -EAGAIN;
- else
- rc = wdata->server->ops->async_writev(wdata,
- cifs_writedata_release);
- if (rc >= 0) {
- kref_put(&wdata->refcount, cifs_writedata_release);
- goto err_close;
- }
- } else {
- /* The dirty region was entirely beyond the EOF. */
- cifs_pages_written_back(inode, start, len);
- rc = 0;
- }
-
-err_wdata:
- kref_put(&wdata->refcount, cifs_writedata_release);
-err_uncredit:
- add_credits_and_wake_if(server, credits, 0);
-err_close:
- if (cfile)
- cifsFileInfo_put(cfile);
-err_xid:
- free_xid(xid);
- if (rc == 0) {
- wbc->nr_to_write = count;
- rc = len;
- } else if (is_retryable_error(rc)) {
- cifs_pages_write_redirty(inode, start, len);
- } else {
- cifs_pages_write_failed(inode, start, len);
- mapping_set_error(mapping, rc);
- }
- /* Indication to update ctime and mtime as close is deferred */
- set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
- return rc;
-}
-
-/*
- * write a region of pages back to the server
- */
-static ssize_t cifs_writepages_begin(struct address_space *mapping,
- struct writeback_control *wbc,
- struct xa_state *xas,
- unsigned long long *_start,
- unsigned long long end)
-{
- struct folio *folio;
- unsigned long long start = *_start;
- ssize_t ret;
- int skips = 0;
-
-search_again:
- /* Find the first dirty page. */
- rcu_read_lock();
-
- for (;;) {
- folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
- if (xas_retry(xas, folio) || xa_is_value(folio))
- continue;
- if (!folio)
- break;
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(xas);
- continue;
- }
-
- if (unlikely(folio != xas_reload(xas))) {
- folio_put(folio);
- xas_reset(xas);
- continue;
- }
-
- xas_pause(xas);
- break;
- }
- rcu_read_unlock();
- if (!folio)
- return 0;
-
- start = folio_pos(folio); /* May regress with THPs */
-
- /* At this point we hold neither the i_pages lock nor the page lock:
- * the page may be truncated or invalidated (changing page->mapping to
- * NULL), or even swizzled back from swapper_space to tmpfs file
- * mapping
- */
-lock_again:
- if (wbc->sync_mode != WB_SYNC_NONE) {
- ret = folio_lock_killable(folio);
- if (ret < 0)
- return ret;
- } else {
- if (!folio_trylock(folio))
- goto search_again;
- }
-
- if (folio->mapping != mapping ||
- !folio_test_dirty(folio)) {
- start += folio_size(folio);
- folio_unlock(folio);
- goto search_again;
- }
-
- if (folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- if (wbc->sync_mode != WB_SYNC_NONE) {
- folio_wait_writeback(folio);
-#ifdef CONFIG_CIFS_FSCACHE
- folio_wait_fscache(folio);
-#endif
- goto lock_again;
- }
-
- start += folio_size(folio);
- if (wbc->sync_mode == WB_SYNC_NONE) {
- if (skips >= 5 || need_resched()) {
- ret = 0;
- goto out;
- }
- skips++;
- }
- goto search_again;
- }
-
- ret = cifs_write_back_from_locked_folio(mapping, wbc, xas, folio, start, end);
-out:
- if (ret > 0)
- *_start = start + ret;
- return ret;
-}
-
-/*
- * Write a region of pages back to the server
- */
-static int cifs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- unsigned long long *_start,
- unsigned long long end)
-{
- ssize_t ret;
-
- XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
-
- do {
- ret = cifs_writepages_begin(mapping, wbc, &xas, _start, end);
- if (ret > 0 && wbc->nr_to_write > 0)
- cond_resched();
- } while (ret > 0 && wbc->nr_to_write > 0);
-
- return ret > 0 ? 0 : ret;
-}
-
-/*
- * Write some of the pending data back to the server
- */
-static int cifs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- loff_t start, end;
- int ret;
-
- /* We have to be careful as we can end up racing with setattr()
- * truncating the pagecache since the caller doesn't take a lock here
- * to prevent it.
- */
-
- if (wbc->range_cyclic && mapping->writeback_index) {
- start = mapping->writeback_index * PAGE_SIZE;
- ret = cifs_writepages_region(mapping, wbc, &start, LLONG_MAX);
- if (ret < 0)
- goto out;
-
- if (wbc->nr_to_write <= 0) {
- mapping->writeback_index = start / PAGE_SIZE;
- goto out;
- }
-
- start = 0;
- end = mapping->writeback_index * PAGE_SIZE;
- mapping->writeback_index = 0;
- ret = cifs_writepages_region(mapping, wbc, &start, end);
- if (ret == 0)
- mapping->writeback_index = start / PAGE_SIZE;
- } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- start = 0;
- ret = cifs_writepages_region(mapping, wbc, &start, LLONG_MAX);
- if (wbc->nr_to_write > 0 && ret == 0)
- mapping->writeback_index = start / PAGE_SIZE;
- } else {
- start = wbc->range_start;
- ret = cifs_writepages_region(mapping, wbc, &start, wbc->range_end);
- }
-
-out:
- return ret;
-}
-
-static int
-cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
-{
- int rc;
- unsigned int xid;
-
- xid = get_xid();
-/* BB add check for wbc flags */
- get_page(page);
- if (!PageUptodate(page))
- cifs_dbg(FYI, "ppw - page not up to date\n");
-
- /*
- * Set the "writeback" flag, and clear "dirty" in the radix tree.
- *
- * A writepage() implementation always needs to do either this,
- * or re-dirty the page with "redirty_page_for_writepage()" in
- * the case of a failure.
- *
- * Just unlocking the page will cause the radix tree tag-bits
- * to fail to update with the state of the page correctly.
- */
- set_page_writeback(page);
-retry_write:
- rc = cifs_partialpagewrite(page, 0, PAGE_SIZE);
- if (is_retryable_error(rc)) {
- if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
- goto retry_write;
- redirty_page_for_writepage(wbc, page);
- } else if (rc != 0) {
- SetPageError(page);
- mapping_set_error(page->mapping, rc);
- } else {
- SetPageUptodate(page);
- }
- end_page_writeback(page);
- put_page(page);
- free_xid(xid);
- return rc;
-}
-
-static int cifs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- int rc;
- struct inode *inode = mapping->host;
- struct cifsFileInfo *cfile = file->private_data;
- struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
- struct folio *folio = page_folio(page);
- __u32 pid;
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
- pid = cfile->pid;
- else
- pid = current->tgid;
-
- cifs_dbg(FYI, "write_end for page %p from pos %lld with %d bytes\n",
- page, pos, copied);
-
- if (folio_test_checked(folio)) {
- if (copied == len)
- folio_mark_uptodate(folio);
- folio_clear_checked(folio);
- } else if (!folio_test_uptodate(folio) && copied == PAGE_SIZE)
- folio_mark_uptodate(folio);
-
- if (!folio_test_uptodate(folio)) {
- char *page_data;
- unsigned offset = pos & (PAGE_SIZE - 1);
- unsigned int xid;
-
- xid = get_xid();
- /* this is probably better than directly calling
- partialpage_write since in this function the file handle is
- known which we might as well leverage */
- /* BB check if anything else missing out of ppw
- such as updating last write time */
- page_data = kmap(page);
- rc = cifs_write(cfile, pid, page_data + offset, copied, &pos);
- /* if (rc < 0) should we set writebehind rc? */
- kunmap(page);
-
- free_xid(xid);
- } else {
- rc = copied;
- pos += copied;
- set_page_dirty(page);
- }
-
- if (rc > 0) {
- spin_lock(&inode->i_lock);
- if (pos > inode->i_size) {
- loff_t additional_blocks = (512 - 1 + copied) >> 9;
-
- i_size_write(inode, pos);
- /*
- * Estimate new allocation size based on the amount written.
- * This will be updated from server on close (and on queryinfo)
- */
- inode->i_blocks = min_t(blkcnt_t, (512 - 1 + pos) >> 9,
- inode->i_blocks + additional_blocks);
- }
- spin_unlock(&inode->i_lock);
- }
-
- unlock_page(page);
- put_page(page);
- /* Indication to update ctime and mtime as close is deferred */
- set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
-
- return rc;
-}
-
int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
@@ -3294,6 +2674,9 @@ strict_fsync_exit:
return rc;
}
+/*
+ * Flush data on a non-strict data.
+ */
int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
unsigned int xid;
@@ -3360,481 +2743,6 @@ int cifs_flush(struct file *file, fl_owner_t id)
return rc;
}
-static void
-cifs_uncached_writedata_release(struct kref *refcount)
-{
- struct cifs_writedata *wdata = container_of(refcount,
- struct cifs_writedata, refcount);
-
- kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release);
- cifs_writedata_release(refcount);
-}
-
-static void collect_uncached_write_data(struct cifs_aio_ctx *ctx);
-
-static void
-cifs_uncached_writev_complete(struct work_struct *work)
-{
- struct cifs_writedata *wdata = container_of(work,
- struct cifs_writedata, work);
- struct inode *inode = d_inode(wdata->cfile->dentry);
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
-
- spin_lock(&inode->i_lock);
- cifs_update_eof(cifsi, wdata->offset, wdata->bytes);
- if (cifsi->netfs.remote_i_size > inode->i_size)
- i_size_write(inode, cifsi->netfs.remote_i_size);
- spin_unlock(&inode->i_lock);
-
- complete(&wdata->done);
- collect_uncached_write_data(wdata->ctx);
- /* the below call can possibly free the last ref to aio ctx */
- kref_put(&wdata->refcount, cifs_uncached_writedata_release);
-}
-
-static int
-cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
- struct cifs_aio_ctx *ctx)
-{
- unsigned int wsize;
- struct cifs_credits credits;
- int rc;
- struct TCP_Server_Info *server = wdata->server;
-
- do {
- if (wdata->cfile->invalidHandle) {
- rc = cifs_reopen_file(wdata->cfile, false);
- if (rc == -EAGAIN)
- continue;
- else if (rc)
- break;
- }
-
-
- /*
- * Wait for credits to resend this wdata.
- * Note: we are attempting to resend the whole wdata not in
- * segments
- */
- do {
- rc = server->ops->wait_mtu_credits(server, wdata->bytes,
- &wsize, &credits);
- if (rc)
- goto fail;
-
- if (wsize < wdata->bytes) {
- add_credits_and_wake_if(server, &credits, 0);
- msleep(1000);
- }
- } while (wsize < wdata->bytes);
- wdata->credits = credits;
-
- rc = adjust_credits(server, &wdata->credits, wdata->bytes);
-
- if (!rc) {
- if (wdata->cfile->invalidHandle)
- rc = -EAGAIN;
- else {
- wdata->replay = true;
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (wdata->mr) {
- wdata->mr->need_invalidate = true;
- smbd_deregister_mr(wdata->mr);
- wdata->mr = NULL;
- }
-#endif
- rc = server->ops->async_writev(wdata,
- cifs_uncached_writedata_release);
- }
- }
-
- /* If the write was successfully sent, we are done */
- if (!rc) {
- list_add_tail(&wdata->list, wdata_list);
- return 0;
- }
-
- /* Roll back credits and retry if needed */
- add_credits_and_wake_if(server, &wdata->credits, 0);
- } while (rc == -EAGAIN);
-
-fail:
- kref_put(&wdata->refcount, cifs_uncached_writedata_release);
- return rc;
-}
-
-/*
- * Select span of a bvec iterator we're going to use. Limit it by both maximum
- * size and maximum number of segments.
- */
-static size_t cifs_limit_bvec_subset(const struct iov_iter *iter, size_t max_size,
- size_t max_segs, unsigned int *_nsegs)
-{
- const struct bio_vec *bvecs = iter->bvec;
- unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0;
- size_t len, span = 0, n = iter->count;
- size_t skip = iter->iov_offset;
-
- if (WARN_ON(!iov_iter_is_bvec(iter)) || n == 0)
- return 0;
-
- while (n && ix < nbv && skip) {
- len = bvecs[ix].bv_len;
- if (skip < len)
- break;
- skip -= len;
- n -= len;
- ix++;
- }
-
- while (n && ix < nbv) {
- len = min3(n, bvecs[ix].bv_len - skip, max_size);
- span += len;
- max_size -= len;
- nsegs++;
- ix++;
- if (max_size == 0 || nsegs >= max_segs)
- break;
- skip = 0;
- n -= len;
- }
-
- *_nsegs = nsegs;
- return span;
-}
-
-static int
-cifs_write_from_iter(loff_t fpos, size_t len, struct iov_iter *from,
- struct cifsFileInfo *open_file,
- struct cifs_sb_info *cifs_sb, struct list_head *wdata_list,
- struct cifs_aio_ctx *ctx)
-{
- int rc = 0;
- size_t cur_len, max_len;
- struct cifs_writedata *wdata;
- pid_t pid;
- struct TCP_Server_Info *server;
- unsigned int xid, max_segs = INT_MAX;
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
- pid = open_file->pid;
- else
- pid = current->tgid;
-
- server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
- xid = get_xid();
-
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->smbd_conn)
- max_segs = server->smbd_conn->max_frmr_depth;
-#endif
-
- do {
- struct cifs_credits credits_on_stack;
- struct cifs_credits *credits = &credits_on_stack;
- unsigned int wsize, nsegs = 0;
-
- if (signal_pending(current)) {
- rc = -EINTR;
- break;
- }
-
- if (open_file->invalidHandle) {
- rc = cifs_reopen_file(open_file, false);
- if (rc == -EAGAIN)
- continue;
- else if (rc)
- break;
- }
-
- rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize,
- &wsize, credits);
- if (rc)
- break;
-
- max_len = min_t(const size_t, len, wsize);
- if (!max_len) {
- rc = -EAGAIN;
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
- cur_len = cifs_limit_bvec_subset(from, max_len, max_segs, &nsegs);
- cifs_dbg(FYI, "write_from_iter len=%zx/%zx nsegs=%u/%lu/%u\n",
- cur_len, max_len, nsegs, from->nr_segs, max_segs);
- if (cur_len == 0) {
- rc = -EIO;
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
- wdata = cifs_writedata_alloc(cifs_uncached_writev_complete);
- if (!wdata) {
- rc = -ENOMEM;
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
- wdata->sync_mode = WB_SYNC_ALL;
- wdata->offset = (__u64)fpos;
- wdata->cfile = cifsFileInfo_get(open_file);
- wdata->server = server;
- wdata->pid = pid;
- wdata->bytes = cur_len;
- wdata->credits = credits_on_stack;
- wdata->iter = *from;
- wdata->ctx = ctx;
- kref_get(&ctx->refcount);
-
- iov_iter_truncate(&wdata->iter, cur_len);
-
- rc = adjust_credits(server, &wdata->credits, wdata->bytes);
-
- if (!rc) {
- if (wdata->cfile->invalidHandle)
- rc = -EAGAIN;
- else
- rc = server->ops->async_writev(wdata,
- cifs_uncached_writedata_release);
- }
-
- if (rc) {
- add_credits_and_wake_if(server, &wdata->credits, 0);
- kref_put(&wdata->refcount,
- cifs_uncached_writedata_release);
- if (rc == -EAGAIN)
- continue;
- break;
- }
-
- list_add_tail(&wdata->list, wdata_list);
- iov_iter_advance(from, cur_len);
- fpos += cur_len;
- len -= cur_len;
- } while (len > 0);
-
- free_xid(xid);
- return rc;
-}
-
-static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
-{
- struct cifs_writedata *wdata, *tmp;
- struct cifs_tcon *tcon;
- struct cifs_sb_info *cifs_sb;
- struct dentry *dentry = ctx->cfile->dentry;
- ssize_t rc;
-
- tcon = tlink_tcon(ctx->cfile->tlink);
- cifs_sb = CIFS_SB(dentry->d_sb);
-
- mutex_lock(&ctx->aio_mutex);
-
- if (list_empty(&ctx->list)) {
- mutex_unlock(&ctx->aio_mutex);
- return;
- }
-
- rc = ctx->rc;
- /*
- * Wait for and collect replies for any successful sends in order of
- * increasing offset. Once an error is hit, then return without waiting
- * for any more replies.
- */
-restart_loop:
- list_for_each_entry_safe(wdata, tmp, &ctx->list, list) {
- if (!rc) {
- if (!try_wait_for_completion(&wdata->done)) {
- mutex_unlock(&ctx->aio_mutex);
- return;
- }
-
- if (wdata->result)
- rc = wdata->result;
- else
- ctx->total_len += wdata->bytes;
-
- /* resend call if it's a retryable error */
- if (rc == -EAGAIN) {
- struct list_head tmp_list;
- struct iov_iter tmp_from = ctx->iter;
-
- INIT_LIST_HEAD(&tmp_list);
- list_del_init(&wdata->list);
-
- if (ctx->direct_io)
- rc = cifs_resend_wdata(
- wdata, &tmp_list, ctx);
- else {
- iov_iter_advance(&tmp_from,
- wdata->offset - ctx->pos);
-
- rc = cifs_write_from_iter(wdata->offset,
- wdata->bytes, &tmp_from,
- ctx->cfile, cifs_sb, &tmp_list,
- ctx);
-
- kref_put(&wdata->refcount,
- cifs_uncached_writedata_release);
- }
-
- list_splice(&tmp_list, &ctx->list);
- goto restart_loop;
- }
- }
- list_del_init(&wdata->list);
- kref_put(&wdata->refcount, cifs_uncached_writedata_release);
- }
-
- cifs_stats_bytes_written(tcon, ctx->total_len);
- set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags);
-
- ctx->rc = (rc == 0) ? ctx->total_len : rc;
-
- mutex_unlock(&ctx->aio_mutex);
-
- if (ctx->iocb && ctx->iocb->ki_complete)
- ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
- else
- complete(&ctx->done);
-}
-
-static ssize_t __cifs_writev(
- struct kiocb *iocb, struct iov_iter *from, bool direct)
-{
- struct file *file = iocb->ki_filp;
- ssize_t total_written = 0;
- struct cifsFileInfo *cfile;
- struct cifs_tcon *tcon;
- struct cifs_sb_info *cifs_sb;
- struct cifs_aio_ctx *ctx;
- int rc;
-
- rc = generic_write_checks(iocb, from);
- if (rc <= 0)
- return rc;
-
- cifs_sb = CIFS_FILE_SB(file);
- cfile = file->private_data;
- tcon = tlink_tcon(cfile->tlink);
-
- if (!tcon->ses->server->ops->async_writev)
- return -ENOSYS;
-
- ctx = cifs_aio_ctx_alloc();
- if (!ctx)
- return -ENOMEM;
-
- ctx->cfile = cifsFileInfo_get(cfile);
-
- if (!is_sync_kiocb(iocb))
- ctx->iocb = iocb;
-
- ctx->pos = iocb->ki_pos;
- ctx->direct_io = direct;
- ctx->nr_pinned_pages = 0;
-
- if (user_backed_iter(from)) {
- /*
- * Extract IOVEC/UBUF-type iterators to a BVEC-type iterator as
- * they contain references to the calling process's virtual
- * memory layout which won't be available in an async worker
- * thread. This also takes a pin on every folio involved.
- */
- rc = netfs_extract_user_iter(from, iov_iter_count(from),
- &ctx->iter, 0);
- if (rc < 0) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return rc;
- }
-
- ctx->nr_pinned_pages = rc;
- ctx->bv = (void *)ctx->iter.bvec;
- ctx->bv_need_unpin = iov_iter_extract_will_pin(from);
- } else if ((iov_iter_is_bvec(from) || iov_iter_is_kvec(from)) &&
- !is_sync_kiocb(iocb)) {
- /*
- * If the op is asynchronous, we need to copy the list attached
- * to a BVEC/KVEC-type iterator, but we assume that the storage
- * will be pinned by the caller; in any case, we may or may not
- * be able to pin the pages, so we don't try.
- */
- ctx->bv = (void *)dup_iter(&ctx->iter, from, GFP_KERNEL);
- if (!ctx->bv) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return -ENOMEM;
- }
- } else {
- /*
- * Otherwise, we just pass the iterator down as-is and rely on
- * the caller to make sure the pages referred to by the
- * iterator don't evaporate.
- */
- ctx->iter = *from;
- }
-
- ctx->len = iov_iter_count(&ctx->iter);
-
- /* grab a lock here due to read response handlers can access ctx */
- mutex_lock(&ctx->aio_mutex);
-
- rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &ctx->iter,
- cfile, cifs_sb, &ctx->list, ctx);
-
- /*
- * If at least one write was successfully sent, then discard any rc
- * value from the later writes. If the other write succeeds, then
- * we'll end up returning whatever was written. If it fails, then
- * we'll get a new rc value from that.
- */
- if (!list_empty(&ctx->list))
- rc = 0;
-
- mutex_unlock(&ctx->aio_mutex);
-
- if (rc) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return rc;
- }
-
- if (!is_sync_kiocb(iocb)) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return -EIOCBQUEUED;
- }
-
- rc = wait_for_completion_killable(&ctx->done);
- if (rc) {
- mutex_lock(&ctx->aio_mutex);
- ctx->rc = rc = -EINTR;
- total_written = ctx->total_len;
- mutex_unlock(&ctx->aio_mutex);
- } else {
- rc = ctx->rc;
- total_written = ctx->total_len;
- }
-
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
-
- if (unlikely(!total_written))
- return rc;
-
- iocb->ki_pos += total_written;
- return total_written;
-}
-
-ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
-
- cifs_revalidate_mapping(file->f_inode);
- return __cifs_writev(iocb, from, true);
-}
-
-ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
-{
- return __cifs_writev(iocb, from, false);
-}
-
static ssize_t
cifs_writev(struct kiocb *iocb, struct iov_iter *from)
{
@@ -3845,7 +2753,10 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
ssize_t rc;
- inode_lock(inode);
+ rc = netfs_start_io_write(inode);
+ if (rc < 0)
+ return rc;
+
/*
* We need to hold the sem to be sure nobody modifies lock list
* with a brlock that prevents writing.
@@ -3859,13 +2770,12 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from),
server->vals->exclusive_lock_type, 0,
NULL, CIFS_WRITE_OP))
- rc = __generic_file_write_iter(iocb, from);
+ rc = netfs_buffered_write_iter_locked(iocb, from, NULL);
else
rc = -EACCES;
out:
up_read(&cinode->lock_sem);
- inode_unlock(inode);
-
+ netfs_end_io_write(inode);
if (rc > 0)
rc = generic_write_sync(iocb, rc);
return rc;
@@ -3888,9 +2798,9 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
if (CIFS_CACHE_WRITE(cinode)) {
if (cap_unix(tcon->ses) &&
- (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
- && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
- written = generic_file_write_iter(iocb, from);
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
+ written = netfs_file_write_iter(iocb, from);
goto out;
}
written = cifs_writev(iocb, from);
@@ -3902,7 +2812,7 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
* affected pages because it may cause a error with mandatory locks on
* these pages but not on the region from pos to ppos+len-1.
*/
- written = cifs_user_writev(iocb, from);
+ written = netfs_file_write_iter(iocb, from);
if (CIFS_CACHE_READ(cinode)) {
/*
* We have read level caching and we have just sent a write
@@ -3921,449 +2831,55 @@ out:
return written;
}
-static struct cifs_readdata *cifs_readdata_alloc(work_func_t complete)
-{
- struct cifs_readdata *rdata;
-
- rdata = kzalloc(sizeof(*rdata), GFP_KERNEL);
- if (rdata) {
- kref_init(&rdata->refcount);
- INIT_LIST_HEAD(&rdata->list);
- init_completion(&rdata->done);
- INIT_WORK(&rdata->work, complete);
- }
-
- return rdata;
-}
-
-void
-cifs_readdata_release(struct kref *refcount)
-{
- struct cifs_readdata *rdata = container_of(refcount,
- struct cifs_readdata, refcount);
-
- if (rdata->ctx)
- kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (rdata->mr) {
- smbd_deregister_mr(rdata->mr);
- rdata->mr = NULL;
- }
-#endif
- if (rdata->cfile)
- cifsFileInfo_put(rdata->cfile);
-
- kfree(rdata);
-}
-
-static void collect_uncached_read_data(struct cifs_aio_ctx *ctx);
-
-static void
-cifs_uncached_readv_complete(struct work_struct *work)
-{
- struct cifs_readdata *rdata = container_of(work,
- struct cifs_readdata, work);
-
- complete(&rdata->done);
- collect_uncached_read_data(rdata->ctx);
- /* the below call can possibly free the last ref to aio ctx */
- kref_put(&rdata->refcount, cifs_readdata_release);
-}
-
-static int cifs_resend_rdata(struct cifs_readdata *rdata,
- struct list_head *rdata_list,
- struct cifs_aio_ctx *ctx)
+ssize_t cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- unsigned int rsize;
- struct cifs_credits credits;
- int rc;
- struct TCP_Server_Info *server;
-
- /* XXX: should we pick a new channel here? */
- server = rdata->server;
-
- do {
- if (rdata->cfile->invalidHandle) {
- rc = cifs_reopen_file(rdata->cfile, true);
- if (rc == -EAGAIN)
- continue;
- else if (rc)
- break;
- }
-
- /*
- * Wait for credits to resend this rdata.
- * Note: we are attempting to resend the whole rdata not in
- * segments
- */
- do {
- rc = server->ops->wait_mtu_credits(server, rdata->bytes,
- &rsize, &credits);
-
- if (rc)
- goto fail;
-
- if (rsize < rdata->bytes) {
- add_credits_and_wake_if(server, &credits, 0);
- msleep(1000);
- }
- } while (rsize < rdata->bytes);
- rdata->credits = credits;
-
- rc = adjust_credits(server, &rdata->credits, rdata->bytes);
- if (!rc) {
- if (rdata->cfile->invalidHandle)
- rc = -EAGAIN;
- else {
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (rdata->mr) {
- rdata->mr->need_invalidate = true;
- smbd_deregister_mr(rdata->mr);
- rdata->mr = NULL;
- }
-#endif
- rc = server->ops->async_readv(rdata);
- }
- }
-
- /* If the read was successfully sent, we are done */
- if (!rc) {
- /* Add to aio pending list */
- list_add_tail(&rdata->list, rdata_list);
- return 0;
- }
-
- /* Roll back credits and retry if needed */
- add_credits_and_wake_if(server, &rdata->credits, 0);
- } while (rc == -EAGAIN);
-
-fail:
- kref_put(&rdata->refcount, cifs_readdata_release);
- return rc;
-}
-
-static int
-cifs_send_async_read(loff_t fpos, size_t len, struct cifsFileInfo *open_file,
- struct cifs_sb_info *cifs_sb, struct list_head *rdata_list,
- struct cifs_aio_ctx *ctx)
-{
- struct cifs_readdata *rdata;
- unsigned int rsize, nsegs, max_segs = INT_MAX;
- struct cifs_credits credits_on_stack;
- struct cifs_credits *credits = &credits_on_stack;
- size_t cur_len, max_len;
- int rc;
- pid_t pid;
- struct TCP_Server_Info *server;
-
- server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
-
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->smbd_conn)
- max_segs = server->smbd_conn->max_frmr_depth;
-#endif
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
- pid = open_file->pid;
- else
- pid = current->tgid;
-
- do {
- if (open_file->invalidHandle) {
- rc = cifs_reopen_file(open_file, true);
- if (rc == -EAGAIN)
- continue;
- else if (rc)
- break;
- }
-
- if (cifs_sb->ctx->rsize == 0)
- cifs_sb->ctx->rsize =
- server->ops->negotiate_rsize(tlink_tcon(open_file->tlink),
- cifs_sb->ctx);
-
- rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
- &rsize, credits);
- if (rc)
- break;
-
- max_len = min_t(size_t, len, rsize);
-
- cur_len = cifs_limit_bvec_subset(&ctx->iter, max_len,
- max_segs, &nsegs);
- cifs_dbg(FYI, "read-to-iter len=%zx/%zx nsegs=%u/%lu/%u\n",
- cur_len, max_len, nsegs, ctx->iter.nr_segs, max_segs);
- if (cur_len == 0) {
- rc = -EIO;
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
- rdata = cifs_readdata_alloc(cifs_uncached_readv_complete);
- if (!rdata) {
- add_credits_and_wake_if(server, credits, 0);
- rc = -ENOMEM;
- break;
- }
-
- rdata->server = server;
- rdata->cfile = cifsFileInfo_get(open_file);
- rdata->offset = fpos;
- rdata->bytes = cur_len;
- rdata->pid = pid;
- rdata->credits = credits_on_stack;
- rdata->ctx = ctx;
- kref_get(&ctx->refcount);
-
- rdata->iter = ctx->iter;
- iov_iter_truncate(&rdata->iter, cur_len);
-
- rc = adjust_credits(server, &rdata->credits, rdata->bytes);
-
- if (!rc) {
- if (rdata->cfile->invalidHandle)
- rc = -EAGAIN;
- else
- rc = server->ops->async_readv(rdata);
- }
+ ssize_t rc;
+ struct inode *inode = file_inode(iocb->ki_filp);
- if (rc) {
- add_credits_and_wake_if(server, &rdata->credits, 0);
- kref_put(&rdata->refcount, cifs_readdata_release);
- if (rc == -EAGAIN)
- continue;
- break;
- }
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return netfs_unbuffered_read_iter(iocb, iter);
- list_add_tail(&rdata->list, rdata_list);
- iov_iter_advance(&ctx->iter, cur_len);
- fpos += cur_len;
- len -= cur_len;
- } while (len > 0);
+ rc = cifs_revalidate_mapping(inode);
+ if (rc)
+ return rc;
- return rc;
+ return netfs_file_read_iter(iocb, iter);
}
-static void
-collect_uncached_read_data(struct cifs_aio_ctx *ctx)
+ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct cifs_readdata *rdata, *tmp;
- struct cifs_sb_info *cifs_sb;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+ ssize_t written;
int rc;
- cifs_sb = CIFS_SB(ctx->cfile->dentry->d_sb);
-
- mutex_lock(&ctx->aio_mutex);
-
- if (list_empty(&ctx->list)) {
- mutex_unlock(&ctx->aio_mutex);
- return;
- }
-
- rc = ctx->rc;
- /* the loop below should proceed in the order of increasing offsets */
-again:
- list_for_each_entry_safe(rdata, tmp, &ctx->list, list) {
- if (!rc) {
- if (!try_wait_for_completion(&rdata->done)) {
- mutex_unlock(&ctx->aio_mutex);
- return;
- }
-
- if (rdata->result == -EAGAIN) {
- /* resend call if it's a retryable error */
- struct list_head tmp_list;
- unsigned int got_bytes = rdata->got_bytes;
-
- list_del_init(&rdata->list);
- INIT_LIST_HEAD(&tmp_list);
-
- if (ctx->direct_io) {
- /*
- * Re-use rdata as this is a
- * direct I/O
- */
- rc = cifs_resend_rdata(
- rdata,
- &tmp_list, ctx);
- } else {
- rc = cifs_send_async_read(
- rdata->offset + got_bytes,
- rdata->bytes - got_bytes,
- rdata->cfile, cifs_sb,
- &tmp_list, ctx);
-
- kref_put(&rdata->refcount,
- cifs_readdata_release);
- }
-
- list_splice(&tmp_list, &ctx->list);
-
- goto again;
- } else if (rdata->result)
- rc = rdata->result;
-
- /* if there was a short read -- discard anything left */
- if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
- rc = -ENODATA;
-
- ctx->total_len += rdata->got_bytes;
- }
- list_del_init(&rdata->list);
- kref_put(&rdata->refcount, cifs_readdata_release);
- }
-
- /* mask nodata case */
- if (rc == -ENODATA)
- rc = 0;
-
- ctx->rc = (rc == 0) ? (ssize_t)ctx->total_len : rc;
-
- mutex_unlock(&ctx->aio_mutex);
-
- if (ctx->iocb && ctx->iocb->ki_complete)
- ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
- else
- complete(&ctx->done);
-}
-
-static ssize_t __cifs_readv(
- struct kiocb *iocb, struct iov_iter *to, bool direct)
-{
- size_t len;
- struct file *file = iocb->ki_filp;
- struct cifs_sb_info *cifs_sb;
- struct cifsFileInfo *cfile;
- struct cifs_tcon *tcon;
- ssize_t rc, total_read = 0;
- loff_t offset = iocb->ki_pos;
- struct cifs_aio_ctx *ctx;
-
- len = iov_iter_count(to);
- if (!len)
- return 0;
-
- cifs_sb = CIFS_FILE_SB(file);
- cfile = file->private_data;
- tcon = tlink_tcon(cfile->tlink);
-
- if (!tcon->ses->server->ops->async_readv)
- return -ENOSYS;
-
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- cifs_dbg(FYI, "attempting read on write only file instance\n");
-
- ctx = cifs_aio_ctx_alloc();
- if (!ctx)
- return -ENOMEM;
-
- ctx->pos = offset;
- ctx->direct_io = direct;
- ctx->len = len;
- ctx->cfile = cifsFileInfo_get(cfile);
- ctx->nr_pinned_pages = 0;
-
- if (!is_sync_kiocb(iocb))
- ctx->iocb = iocb;
-
- if (user_backed_iter(to)) {
- /*
- * Extract IOVEC/UBUF-type iterators to a BVEC-type iterator as
- * they contain references to the calling process's virtual
- * memory layout which won't be available in an async worker
- * thread. This also takes a pin on every folio involved.
- */
- rc = netfs_extract_user_iter(to, iov_iter_count(to),
- &ctx->iter, 0);
- if (rc < 0) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return rc;
- }
-
- ctx->nr_pinned_pages = rc;
- ctx->bv = (void *)ctx->iter.bvec;
- ctx->bv_need_unpin = iov_iter_extract_will_pin(to);
- ctx->should_dirty = true;
- } else if ((iov_iter_is_bvec(to) || iov_iter_is_kvec(to)) &&
- !is_sync_kiocb(iocb)) {
- /*
- * If the op is asynchronous, we need to copy the list attached
- * to a BVEC/KVEC-type iterator, but we assume that the storage
- * will be retained by the caller; in any case, we may or may
- * not be able to pin the pages, so we don't try.
- */
- ctx->bv = (void *)dup_iter(&ctx->iter, to, GFP_KERNEL);
- if (!ctx->bv) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return -ENOMEM;
- }
- } else {
- /*
- * Otherwise, we just pass the iterator down as-is and rely on
- * the caller to make sure the pages referred to by the
- * iterator don't evaporate.
- */
- ctx->iter = *to;
- }
-
- if (direct) {
- rc = filemap_write_and_wait_range(file->f_inode->i_mapping,
- offset, offset + len - 1);
- if (rc) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return -EAGAIN;
+ if (iocb->ki_filp->f_flags & O_DIRECT) {
+ written = netfs_unbuffered_write_iter(iocb, from);
+ if (written > 0 && CIFS_CACHE_READ(cinode)) {
+ cifs_zap_mapping(inode);
+ cifs_dbg(FYI,
+ "Set no oplock for inode=%p after a write operation\n",
+ inode);
+ cinode->oplock = 0;
}
+ return written;
}
- /* grab a lock here due to read response handlers can access ctx */
- mutex_lock(&ctx->aio_mutex);
-
- rc = cifs_send_async_read(offset, len, cfile, cifs_sb, &ctx->list, ctx);
-
- /* if at least one read request send succeeded, then reset rc */
- if (!list_empty(&ctx->list))
- rc = 0;
-
- mutex_unlock(&ctx->aio_mutex);
-
- if (rc) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return rc;
- }
-
- if (!is_sync_kiocb(iocb)) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return -EIOCBQUEUED;
- }
-
- rc = wait_for_completion_killable(&ctx->done);
- if (rc) {
- mutex_lock(&ctx->aio_mutex);
- ctx->rc = rc = -EINTR;
- total_read = ctx->total_len;
- mutex_unlock(&ctx->aio_mutex);
- } else {
- rc = ctx->rc;
- total_read = ctx->total_len;
- }
+ written = cifs_get_writer(cinode);
+ if (written)
+ return written;
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ written = netfs_file_write_iter(iocb, from);
- if (total_read) {
- iocb->ki_pos += total_read;
- return total_read;
+ if (!CIFS_CACHE_WRITE(CIFS_I(inode))) {
+ rc = filemap_fdatawrite(inode->i_mapping);
+ if (rc)
+ cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n",
+ rc, inode);
}
- return rc;
-}
-
-ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to)
-{
- return __cifs_readv(iocb, to, true);
-}
-ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
-{
- return __cifs_readv(iocb, to, false);
+ cifs_put_writer(cinode);
+ return written;
}
ssize_t
@@ -4386,12 +2902,15 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
* pos+len-1.
*/
if (!CIFS_CACHE_READ(cinode))
- return cifs_user_readv(iocb, to);
+ return netfs_unbuffered_read_iter(iocb, to);
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
- ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
- return generic_file_read_iter(iocb, to);
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return netfs_unbuffered_read_iter(iocb, to);
+ return netfs_buffered_read_iter(iocb, to);
+ }
/*
* We need to hold the sem to be sure nobody modifies lock list
@@ -4400,126 +2919,19 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
down_read(&cinode->lock_sem);
if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to),
tcon->ses->server->vals->shared_lock_type,
- 0, NULL, CIFS_READ_OP))
- rc = generic_file_read_iter(iocb, to);
+ 0, NULL, CIFS_READ_OP)) {
+ if (iocb->ki_flags & IOCB_DIRECT)
+ rc = netfs_unbuffered_read_iter(iocb, to);
+ else
+ rc = netfs_buffered_read_iter(iocb, to);
+ }
up_read(&cinode->lock_sem);
return rc;
}
-static ssize_t
-cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
-{
- int rc = -EACCES;
- unsigned int bytes_read = 0;
- unsigned int total_read;
- unsigned int current_read_size;
- unsigned int rsize;
- struct cifs_sb_info *cifs_sb;
- struct cifs_tcon *tcon;
- struct TCP_Server_Info *server;
- unsigned int xid;
- char *cur_offset;
- struct cifsFileInfo *open_file;
- struct cifs_io_parms io_parms = {0};
- int buf_type = CIFS_NO_BUFFER;
- __u32 pid;
-
- xid = get_xid();
- cifs_sb = CIFS_FILE_SB(file);
-
- /* FIXME: set up handlers for larger reads and/or convert to async */
- rsize = min_t(unsigned int, cifs_sb->ctx->rsize, CIFSMaxBufSize);
-
- if (file->private_data == NULL) {
- rc = -EBADF;
- free_xid(xid);
- return rc;
- }
- open_file = file->private_data;
- tcon = tlink_tcon(open_file->tlink);
- server = cifs_pick_channel(tcon->ses);
-
- if (!server->ops->sync_read) {
- free_xid(xid);
- return -ENOSYS;
- }
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
- pid = open_file->pid;
- else
- pid = current->tgid;
-
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- cifs_dbg(FYI, "attempting read on write only file instance\n");
-
- for (total_read = 0, cur_offset = read_data; read_size > total_read;
- total_read += bytes_read, cur_offset += bytes_read) {
- do {
- current_read_size = min_t(uint, read_size - total_read,
- rsize);
- /*
- * For windows me and 9x we do not want to request more
- * than it negotiated since it will refuse the read
- * then.
- */
- if (!(tcon->ses->capabilities &
- tcon->ses->server->vals->cap_large_files)) {
- current_read_size = min_t(uint,
- current_read_size, CIFSMaxBufSize);
- }
- if (open_file->invalidHandle) {
- rc = cifs_reopen_file(open_file, true);
- if (rc != 0)
- break;
- }
- io_parms.pid = pid;
- io_parms.tcon = tcon;
- io_parms.offset = *offset;
- io_parms.length = current_read_size;
- io_parms.server = server;
- rc = server->ops->sync_read(xid, &open_file->fid, &io_parms,
- &bytes_read, &cur_offset,
- &buf_type);
- } while (rc == -EAGAIN);
-
- if (rc || (bytes_read == 0)) {
- if (total_read) {
- break;
- } else {
- free_xid(xid);
- return rc;
- }
- } else {
- cifs_stats_bytes_read(tcon, total_read);
- *offset += bytes_read;
- }
- }
- free_xid(xid);
- return total_read;
-}
-
-/*
- * If the page is mmap'ed into a process' page tables, then we need to make
- * sure that it doesn't change while being written back.
- */
static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf)
{
- struct folio *folio = page_folio(vmf->page);
-
- /* Wait for the folio to be written to the cache before we allow it to
- * be modified. We then assume the entire folio will need writing back.
- */
-#ifdef CONFIG_CIFS_FSCACHE
- if (folio_test_fscache(folio) &&
- folio_wait_fscache_killable(folio) < 0)
- return VM_FAULT_RETRY;
-#endif
-
- folio_wait_writeback(folio);
-
- if (folio_lock_killable(folio) < 0)
- return VM_FAULT_RETRY;
- return VM_FAULT_LOCKED;
+ return netfs_page_mkwrite(vmf, NULL);
}
static const struct vm_operations_struct cifs_file_vm_ops = {
@@ -4565,290 +2977,6 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
return rc;
}
-/*
- * Unlock a bunch of folios in the pagecache.
- */
-static void cifs_unlock_folios(struct address_space *mapping, pgoff_t first, pgoff_t last)
-{
- struct folio *folio;
- XA_STATE(xas, &mapping->i_pages, first);
-
- rcu_read_lock();
- xas_for_each(&xas, folio, last) {
- folio_unlock(folio);
- }
- rcu_read_unlock();
-}
-
-static void cifs_readahead_complete(struct work_struct *work)
-{
- struct cifs_readdata *rdata = container_of(work,
- struct cifs_readdata, work);
- struct folio *folio;
- pgoff_t last;
- bool good = rdata->result == 0 || (rdata->result == -EAGAIN && rdata->got_bytes);
-
- XA_STATE(xas, &rdata->mapping->i_pages, rdata->offset / PAGE_SIZE);
-
- if (good)
- cifs_readahead_to_fscache(rdata->mapping->host,
- rdata->offset, rdata->bytes);
-
- if (iov_iter_count(&rdata->iter) > 0)
- iov_iter_zero(iov_iter_count(&rdata->iter), &rdata->iter);
-
- last = (rdata->offset + rdata->bytes - 1) / PAGE_SIZE;
-
- rcu_read_lock();
- xas_for_each(&xas, folio, last) {
- if (good) {
- flush_dcache_folio(folio);
- folio_mark_uptodate(folio);
- }
- folio_unlock(folio);
- }
- rcu_read_unlock();
-
- kref_put(&rdata->refcount, cifs_readdata_release);
-}
-
-static void cifs_readahead(struct readahead_control *ractl)
-{
- struct cifsFileInfo *open_file = ractl->file->private_data;
- struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(ractl->file);
- struct TCP_Server_Info *server;
- unsigned int xid, nr_pages, cache_nr_pages = 0;
- unsigned int ra_pages;
- pgoff_t next_cached = ULONG_MAX, ra_index;
- bool caching = fscache_cookie_enabled(cifs_inode_cookie(ractl->mapping->host)) &&
- cifs_inode_cookie(ractl->mapping->host)->cache_priv;
- bool check_cache = caching;
- pid_t pid;
- int rc = 0;
-
- /* Note that readahead_count() lags behind our dequeuing of pages from
- * the ractl, wo we have to keep track for ourselves.
- */
- ra_pages = readahead_count(ractl);
- ra_index = readahead_index(ractl);
-
- xid = get_xid();
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
- pid = open_file->pid;
- else
- pid = current->tgid;
-
- server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
-
- cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n",
- __func__, ractl->file, ractl->mapping, ra_pages);
-
- /*
- * Chop the readahead request up into rsize-sized read requests.
- */
- while ((nr_pages = ra_pages)) {
- unsigned int i, rsize;
- struct cifs_readdata *rdata;
- struct cifs_credits credits_on_stack;
- struct cifs_credits *credits = &credits_on_stack;
- struct folio *folio;
- pgoff_t fsize;
-
- /*
- * Find out if we have anything cached in the range of
- * interest, and if so, where the next chunk of cached data is.
- */
- if (caching) {
- if (check_cache) {
- rc = cifs_fscache_query_occupancy(
- ractl->mapping->host, ra_index, nr_pages,
- &next_cached, &cache_nr_pages);
- if (rc < 0)
- caching = false;
- check_cache = false;
- }
-
- if (ra_index == next_cached) {
- /*
- * TODO: Send a whole batch of pages to be read
- * by the cache.
- */
- folio = readahead_folio(ractl);
- fsize = folio_nr_pages(folio);
- ra_pages -= fsize;
- ra_index += fsize;
- if (cifs_readpage_from_fscache(ractl->mapping->host,
- &folio->page) < 0) {
- /*
- * TODO: Deal with cache read failure
- * here, but for the moment, delegate
- * that to readpage.
- */
- caching = false;
- }
- folio_unlock(folio);
- next_cached += fsize;
- cache_nr_pages -= fsize;
- if (cache_nr_pages == 0)
- check_cache = true;
- continue;
- }
- }
-
- if (open_file->invalidHandle) {
- rc = cifs_reopen_file(open_file, true);
- if (rc) {
- if (rc == -EAGAIN)
- continue;
- break;
- }
- }
-
- if (cifs_sb->ctx->rsize == 0)
- cifs_sb->ctx->rsize =
- server->ops->negotiate_rsize(tlink_tcon(open_file->tlink),
- cifs_sb->ctx);
-
- rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
- &rsize, credits);
- if (rc)
- break;
- nr_pages = min_t(size_t, rsize / PAGE_SIZE, ra_pages);
- if (next_cached != ULONG_MAX)
- nr_pages = min_t(size_t, nr_pages, next_cached - ra_index);
-
- /*
- * Give up immediately if rsize is too small to read an entire
- * page. The VFS will fall back to readpage. We should never
- * reach this point however since we set ra_pages to 0 when the
- * rsize is smaller than a cache page.
- */
- if (unlikely(!nr_pages)) {
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
- rdata = cifs_readdata_alloc(cifs_readahead_complete);
- if (!rdata) {
- /* best to give up if we're out of mem */
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
- rdata->offset = ra_index * PAGE_SIZE;
- rdata->bytes = nr_pages * PAGE_SIZE;
- rdata->cfile = cifsFileInfo_get(open_file);
- rdata->server = server;
- rdata->mapping = ractl->mapping;
- rdata->pid = pid;
- rdata->credits = credits_on_stack;
-
- for (i = 0; i < nr_pages; i++) {
- if (!readahead_folio(ractl))
- WARN_ON(1);
- }
- ra_pages -= nr_pages;
- ra_index += nr_pages;
-
- iov_iter_xarray(&rdata->iter, ITER_DEST, &rdata->mapping->i_pages,
- rdata->offset, rdata->bytes);
-
- rc = adjust_credits(server, &rdata->credits, rdata->bytes);
- if (!rc) {
- if (rdata->cfile->invalidHandle)
- rc = -EAGAIN;
- else
- rc = server->ops->async_readv(rdata);
- }
-
- if (rc) {
- add_credits_and_wake_if(server, &rdata->credits, 0);
- cifs_unlock_folios(rdata->mapping,
- rdata->offset / PAGE_SIZE,
- (rdata->offset + rdata->bytes - 1) / PAGE_SIZE);
- /* Fallback to the readpage in error/reconnect cases */
- kref_put(&rdata->refcount, cifs_readdata_release);
- break;
- }
-
- kref_put(&rdata->refcount, cifs_readdata_release);
- }
-
- free_xid(xid);
-}
-
-/*
- * cifs_readpage_worker must be called with the page pinned
- */
-static int cifs_readpage_worker(struct file *file, struct page *page,
- loff_t *poffset)
-{
- struct inode *inode = file_inode(file);
- struct timespec64 atime, mtime;
- char *read_data;
- int rc;
-
- /* Is the page cached? */
- rc = cifs_readpage_from_fscache(inode, page);
- if (rc == 0)
- goto read_complete;
-
- read_data = kmap(page);
- /* for reads over a certain size could initiate async read ahead */
-
- rc = cifs_read(file, read_data, PAGE_SIZE, poffset);
-
- if (rc < 0)
- goto io_error;
- else
- cifs_dbg(FYI, "Bytes read %d\n", rc);
-
- /* we do not want atime to be less than mtime, it broke some apps */
- atime = inode_set_atime_to_ts(inode, current_time(inode));
- mtime = inode_get_mtime(inode);
- if (timespec64_compare(&atime, &mtime) < 0)
- inode_set_atime_to_ts(inode, inode_get_mtime(inode));
-
- if (PAGE_SIZE > rc)
- memset(read_data + rc, 0, PAGE_SIZE - rc);
-
- flush_dcache_page(page);
- SetPageUptodate(page);
- rc = 0;
-
-io_error:
- kunmap(page);
-
-read_complete:
- unlock_page(page);
- return rc;
-}
-
-static int cifs_read_folio(struct file *file, struct folio *folio)
-{
- struct page *page = &folio->page;
- loff_t offset = page_file_offset(page);
- int rc = -EACCES;
- unsigned int xid;
-
- xid = get_xid();
-
- if (file->private_data == NULL) {
- rc = -EBADF;
- free_xid(xid);
- return rc;
- }
-
- cifs_dbg(FYI, "read_folio %p at offset %d 0x%x\n",
- page, (int)offset, (int)offset);
-
- rc = cifs_readpage_worker(file, page, &offset);
-
- free_xid(xid);
- return rc;
-}
-
static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
{
struct cifsFileInfo *open_file;
@@ -4896,123 +3024,6 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file,
return true;
}
-static int cifs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
-{
- int oncethru = 0;
- pgoff_t index = pos >> PAGE_SHIFT;
- loff_t offset = pos & (PAGE_SIZE - 1);
- loff_t page_start = pos & PAGE_MASK;
- loff_t i_size;
- struct page *page;
- int rc = 0;
-
- cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len);
-
-start:
- page = grab_cache_page_write_begin(mapping, index);
- if (!page) {
- rc = -ENOMEM;
- goto out;
- }
-
- if (PageUptodate(page))
- goto out;
-
- /*
- * If we write a full page it will be up to date, no need to read from
- * the server. If the write is short, we'll end up doing a sync write
- * instead.
- */
- if (len == PAGE_SIZE)
- goto out;
-
- /*
- * optimize away the read when we have an oplock, and we're not
- * expecting to use any of the data we'd be reading in. That
- * is, when the page lies beyond the EOF, or straddles the EOF
- * and the write will cover all of the existing data.
- */
- if (CIFS_CACHE_READ(CIFS_I(mapping->host))) {
- i_size = i_size_read(mapping->host);
- if (page_start >= i_size ||
- (offset == 0 && (pos + len) >= i_size)) {
- zero_user_segments(page, 0, offset,
- offset + len,
- PAGE_SIZE);
- /*
- * PageChecked means that the parts of the page
- * to which we're not writing are considered up
- * to date. Once the data is copied to the
- * page, it can be set uptodate.
- */
- SetPageChecked(page);
- goto out;
- }
- }
-
- if ((file->f_flags & O_ACCMODE) != O_WRONLY && !oncethru) {
- /*
- * might as well read a page, it is fast enough. If we get
- * an error, we don't need to return it. cifs_write_end will
- * do a sync write instead since PG_uptodate isn't set.
- */
- cifs_readpage_worker(file, page, &page_start);
- put_page(page);
- oncethru = 1;
- goto start;
- } else {
- /* we could try using another file handle if there is one -
- but how would we lock it to prevent close of that handle
- racing with this read? In any case
- this will be written out by write_end so is fine */
- }
-out:
- *pagep = page;
- return rc;
-}
-
-static bool cifs_release_folio(struct folio *folio, gfp_t gfp)
-{
- if (folio_test_private(folio))
- return 0;
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
- fscache_note_page_release(cifs_inode_cookie(folio->mapping->host));
- return true;
-}
-
-static void cifs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length)
-{
- folio_wait_fscache(folio);
-}
-
-static int cifs_launder_folio(struct folio *folio)
-{
- int rc = 0;
- loff_t range_start = folio_pos(folio);
- loff_t range_end = range_start + folio_size(folio);
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0,
- .range_start = range_start,
- .range_end = range_end,
- };
-
- cifs_dbg(FYI, "Launder page: %lu\n", folio->index);
-
- if (folio_clear_dirty_for_io(folio))
- rc = cifs_writepage_locked(&folio->page, &wbc);
-
- folio_wait_fscache(folio);
- return rc;
-}
-
void cifs_oplock_break(struct work_struct *work)
{
struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
@@ -5102,25 +3113,6 @@ out:
cifs_done_oplock_break(cinode);
}
-/*
- * The presence of cifs_direct_io() in the address space ops vector
- * allowes open() O_DIRECT flags which would have failed otherwise.
- *
- * In the non-cached mode (mount with cache=none), we shunt off direct read and write requests
- * so this method should never be called.
- *
- * Direct IO is not yet supported in the cached mode.
- */
-static ssize_t
-cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter)
-{
- /*
- * FIXME
- * Eventually need to support direct IO for non forcedirectio mounts
- */
- return -EINVAL;
-}
-
static int cifs_swap_activate(struct swap_info_struct *sis,
struct file *swap_file, sector_t *span)
{
@@ -5182,22 +3174,19 @@ static void cifs_swap_deactivate(struct file *file)
}
const struct address_space_operations cifs_addr_ops = {
- .read_folio = cifs_read_folio,
- .readahead = cifs_readahead,
- .writepages = cifs_writepages,
- .write_begin = cifs_write_begin,
- .write_end = cifs_write_end,
- .dirty_folio = netfs_dirty_folio,
- .release_folio = cifs_release_folio,
- .direct_IO = cifs_direct_io,
- .invalidate_folio = cifs_invalidate_folio,
- .launder_folio = cifs_launder_folio,
- .migrate_folio = filemap_migrate_folio,
+ .read_folio = netfs_read_folio,
+ .readahead = netfs_readahead,
+ .writepages = netfs_writepages,
+ .dirty_folio = netfs_dirty_folio,
+ .release_folio = netfs_release_folio,
+ .direct_IO = noop_direct_IO,
+ .invalidate_folio = netfs_invalidate_folio,
+ .migrate_folio = filemap_migrate_folio,
/*
* TODO: investigate and if useful we could add an is_dirty_writeback
* helper if needed
*/
- .swap_activate = cifs_swap_activate,
+ .swap_activate = cifs_swap_activate,
.swap_deactivate = cifs_swap_deactivate,
};
@@ -5207,13 +3196,10 @@ const struct address_space_operations cifs_addr_ops = {
* to leave cifs_readahead out of the address space operations.
*/
const struct address_space_operations cifs_addr_ops_smallbuf = {
- .read_folio = cifs_read_folio,
- .writepages = cifs_writepages,
- .write_begin = cifs_write_begin,
- .write_end = cifs_write_end,
- .dirty_folio = netfs_dirty_folio,
- .release_folio = cifs_release_folio,
- .invalidate_folio = cifs_invalidate_folio,
- .launder_folio = cifs_launder_folio,
- .migrate_folio = filemap_migrate_folio,
+ .read_folio = netfs_read_folio,
+ .writepages = netfs_writepages,
+ .dirty_folio = netfs_dirty_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
+ .migrate_folio = filemap_migrate_folio,
};
diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c
index 1a895e6243ee..01424a5cdb99 100644
--- a/fs/smb/client/fscache.c
+++ b/fs/smb/client/fscache.c
@@ -170,112 +170,3 @@ void cifs_fscache_release_inode_cookie(struct inode *inode)
cifsi->netfs.cache = NULL;
}
}
-
-/*
- * Fallback page reading interface.
- */
-static int fscache_fallback_read_page(struct inode *inode, struct page *page)
-{
- struct netfs_cache_resources cres;
- struct fscache_cookie *cookie = cifs_inode_cookie(inode);
- struct iov_iter iter;
- struct bio_vec bvec;
- int ret;
-
- memset(&cres, 0, sizeof(cres));
- bvec_set_page(&bvec, page, PAGE_SIZE, 0);
- iov_iter_bvec(&iter, ITER_DEST, &bvec, 1, PAGE_SIZE);
-
- ret = fscache_begin_read_operation(&cres, cookie);
- if (ret < 0)
- return ret;
-
- ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL,
- NULL, NULL);
- fscache_end_operation(&cres);
- return ret;
-}
-
-/*
- * Fallback page writing interface.
- */
-static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_t len,
- bool no_space_allocated_yet)
-{
- struct netfs_cache_resources cres;
- struct fscache_cookie *cookie = cifs_inode_cookie(inode);
- struct iov_iter iter;
- int ret;
-
- memset(&cres, 0, sizeof(cres));
- iov_iter_xarray(&iter, ITER_SOURCE, &inode->i_mapping->i_pages, start, len);
-
- ret = fscache_begin_write_operation(&cres, cookie);
- if (ret < 0)
- return ret;
-
- ret = cres.ops->prepare_write(&cres, &start, &len, len, i_size_read(inode),
- no_space_allocated_yet);
- if (ret == 0)
- ret = fscache_write(&cres, start, &iter, NULL, NULL);
- fscache_end_operation(&cres);
- return ret;
-}
-
-/*
- * Retrieve a page from FS-Cache
- */
-int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
-{
- int ret;
-
- cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n",
- __func__, cifs_inode_cookie(inode), page, inode);
-
- ret = fscache_fallback_read_page(inode, page);
- if (ret < 0)
- return ret;
-
- /* Read completed synchronously */
- SetPageUptodate(page);
- return 0;
-}
-
-void __cifs_readahead_to_fscache(struct inode *inode, loff_t pos, size_t len)
-{
- cifs_dbg(FYI, "%s: (fsc: %p, p: %llx, l: %zx, i: %p)\n",
- __func__, cifs_inode_cookie(inode), pos, len, inode);
-
- fscache_fallback_write_pages(inode, pos, len, true);
-}
-
-/*
- * Query the cache occupancy.
- */
-int __cifs_fscache_query_occupancy(struct inode *inode,
- pgoff_t first, unsigned int nr_pages,
- pgoff_t *_data_first,
- unsigned int *_data_nr_pages)
-{
- struct netfs_cache_resources cres;
- struct fscache_cookie *cookie = cifs_inode_cookie(inode);
- loff_t start, data_start;
- size_t len, data_len;
- int ret;
-
- ret = fscache_begin_read_operation(&cres, cookie);
- if (ret < 0)
- return ret;
-
- start = first * PAGE_SIZE;
- len = nr_pages * PAGE_SIZE;
- ret = cres.ops->query_occupancy(&cres, start, len, PAGE_SIZE,
- &data_start, &data_len);
- if (ret == 0) {
- *_data_first = data_start / PAGE_SIZE;
- *_data_nr_pages = len / PAGE_SIZE;
- }
-
- fscache_end_operation(&cres);
- return ret;
-}
diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h
index 1f2ea9f5cc9a..f06cb24f5f3c 100644
--- a/fs/smb/client/fscache.h
+++ b/fs/smb/client/fscache.h
@@ -74,41 +74,6 @@ static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags
i_size_read(inode), flags);
}
-extern int __cifs_fscache_query_occupancy(struct inode *inode,
- pgoff_t first, unsigned int nr_pages,
- pgoff_t *_data_first,
- unsigned int *_data_nr_pages);
-
-static inline int cifs_fscache_query_occupancy(struct inode *inode,
- pgoff_t first, unsigned int nr_pages,
- pgoff_t *_data_first,
- unsigned int *_data_nr_pages)
-{
- if (!cifs_inode_cookie(inode))
- return -ENOBUFS;
- return __cifs_fscache_query_occupancy(inode, first, nr_pages,
- _data_first, _data_nr_pages);
-}
-
-extern int __cifs_readpage_from_fscache(struct inode *pinode, struct page *ppage);
-extern void __cifs_readahead_to_fscache(struct inode *pinode, loff_t pos, size_t len);
-
-
-static inline int cifs_readpage_from_fscache(struct inode *inode,
- struct page *page)
-{
- if (cifs_inode_cookie(inode))
- return __cifs_readpage_from_fscache(inode, page);
- return -ENOBUFS;
-}
-
-static inline void cifs_readahead_to_fscache(struct inode *inode,
- loff_t pos, size_t len)
-{
- if (cifs_inode_cookie(inode))
- __cifs_readahead_to_fscache(inode, pos, len);
-}
-
static inline bool cifs_fscache_enabled(struct inode *inode)
{
return fscache_cookie_enabled(cifs_inode_cookie(inode));
@@ -131,25 +96,6 @@ static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { re
static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) {}
static inline bool cifs_fscache_enabled(struct inode *inode) { return false; }
-static inline int cifs_fscache_query_occupancy(struct inode *inode,
- pgoff_t first, unsigned int nr_pages,
- pgoff_t *_data_first,
- unsigned int *_data_nr_pages)
-{
- *_data_first = ULONG_MAX;
- *_data_nr_pages = 0;
- return -ENOBUFS;
-}
-
-static inline int
-cifs_readpage_from_fscache(struct inode *inode, struct page *page)
-{
- return -ENOBUFS;
-}
-
-static inline
-void cifs_readahead_to_fscache(struct inode *inode, loff_t pos, size_t len) {}
-
#endif /* CONFIG_CIFS_FSCACHE */
#endif /* _CIFS_FSCACHE_H */
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 60afab5c83d4..e8bfeea23660 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -28,14 +28,29 @@
#include "cached_dir.h"
#include "reparse.h"
+/*
+ * Set parameters for the netfs library
+ */
+static void cifs_set_netfs_context(struct inode *inode)
+{
+ struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
+ netfs_inode_init(&cifs_i->netfs, &cifs_req_ops, true);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
+ __set_bit(NETFS_ICTX_WRITETHROUGH, &cifs_i->netfs.flags);
+}
+
static void cifs_set_ops(struct inode *inode)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct netfs_inode *ictx = netfs_inode(inode);
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &cifs_file_inode_ops;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+ set_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
inode->i_fop = &cifs_file_direct_nobrl_ops;
else
@@ -57,6 +72,7 @@ static void cifs_set_ops(struct inode *inode)
inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
else
inode->i_data.a_ops = &cifs_addr_ops;
+ mapping_set_large_folios(inode->i_mapping);
break;
case S_IFDIR:
if (IS_AUTOMOUNT(inode)) {
@@ -221,8 +237,10 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
if (fattr->cf_flags & CIFS_FATTR_JUNCTION)
inode->i_flags |= S_AUTOMOUNT;
- if (inode->i_state & I_NEW)
+ if (inode->i_state & I_NEW) {
+ cifs_set_netfs_context(inode);
cifs_set_ops(inode);
+ }
return 0;
}
@@ -2431,24 +2449,6 @@ cifs_dentry_needs_reval(struct dentry *dentry)
return false;
}
-/*
- * Zap the cache. Called when invalid_mapping flag is set.
- */
-int
-cifs_invalidate_mapping(struct inode *inode)
-{
- int rc = 0;
-
- if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
- rc = invalidate_inode_pages2(inode->i_mapping);
- if (rc)
- cifs_dbg(VFS, "%s: invalidate inode %p failed with rc %d\n",
- __func__, inode, rc);
- }
-
- return rc;
-}
-
/**
* cifs_wait_bit_killable - helper for functions that are sleeping on bit locks
*
@@ -2485,9 +2485,12 @@ cifs_revalidate_mapping(struct inode *inode)
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)
goto skip_invalidate;
- rc = cifs_invalidate_mapping(inode);
- if (rc)
+ rc = filemap_invalidate_inode(inode, true, 0, LLONG_MAX);
+ if (rc) {
+ cifs_dbg(VFS, "%s: invalidate inode %p failed with rc %d\n",
+ __func__, inode, rc);
set_bit(CIFS_INO_INVALID_MAPPING, flags);
+ }
}
skip_invalidate:
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 28f0b7d19d53..ef18cd30f66c 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -217,8 +217,8 @@ smb2_get_credits(struct mid_q_entry *mid)
}
static int
-smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
- unsigned int *num, struct cifs_credits *credits)
+smb2_wait_mtu_credits(struct TCP_Server_Info *server, size_t size,
+ size_t *num, struct cifs_credits *credits)
{
int rc = 0;
unsigned int scredits, in_flight;
@@ -4490,7 +4490,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
unsigned int cur_off;
unsigned int cur_page_idx;
unsigned int pad_len;
- struct cifs_readdata *rdata = mid->callback_data;
+ struct cifs_io_subrequest *rdata = mid->callback_data;
struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
int length;
bool use_rdma_mr = false;
@@ -4592,7 +4592,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
/* Copy the data to the output I/O iterator. */
rdata->result = cifs_copy_pages_to_iter(pages, pages_len,
- cur_off, &rdata->iter);
+ cur_off, &rdata->subreq.io_iter);
if (rdata->result != 0) {
if (is_offloaded)
mid->mid_state = MID_RESPONSE_MALFORMED;
@@ -4606,7 +4606,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
/* read response payload is in buf */
WARN_ONCE(pages && !xa_empty(pages),
"read data can be either in buf or in pages");
- length = copy_to_iter(buf + data_offset, data_len, &rdata->iter);
+ length = copy_to_iter(buf + data_offset, data_len, &rdata->subreq.io_iter);
if (length < 0)
return length;
rdata->got_bytes = data_len;
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index a5efce03cb58..993ac36c3d58 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -23,6 +23,8 @@
#include <linux/uuid.h>
#include <linux/pagemap.h>
#include <linux/xattr.h>
+#include <linux/netfs.h>
+#include <trace/events/netfs.h>
#include "cifsglob.h"
#include "cifsacl.h"
#include "cifsproto.h"
@@ -4391,7 +4393,7 @@ static inline bool smb3_use_rdma_offload(struct cifs_io_parms *io_parms)
*/
static int
smb2_new_read_req(void **buf, unsigned int *total_len,
- struct cifs_io_parms *io_parms, struct cifs_readdata *rdata,
+ struct cifs_io_parms *io_parms, struct cifs_io_subrequest *rdata,
unsigned int remaining_bytes, int request_type)
{
int rc = -EACCES;
@@ -4419,10 +4421,12 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
req->Length = cpu_to_le32(io_parms->length);
req->Offset = cpu_to_le64(io_parms->offset);
- trace_smb3_read_enter(0 /* xid */,
- io_parms->persistent_fid,
- io_parms->tcon->tid, io_parms->tcon->ses->Suid,
- io_parms->offset, io_parms->length);
+ trace_smb3_read_enter(rdata ? rdata->rreq->debug_id : 0,
+ rdata ? rdata->subreq.debug_index : 0,
+ rdata ? rdata->xid : 0,
+ io_parms->persistent_fid,
+ io_parms->tcon->tid, io_parms->tcon->ses->Suid,
+ io_parms->offset, io_parms->length);
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a RDMA write, fill in and append
@@ -4432,7 +4436,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
struct smbd_buffer_descriptor_v1 *v1;
bool need_invalidate = server->dialect == SMB30_PROT_ID;
- rdata->mr = smbd_register_mr(server->smbd_conn, &rdata->iter,
+ rdata->mr = smbd_register_mr(server->smbd_conn, &rdata->subreq.io_iter,
true, need_invalidate);
if (!rdata->mr)
return -EAGAIN;
@@ -4483,8 +4487,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
static void
smb2_readv_callback(struct mid_q_entry *mid)
{
- struct cifs_readdata *rdata = mid->callback_data;
- struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+ struct cifs_io_subrequest *rdata = mid->callback_data;
+ struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink);
struct TCP_Server_Info *server = rdata->server;
struct smb2_hdr *shdr =
(struct smb2_hdr *)rdata->iov[0].iov_base;
@@ -4492,17 +4496,17 @@ smb2_readv_callback(struct mid_q_entry *mid)
struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], .rq_nvec = 1 };
if (rdata->got_bytes) {
- rqst.rq_iter = rdata->iter;
- rqst.rq_iter_size = iov_iter_count(&rdata->iter);
+ rqst.rq_iter = rdata->subreq.io_iter;
+ rqst.rq_iter_size = iov_iter_count(&rdata->subreq.io_iter);
}
WARN_ONCE(rdata->server != mid->server,
"rdata server %p != mid server %p",
rdata->server, mid->server);
- cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
+ cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n",
__func__, mid->mid, mid->mid_state, rdata->result,
- rdata->bytes);
+ rdata->subreq.len);
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
@@ -4512,7 +4516,6 @@ smb2_readv_callback(struct mid_q_entry *mid)
if (server->sign && !mid->decrypted) {
int rc;
- iov_iter_revert(&rqst.rq_iter, rdata->got_bytes);
iov_iter_truncate(&rqst.rq_iter, rdata->got_bytes);
rc = smb2_verify_signature(&rqst, server);
if (rc)
@@ -4553,24 +4556,40 @@ smb2_readv_callback(struct mid_q_entry *mid)
#endif
if (rdata->result && rdata->result != -ENODATA) {
cifs_stats_fail_inc(tcon, SMB2_READ_HE);
- trace_smb3_read_err(0 /* xid */,
- rdata->cfile->fid.persistent_fid,
- tcon->tid, tcon->ses->Suid, rdata->offset,
- rdata->bytes, rdata->result);
+ trace_smb3_read_err(rdata->rreq->debug_id,
+ rdata->subreq.debug_index,
+ rdata->xid,
+ rdata->req->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, rdata->subreq.start,
+ rdata->subreq.len, rdata->result);
} else
- trace_smb3_read_done(0 /* xid */,
- rdata->cfile->fid.persistent_fid,
+ trace_smb3_read_done(rdata->rreq->debug_id,
+ rdata->subreq.debug_index,
+ rdata->xid,
+ rdata->req->cfile->fid.persistent_fid,
tcon->tid, tcon->ses->Suid,
- rdata->offset, rdata->got_bytes);
+ rdata->subreq.start, rdata->got_bytes);
- queue_work(cifsiod_wq, &rdata->work);
+ if (rdata->result == -ENODATA) {
+ /* We may have got an EOF error because fallocate
+ * failed to enlarge the file.
+ */
+ if (rdata->subreq.start < rdata->subreq.rreq->i_size)
+ rdata->result = 0;
+ }
+ if (rdata->result == 0 || rdata->result == -EAGAIN)
+ iov_iter_advance(&rdata->subreq.io_iter, rdata->got_bytes);
+ rdata->credits.value = 0;
+ netfs_subreq_terminated(&rdata->subreq,
+ (rdata->result == 0 || rdata->result == -EAGAIN) ?
+ rdata->got_bytes : rdata->result, true);
release_mid(mid);
add_credits(server, &credits, 0);
}
/* smb2_async_readv - send an async read, and set up mid to handle result */
int
-smb2_async_readv(struct cifs_readdata *rdata)
+smb2_async_readv(struct cifs_io_subrequest *rdata)
{
int rc, flags = 0;
char *buf;
@@ -4579,22 +4598,22 @@ smb2_async_readv(struct cifs_readdata *rdata)
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 1 };
struct TCP_Server_Info *server;
- struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+ struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink);
unsigned int total_len;
int credit_request;
- cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
- __func__, rdata->offset, rdata->bytes);
+ cifs_dbg(FYI, "%s: offset=%llu bytes=%zu\n",
+ __func__, rdata->subreq.start, rdata->subreq.len);
if (!rdata->server)
rdata->server = cifs_pick_channel(tcon->ses);
- io_parms.tcon = tlink_tcon(rdata->cfile->tlink);
+ io_parms.tcon = tlink_tcon(rdata->req->cfile->tlink);
io_parms.server = server = rdata->server;
- io_parms.offset = rdata->offset;
- io_parms.length = rdata->bytes;
- io_parms.persistent_fid = rdata->cfile->fid.persistent_fid;
- io_parms.volatile_fid = rdata->cfile->fid.volatile_fid;
+ io_parms.offset = rdata->subreq.start;
+ io_parms.length = rdata->subreq.len;
+ io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid;
+ io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid;
io_parms.pid = rdata->pid;
rc = smb2_new_read_req(
@@ -4611,7 +4630,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
shdr = (struct smb2_hdr *)buf;
if (rdata->credits.value > 0) {
- shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
+ shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->subreq.len,
SMB2_MAX_BUFFER_SIZE));
credit_request = le16_to_cpu(shdr->CreditCharge) + 8;
if (server->credits >= server->max_credits)
@@ -4621,22 +4640,22 @@ smb2_async_readv(struct cifs_readdata *rdata)
min_t(int, server->max_credits -
server->credits, credit_request));
- rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+ rc = adjust_credits(server, &rdata->credits, rdata->subreq.len);
if (rc)
goto async_readv_out;
flags |= CIFS_HAS_CREDITS;
}
- kref_get(&rdata->refcount);
rc = cifs_call_async(server, &rqst,
cifs_readv_receive, smb2_readv_callback,
smb3_handle_read_data, rdata, flags,
&rdata->credits);
if (rc) {
- kref_put(&rdata->refcount, cifs_readdata_release);
cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
- trace_smb3_read_err(0 /* xid */, io_parms.persistent_fid,
+ trace_smb3_read_err(rdata->rreq->debug_id,
+ rdata->subreq.debug_index,
+ rdata->xid, io_parms.persistent_fid,
io_parms.tcon->tid,
io_parms.tcon->ses->Suid,
io_parms.offset, io_parms.length, rc);
@@ -4687,22 +4706,23 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
if (rc != -ENODATA) {
cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
cifs_dbg(VFS, "Send error in read = %d\n", rc);
- trace_smb3_read_err(xid,
+ trace_smb3_read_err(0, 0, xid,
req->PersistentFileId,
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length,
rc);
} else
- trace_smb3_read_done(xid, req->PersistentFileId, io_parms->tcon->tid,
+ trace_smb3_read_done(0, 0, xid,
+ req->PersistentFileId, io_parms->tcon->tid,
ses->Suid, io_parms->offset, 0);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
cifs_small_buf_release(req);
return rc == -ENODATA ? 0 : rc;
} else
- trace_smb3_read_done(xid,
- req->PersistentFileId,
- io_parms->tcon->tid, ses->Suid,
- io_parms->offset, io_parms->length);
+ trace_smb3_read_done(0, 0, xid,
+ req->PersistentFileId,
+ io_parms->tcon->tid, ses->Suid,
+ io_parms->offset, io_parms->length);
cifs_small_buf_release(req);
@@ -4735,12 +4755,13 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
static void
smb2_writev_callback(struct mid_q_entry *mid)
{
- struct cifs_writedata *wdata = mid->callback_data;
- struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct cifs_io_subrequest *wdata = mid->callback_data;
+ struct cifs_tcon *tcon = tlink_tcon(wdata->req->cfile->tlink);
struct TCP_Server_Info *server = wdata->server;
- unsigned int written;
struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
struct cifs_credits credits = { .value = 0, .instance = 0 };
+ ssize_t result = 0;
+ size_t written;
WARN_ONCE(wdata->server != mid->server,
"wdata server %p != mid server %p",
@@ -4750,8 +4771,8 @@ smb2_writev_callback(struct mid_q_entry *mid)
case MID_RESPONSE_RECEIVED:
credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
- wdata->result = smb2_check_receive(mid, server, 0);
- if (wdata->result != 0)
+ result = smb2_check_receive(mid, server, 0);
+ if (result != 0)
break;
written = le32_to_cpu(rsp->DataLength);
@@ -4761,24 +4782,25 @@ smb2_writev_callback(struct mid_q_entry *mid)
* client. OS/2 servers are known to set incorrect
* CountHigh values.
*/
- if (written > wdata->bytes)
+ if (written > wdata->subreq.len)
written &= 0xFFFF;
- if (written < wdata->bytes)
+ if (written < wdata->subreq.len)
wdata->result = -ENOSPC;
else
- wdata->bytes = written;
+ wdata->subreq.len = written;
+ iov_iter_advance(&wdata->subreq.io_iter, written);
break;
case MID_REQUEST_SUBMITTED:
case MID_RETRY_NEEDED:
- wdata->result = -EAGAIN;
+ result = -EAGAIN;
break;
case MID_RESPONSE_MALFORMED:
credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
fallthrough;
default:
- wdata->result = -EIO;
+ result = -EIO;
break;
}
#ifdef CONFIG_CIFS_SMB_DIRECT
@@ -4794,44 +4816,44 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->mr = NULL;
}
#endif
- if (wdata->result) {
+ if (result) {
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
- trace_smb3_write_err(0 /* no xid */,
- wdata->cfile->fid.persistent_fid,
- tcon->tid, tcon->ses->Suid, wdata->offset,
- wdata->bytes, wdata->result);
+ trace_smb3_write_err(wdata->xid,
+ wdata->req->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, wdata->subreq.start,
+ wdata->subreq.len, wdata->result);
if (wdata->result == -ENOSPC)
pr_warn_once("Out of space writing to %s\n",
tcon->tree_name);
} else
trace_smb3_write_done(0 /* no xid */,
- wdata->cfile->fid.persistent_fid,
+ wdata->req->cfile->fid.persistent_fid,
tcon->tid, tcon->ses->Suid,
- wdata->offset, wdata->bytes);
+ wdata->subreq.start, wdata->subreq.len);
- queue_work(cifsiod_wq, &wdata->work);
+ wdata->credits.value = 0;
+ cifs_write_subrequest_terminated(wdata, result ?: written, true);
release_mid(mid);
add_credits(server, &credits, 0);
}
/* smb2_async_writev - send an async write, and set up mid to handle result */
-int
-smb2_async_writev(struct cifs_writedata *wdata,
- void (*release)(struct kref *kref))
+void
+smb2_async_writev(struct cifs_io_subrequest *wdata)
{
int rc = -EACCES, flags = 0;
struct smb2_write_req *req = NULL;
struct smb2_hdr *shdr;
- struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct cifs_tcon *tcon = tlink_tcon(wdata->req->cfile->tlink);
struct TCP_Server_Info *server = wdata->server;
struct kvec iov[1];
struct smb_rqst rqst = { };
- unsigned int total_len;
+ unsigned int total_len, xid = wdata->xid;
struct cifs_io_parms _io_parms;
struct cifs_io_parms *io_parms = NULL;
int credit_request;
- if (!wdata->server || wdata->replay)
+ if (!wdata->server || test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags))
server = wdata->server = cifs_pick_channel(tcon->ses);
/*
@@ -4841,10 +4863,10 @@ smb2_async_writev(struct cifs_writedata *wdata,
_io_parms = (struct cifs_io_parms) {
.tcon = tcon,
.server = server,
- .offset = wdata->offset,
- .length = wdata->bytes,
- .persistent_fid = wdata->cfile->fid.persistent_fid,
- .volatile_fid = wdata->cfile->fid.volatile_fid,
+ .offset = wdata->subreq.start,
+ .length = wdata->subreq.len,
+ .persistent_fid = wdata->req->cfile->fid.persistent_fid,
+ .volatile_fid = wdata->req->cfile->fid.volatile_fid,
.pid = wdata->pid,
};
io_parms = &_io_parms;
@@ -4852,7 +4874,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
rc = smb2_plain_req_init(SMB2_WRITE, tcon, server,
(void **) &req, &total_len);
if (rc)
- return rc;
+ goto out;
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -4870,7 +4892,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
- trace_smb3_write_enter(0 /* xid */,
+ trace_smb3_write_enter(wdata->xid,
io_parms->persistent_fid,
io_parms->tcon->tid,
io_parms->tcon->ses->Suid,
@@ -4884,10 +4906,10 @@ smb2_async_writev(struct cifs_writedata *wdata,
*/
if (smb3_use_rdma_offload(io_parms)) {
struct smbd_buffer_descriptor_v1 *v1;
- size_t data_size = iov_iter_count(&wdata->iter);
+ size_t data_size = iov_iter_count(&wdata->subreq.io_iter);
bool need_invalidate = server->dialect == SMB30_PROT_ID;
- wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->iter,
+ wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->subreq.io_iter,
false, need_invalidate);
if (!wdata->mr) {
rc = -EAGAIN;
@@ -4914,9 +4936,9 @@ smb2_async_writev(struct cifs_writedata *wdata,
rqst.rq_iov = iov;
rqst.rq_nvec = 1;
- rqst.rq_iter = wdata->iter;
+ rqst.rq_iter = wdata->subreq.io_iter;
rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter);
- if (wdata->replay)
+ if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags))
smb2_set_replay(server, &rqst);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (wdata->mr)
@@ -4934,7 +4956,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
#endif
if (wdata->credits.value > 0) {
- shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
+ shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->subreq.len,
SMB2_MAX_BUFFER_SIZE));
credit_request = le16_to_cpu(shdr->CreditCharge) + 8;
if (server->credits >= server->max_credits)
@@ -4951,25 +4973,27 @@ smb2_async_writev(struct cifs_writedata *wdata,
flags |= CIFS_HAS_CREDITS;
}
- kref_get(&wdata->refcount);
rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, NULL,
wdata, flags, &wdata->credits);
-
+ /* Can't touch wdata if rc == 0 */
if (rc) {
- trace_smb3_write_err(0 /* no xid */,
+ trace_smb3_write_err(xid,
io_parms->persistent_fid,
io_parms->tcon->tid,
io_parms->tcon->ses->Suid,
io_parms->offset,
io_parms->length,
rc);
- kref_put(&wdata->refcount, release);
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
}
async_writev_out:
cifs_small_buf_release(req);
- return rc;
+out:
+ if (rc) {
+ add_credits_and_wake_if(wdata->server, &wdata->credits, 0);
+ cifs_write_subrequest_terminated(wdata, rc, true);
+ }
}
/*
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 732169d8a67a..b208232b12a2 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -210,11 +210,10 @@ extern int SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon,
extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
__le64 *uniqueid);
-extern int smb2_async_readv(struct cifs_readdata *rdata);
+extern int smb2_async_readv(struct cifs_io_subrequest *rdata);
extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
unsigned int *nbytes, char **buf, int *buf_type);
-extern int smb2_async_writev(struct cifs_writedata *wdata,
- void (*release)(struct kref *kref));
+extern void smb2_async_writev(struct cifs_io_subrequest *wdata);
extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
unsigned int *nbytes, struct kvec *iov, int n_vec);
extern int SMB2_echo(struct TCP_Server_Info *server);
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 604e52876cd2..af97389e983e 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -85,6 +85,62 @@ smb3_tcon_ref_traces;
/* For logging errors in read or write */
DECLARE_EVENT_CLASS(smb3_rw_err_class,
+ TP_PROTO(unsigned int rreq_debug_id,
+ unsigned int rreq_debug_index,
+ unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u64 offset,
+ __u32 len,
+ int rc),
+ TP_ARGS(rreq_debug_id, rreq_debug_index,
+ xid, fid, tid, sesid, offset, len, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, rreq_debug_id)
+ __field(unsigned int, rreq_debug_index)
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, offset)
+ __field(__u32, len)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->rreq_debug_id = rreq_debug_id;
+ __entry->rreq_debug_index = rreq_debug_index;
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->offset = offset;
+ __entry->len = len;
+ __entry->rc = rc;
+ ),
+ TP_printk("\tR=%08x[%x] xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d",
+ __entry->rreq_debug_id, __entry->rreq_debug_index,
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->offset, __entry->len, __entry->rc)
+)
+
+#define DEFINE_SMB3_RW_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
+ TP_PROTO(unsigned int rreq_debug_id, \
+ unsigned int rreq_debug_index, \
+ unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 offset, \
+ __u32 len, \
+ int rc), \
+ TP_ARGS(rreq_debug_id, rreq_debug_index, xid, fid, tid, sesid, offset, len, rc))
+
+DEFINE_SMB3_RW_ERR_EVENT(read_err);
+
+/* For logging errors in other file I/O ops */
+DECLARE_EVENT_CLASS(smb3_other_err_class,
TP_PROTO(unsigned int xid,
__u64 fid,
__u32 tid,
@@ -116,8 +172,8 @@ DECLARE_EVENT_CLASS(smb3_rw_err_class,
__entry->offset, __entry->len, __entry->rc)
)
-#define DEFINE_SMB3_RW_ERR_EVENT(name) \
-DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
+#define DEFINE_SMB3_OTHER_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_other_err_class, smb3_##name, \
TP_PROTO(unsigned int xid, \
__u64 fid, \
__u32 tid, \
@@ -127,15 +183,67 @@ DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
int rc), \
TP_ARGS(xid, fid, tid, sesid, offset, len, rc))
-DEFINE_SMB3_RW_ERR_EVENT(write_err);
-DEFINE_SMB3_RW_ERR_EVENT(read_err);
-DEFINE_SMB3_RW_ERR_EVENT(query_dir_err);
-DEFINE_SMB3_RW_ERR_EVENT(zero_err);
-DEFINE_SMB3_RW_ERR_EVENT(falloc_err);
+DEFINE_SMB3_OTHER_ERR_EVENT(write_err);
+DEFINE_SMB3_OTHER_ERR_EVENT(query_dir_err);
+DEFINE_SMB3_OTHER_ERR_EVENT(zero_err);
+DEFINE_SMB3_OTHER_ERR_EVENT(falloc_err);
/* For logging successful read or write */
DECLARE_EVENT_CLASS(smb3_rw_done_class,
+ TP_PROTO(unsigned int rreq_debug_id,
+ unsigned int rreq_debug_index,
+ unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u64 offset,
+ __u32 len),
+ TP_ARGS(rreq_debug_id, rreq_debug_index,
+ xid, fid, tid, sesid, offset, len),
+ TP_STRUCT__entry(
+ __field(unsigned int, rreq_debug_id)
+ __field(unsigned int, rreq_debug_index)
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, offset)
+ __field(__u32, len)
+ ),
+ TP_fast_assign(
+ __entry->rreq_debug_id = rreq_debug_id;
+ __entry->rreq_debug_index = rreq_debug_index;
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->offset = offset;
+ __entry->len = len;
+ ),
+ TP_printk("R=%08x[%x] xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x",
+ __entry->rreq_debug_id, __entry->rreq_debug_index,
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->offset, __entry->len)
+)
+
+#define DEFINE_SMB3_RW_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
+ TP_PROTO(unsigned int rreq_debug_id, \
+ unsigned int rreq_debug_index, \
+ unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 offset, \
+ __u32 len), \
+ TP_ARGS(rreq_debug_id, rreq_debug_index, xid, fid, tid, sesid, offset, len))
+
+DEFINE_SMB3_RW_DONE_EVENT(read_enter);
+DEFINE_SMB3_RW_DONE_EVENT(read_done);
+
+/* For logging successful other op */
+DECLARE_EVENT_CLASS(smb3_other_done_class,
TP_PROTO(unsigned int xid,
__u64 fid,
__u32 tid,
@@ -164,8 +272,8 @@ DECLARE_EVENT_CLASS(smb3_rw_done_class,
__entry->offset, __entry->len)
)
-#define DEFINE_SMB3_RW_DONE_EVENT(name) \
-DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
+#define DEFINE_SMB3_OTHER_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_other_done_class, smb3_##name, \
TP_PROTO(unsigned int xid, \
__u64 fid, \
__u32 tid, \
@@ -174,16 +282,14 @@ DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
__u32 len), \
TP_ARGS(xid, fid, tid, sesid, offset, len))
-DEFINE_SMB3_RW_DONE_EVENT(write_enter);
-DEFINE_SMB3_RW_DONE_EVENT(read_enter);
-DEFINE_SMB3_RW_DONE_EVENT(query_dir_enter);
-DEFINE_SMB3_RW_DONE_EVENT(zero_enter);
-DEFINE_SMB3_RW_DONE_EVENT(falloc_enter);
-DEFINE_SMB3_RW_DONE_EVENT(write_done);
-DEFINE_SMB3_RW_DONE_EVENT(read_done);
-DEFINE_SMB3_RW_DONE_EVENT(query_dir_done);
-DEFINE_SMB3_RW_DONE_EVENT(zero_done);
-DEFINE_SMB3_RW_DONE_EVENT(falloc_done);
+DEFINE_SMB3_OTHER_DONE_EVENT(write_enter);
+DEFINE_SMB3_OTHER_DONE_EVENT(query_dir_enter);
+DEFINE_SMB3_OTHER_DONE_EVENT(zero_enter);
+DEFINE_SMB3_OTHER_DONE_EVENT(falloc_enter);
+DEFINE_SMB3_OTHER_DONE_EVENT(write_done);
+DEFINE_SMB3_OTHER_DONE_EVENT(query_dir_done);
+DEFINE_SMB3_OTHER_DONE_EVENT(zero_done);
+DEFINE_SMB3_OTHER_DONE_EVENT(falloc_done);
/* For logging successful set EOF (truncate) */
DECLARE_EVENT_CLASS(smb3_eof_class,
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index ddf1a3aafee5..012b9bd06995 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -691,8 +691,8 @@ wait_for_compound_request(struct TCP_Server_Info *server, int num,
}
int
-cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
- unsigned int *num, struct cifs_credits *credits)
+cifs_wait_mtu_credits(struct TCP_Server_Info *server, size_t size,
+ size_t *num, struct cifs_credits *credits)
{
*num = size;
credits->value = 0;
@@ -1692,7 +1692,7 @@ __cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid,
static int
cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
- struct cifs_readdata *rdata = mid->callback_data;
+ struct cifs_io_subrequest *rdata = mid->callback_data;
return __cifs_readv_discard(server, mid, rdata->result);
}
@@ -1702,13 +1702,13 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
int length, len;
unsigned int data_offset, data_len;
- struct cifs_readdata *rdata = mid->callback_data;
+ struct cifs_io_subrequest *rdata = mid->callback_data;
char *buf = server->smallbuf;
unsigned int buflen = server->pdu_size + HEADER_PREAMBLE_SIZE(server);
bool use_rdma_mr = false;
- cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n",
- __func__, mid->mid, rdata->offset, rdata->bytes);
+ cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%zu\n",
+ __func__, mid->mid, rdata->subreq.start, rdata->subreq.len);
/*
* read the rest of READ_RSP header (sans Data array), or whatever we
@@ -1813,8 +1813,11 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
length = data_len; /* An RDMA read is already done. */
else
#endif
- length = cifs_read_iter_from_socket(server, &rdata->iter,
+ {
+ length = cifs_read_iter_from_socket(server, &rdata->subreq.io_iter,
data_len);
+ iov_iter_revert(&rdata->subreq.io_iter, data_len);
+ }
if (length > 0)
rdata->got_bytes += length;
server->total_read += length;