summaryrefslogtreecommitdiff
path: root/fs/netfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/netfs')
-rw-r--r--fs/netfs/Makefile3
-rw-r--r--fs/netfs/buffered_read.c40
-rw-r--r--fs/netfs/buffered_write.c829
-rw-r--r--fs/netfs/direct_write.c56
-rw-r--r--fs/netfs/fscache_io.c14
-rw-r--r--fs/netfs/internal.h55
-rw-r--r--fs/netfs/io.c162
-rw-r--r--fs/netfs/main.c55
-rw-r--r--fs/netfs/misc.c10
-rw-r--r--fs/netfs/objects.c81
-rw-r--r--fs/netfs/output.c478
-rw-r--r--fs/netfs/stats.c17
-rw-r--r--fs/netfs/write_collect.c808
-rw-r--r--fs/netfs/write_issue.c684
14 files changed, 1817 insertions, 1475 deletions
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index d4d1d799819e..8e6781e0b10b 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -11,7 +11,8 @@ netfs-y := \
main.o \
misc.o \
objects.o \
- output.o
+ write_collect.o \
+ write_issue.o
netfs-$(CONFIG_NETFS_STATS) += stats.o
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 3298c29b5548..a6bb03bea920 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -10,8 +10,11 @@
#include "internal.h"
/*
- * Unlock the folios in a read operation. We need to set PG_fscache on any
+ * Unlock the folios in a read operation. We need to set PG_writeback on any
* folios we're going to write back before we unlock them.
+ *
+ * Note that if the deprecated NETFS_RREQ_USE_PGPRIV2 is set then we use
+ * PG_private_2 and do a direct write to the cache from here instead.
*/
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
@@ -48,14 +51,14 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
xas_for_each(&xas, folio, last_page) {
loff_t pg_end;
bool pg_failed = false;
- bool folio_started;
+ bool wback_to_cache = false;
+ bool folio_started = false;
if (xas_retry(&xas, folio))
continue;
pg_end = folio_pos(folio) + folio_size(folio) - 1;
- folio_started = false;
for (;;) {
loff_t sreq_end;
@@ -63,10 +66,16 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
pg_failed = true;
break;
}
- if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
- trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
- folio_start_fscache(folio);
- folio_started = true;
+ if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
+ if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE,
+ &subreq->flags)) {
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
+ folio_start_private_2(folio);
+ folio_started = true;
+ }
+ } else {
+ wback_to_cache |=
+ test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
}
pg_failed |= subreq_failed;
sreq_end = subreq->start + subreq->len - 1;
@@ -98,6 +107,11 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
kfree(finfo);
}
folio_mark_uptodate(folio);
+ if (wback_to_cache && !WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
+ folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
+ filemap_dirty_folio(folio->mapping, folio);
+ }
}
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
@@ -116,7 +130,9 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
}
static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
- loff_t *_start, size_t *_len, loff_t i_size)
+ unsigned long long *_start,
+ unsigned long long *_len,
+ unsigned long long i_size)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
@@ -266,7 +282,7 @@ int netfs_read_folio(struct file *file, struct folio *folio)
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
goto discard;
- netfs_stat(&netfs_n_rh_readpage);
+ netfs_stat(&netfs_n_rh_read_folio);
trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
/* Set up the output buffer */
@@ -450,7 +466,7 @@ retry:
if (!netfs_is_cache_enabled(ctx) &&
netfs_skip_folio_read(folio, pos, len, false)) {
netfs_stat(&netfs_n_rh_write_zskip);
- goto have_folio_no_wait;
+ goto have_folio;
}
rreq = netfs_alloc_request(mapping, file,
@@ -491,10 +507,6 @@ retry:
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio:
- ret = folio_wait_fscache_killable(folio);
- if (ret < 0)
- goto error;
-have_folio_no_wait:
*_folio = folio;
_leave(" = 0");
return 0;
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 267b622d923b..1121601536d1 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Network filesystem high-level write support.
+/* Network filesystem high-level buffered write support.
*
* Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -26,25 +26,15 @@ enum netfs_how_to_modify {
NETFS_FLUSH_CONTENT, /* Flush incompatible content. */
};
-static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);
-
static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
- if (netfs_group && !folio_get_private(folio))
- folio_attach_private(folio, netfs_get_group(netfs_group));
-}
+ void *priv = folio_get_private(folio);
-#if IS_ENABLED(CONFIG_FSCACHE)
-static void netfs_folio_start_fscache(bool caching, struct folio *folio)
-{
- if (caching)
- folio_start_fscache(folio);
-}
-#else
-static void netfs_folio_start_fscache(bool caching, struct folio *folio)
-{
+ if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
+ folio_attach_private(folio, netfs_get_group(netfs_group));
+ else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
+ folio_detach_private(folio);
}
-#endif
/*
* Decide how we should modify a folio. We might be attempting to do
@@ -63,11 +53,12 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
bool maybe_trouble)
{
struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct netfs_group *group = netfs_folio_group(folio);
loff_t pos = folio_file_pos(folio);
_enter("");
- if (netfs_folio_group(folio) != netfs_group)
+ if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE)
return NETFS_FLUSH_CONTENT;
if (folio_test_uptodate(folio))
@@ -81,16 +72,12 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
if (file->f_mode & FMODE_READ)
goto no_write_streaming;
- if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
- goto no_write_streaming;
if (netfs_is_cache_enabled(ctx)) {
/* We don't want to get a streaming write on a file that loses
* caching service temporarily because the backing store got
* culled.
*/
- if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
- set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags);
goto no_write_streaming;
}
@@ -130,6 +117,37 @@ static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
mapping_gfp_mask(mapping));
}
+/*
+ * Update i_size and estimate the update to i_blocks to reflect the additional
+ * data written into the pagecache until we can find out from the server what
+ * the values actually are.
+ */
+static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
+ loff_t i_size, loff_t pos, size_t copied)
+{
+ blkcnt_t add;
+ size_t gap;
+
+ if (ctx->ops->update_i_size) {
+ ctx->ops->update_i_size(inode, pos);
+ return;
+ }
+
+ i_size_write(inode, pos);
+#if IS_ENABLED(CONFIG_FSCACHE)
+ fscache_update_cookie(ctx->cache, NULL, &pos);
+#endif
+
+ gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
+ if (copied > gap) {
+ add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);
+
+ inode->i_blocks = min_t(blkcnt_t,
+ DIV_ROUND_UP(pos, SECTOR_SIZE),
+ inode->i_blocks + add);
+ }
+}
+
/**
* netfs_perform_write - Copy data into the pagecache.
* @iocb: The operation parameters
@@ -160,7 +178,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
};
struct netfs_io_request *wreq = NULL;
struct netfs_folio *finfo;
- struct folio *folio;
+ struct folio *folio, *writethrough = NULL;
enum netfs_how_to_modify howto;
enum netfs_folio_trace trace;
unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
@@ -189,7 +207,9 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
}
if (!is_sync_kiocb(iocb))
wreq->iocb = iocb;
- wreq->cleanup = netfs_cleanup_buffered_write;
+ netfs_stat(&netfs_n_wh_writethrough);
+ } else {
+ netfs_stat(&netfs_n_wh_buffered_write);
}
do {
@@ -230,6 +250,16 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
offset = pos & (flen - 1);
part = min_t(size_t, flen - offset, part);
+ /* Wait for writeback to complete. The writeback engine owns
+ * the info in folio->private and may change it until it
+ * removes the WB mark.
+ */
+ if (folio_get_private(folio) &&
+ folio_wait_writeback_killable(folio)) {
+ ret = written ? -EINTR : -ERESTARTSYS;
+ goto error_folio_unlock;
+ }
+
if (signal_pending(current)) {
ret = written ? -EINTR : -ERESTARTSYS;
goto error_folio_unlock;
@@ -304,6 +334,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
maybe_trouble = true;
iov_iter_revert(iter, copied);
copied = 0;
+ folio_unlock(folio);
goto retry;
}
netfs_set_group(folio, netfs_group);
@@ -351,41 +382,22 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
trace_netfs_folio(folio, trace);
/* Update the inode size if we moved the EOF marker */
- i_size = i_size_read(inode);
pos += copied;
- if (pos > i_size) {
- if (ctx->ops->update_i_size) {
- ctx->ops->update_i_size(inode, pos);
- } else {
- i_size_write(inode, pos);
-#if IS_ENABLED(CONFIG_FSCACHE)
- fscache_update_cookie(ctx->cache, NULL, &pos);
-#endif
- }
- }
+ i_size = i_size_read(inode);
+ if (pos > i_size)
+ netfs_update_i_size(ctx, inode, i_size, pos, copied);
written += copied;
if (likely(!wreq)) {
folio_mark_dirty(folio);
+ folio_unlock(folio);
} else {
- if (folio_test_dirty(folio))
- /* Sigh. mmap. */
- folio_clear_dirty_for_io(folio);
- /* We make multiple writes to the folio... */
- if (!folio_test_writeback(folio)) {
- folio_wait_fscache(folio);
- folio_start_writeback(folio);
- folio_start_fscache(folio);
- if (wreq->iter.count == 0)
- trace_netfs_folio(folio, netfs_folio_trace_wthru);
- else
- trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
- }
- netfs_advance_writethrough(wreq, copied,
- offset + copied == flen);
+ netfs_advance_writethrough(wreq, &wbc, folio, copied,
+ offset + copied == flen,
+ &writethrough);
+ /* Folio unlocked */
}
retry:
- folio_unlock(folio);
folio_put(folio);
folio = NULL;
@@ -393,8 +405,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
} while (iov_iter_count(iter));
out:
+ if (likely(written) && ctx->ops->post_modify)
+ ctx->ops->post_modify(inode);
+
if (unlikely(wreq)) {
- ret2 = netfs_end_writethrough(wreq, iocb);
+ ret2 = netfs_end_writethrough(wreq, &wbc, writethrough);
wbc_detach_inode(&wbc);
if (ret2 == -EIOCBQUEUED)
return ret2;
@@ -505,9 +520,11 @@ EXPORT_SYMBOL(netfs_file_write_iter);
*/
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
+ struct netfs_group *group;
struct folio *folio = page_folio(vmf->page);
struct file *file = vmf->vma->vm_file;
struct inode *inode = file_inode(file);
+ struct netfs_inode *ictx = netfs_inode(inode);
vm_fault_t ret = VM_FAULT_RETRY;
int err;
@@ -515,11 +532,13 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
sb_start_pagefault(inode->i_sb);
- if (folio_wait_writeback_killable(folio))
+ if (folio_lock_killable(folio) < 0)
goto out;
- if (folio_lock_killable(folio) < 0)
+ if (folio_wait_writeback_killable(folio)) {
+ ret = VM_FAULT_LOCKED;
goto out;
+ }
/* Can we see a streaming write here? */
if (WARN_ON(!folio_test_uptodate(folio))) {
@@ -527,7 +546,8 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
goto out;
}
- if (netfs_folio_group(folio) != netfs_group) {
+ group = netfs_folio_group(folio);
+ if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) {
folio_unlock(folio);
err = filemap_fdatawait_range(inode->i_mapping,
folio_pos(folio),
@@ -551,708 +571,11 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
netfs_set_group(folio, netfs_group);
file_update_time(file);
+ if (ictx->ops->post_modify)
+ ictx->ops->post_modify(inode);
ret = VM_FAULT_LOCKED;
out:
sb_end_pagefault(inode->i_sb);
return ret;
}
EXPORT_SYMBOL(netfs_page_mkwrite);
-
-/*
- * Kill all the pages in the given range
- */
-static void netfs_kill_pages(struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("%llx-%llx", start, start + len - 1);
-
- do {
- _debug("kill %lx (to %lx)", index, last);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = folio_next_index(folio);
-
- trace_netfs_folio(folio, netfs_folio_trace_kill);
- folio_clear_uptodate(folio);
- if (folio_test_fscache(folio))
- folio_end_fscache(folio);
- folio_end_writeback(folio);
- folio_lock(folio);
- generic_error_remove_folio(mapping, folio);
- folio_unlock(folio);
- folio_put(folio);
-
- } while (index = next, index <= last);
-
- _leave("");
-}
-
-/*
- * Redirty all the pages in a given range.
- */
-static void netfs_redirty_pages(struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("%llx-%llx", start, start + len - 1);
-
- do {
- _debug("redirty %llx @%llx", len, start);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = folio_next_index(folio);
- trace_netfs_folio(folio, netfs_folio_trace_redirty);
- filemap_dirty_folio(mapping, folio);
- if (folio_test_fscache(folio))
- folio_end_fscache(folio);
- folio_end_writeback(folio);
- folio_put(folio);
- } while (index = next, index <= last);
-
- balance_dirty_pages_ratelimited(mapping);
-
- _leave("");
-}
-
-/*
- * Completion of write to server
- */
-static void netfs_pages_written_back(struct netfs_io_request *wreq)
-{
- struct address_space *mapping = wreq->mapping;
- struct netfs_folio *finfo;
- struct netfs_group *group = NULL;
- struct folio *folio;
- pgoff_t last;
- int gcount = 0;
-
- XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);
-
- _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
-
- rcu_read_lock();
-
- last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
- xas_for_each(&xas, folio, last) {
- WARN(!folio_test_writeback(folio),
- "bad %zx @%llx page %lx %lx\n",
- wreq->len, wreq->start, folio->index, last);
-
- if ((finfo = netfs_folio_info(folio))) {
- /* Streaming writes cannot be redirtied whilst under
- * writeback, so discard the streaming record.
- */
- folio_detach_private(folio);
- group = finfo->netfs_group;
- gcount++;
- trace_netfs_folio(folio, netfs_folio_trace_clear_s);
- kfree(finfo);
- } else if ((group = netfs_folio_group(folio))) {
- /* Need to detach the group pointer if the page didn't
- * get redirtied. If it has been redirtied, then it
- * must be within the same group.
- */
- if (folio_test_dirty(folio)) {
- trace_netfs_folio(folio, netfs_folio_trace_redirtied);
- goto end_wb;
- }
- if (folio_trylock(folio)) {
- if (!folio_test_dirty(folio)) {
- folio_detach_private(folio);
- gcount++;
- trace_netfs_folio(folio, netfs_folio_trace_clear_g);
- } else {
- trace_netfs_folio(folio, netfs_folio_trace_redirtied);
- }
- folio_unlock(folio);
- goto end_wb;
- }
-
- xas_pause(&xas);
- rcu_read_unlock();
- folio_lock(folio);
- if (!folio_test_dirty(folio)) {
- folio_detach_private(folio);
- gcount++;
- trace_netfs_folio(folio, netfs_folio_trace_clear_g);
- } else {
- trace_netfs_folio(folio, netfs_folio_trace_redirtied);
- }
- folio_unlock(folio);
- rcu_read_lock();
- } else {
- trace_netfs_folio(folio, netfs_folio_trace_clear);
- }
- end_wb:
- if (folio_test_fscache(folio))
- folio_end_fscache(folio);
- xas_advance(&xas, folio_next_index(folio) - 1);
- folio_end_writeback(folio);
- }
-
- rcu_read_unlock();
- netfs_put_group_many(group, gcount);
- _leave("");
-}
-
-/*
- * Deal with the disposition of the folios that are under writeback to close
- * out the operation.
- */
-static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
-{
- struct address_space *mapping = wreq->mapping;
-
- _enter("");
-
- switch (wreq->error) {
- case 0:
- netfs_pages_written_back(wreq);
- break;
-
- default:
- pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
- fallthrough;
- case -EACCES:
- case -EPERM:
- case -ENOKEY:
- case -EKEYEXPIRED:
- case -EKEYREJECTED:
- case -EKEYREVOKED:
- case -ENETRESET:
- case -EDQUOT:
- case -ENOSPC:
- netfs_redirty_pages(mapping, wreq->start, wreq->len);
- break;
-
- case -EROFS:
- case -EIO:
- case -EREMOTEIO:
- case -EFBIG:
- case -ENOENT:
- case -ENOMEDIUM:
- case -ENXIO:
- netfs_kill_pages(mapping, wreq->start, wreq->len);
- break;
- }
-
- if (wreq->error)
- mapping_set_error(mapping, wreq->error);
- if (wreq->netfs_ops->done)
- wreq->netfs_ops->done(wreq);
-}
-
-/*
- * Extend the region to be written back to include subsequent contiguously
- * dirty pages if possible, but don't sleep while doing so.
- *
- * If this page holds new content, then we can include filler zeros in the
- * writeback.
- */
-static void netfs_extend_writeback(struct address_space *mapping,
- struct netfs_group *group,
- struct xa_state *xas,
- long *_count,
- loff_t start,
- loff_t max_len,
- bool caching,
- size_t *_len,
- size_t *_top)
-{
- struct netfs_folio *finfo;
- struct folio_batch fbatch;
- struct folio *folio;
- unsigned int i;
- pgoff_t index = (start + *_len) / PAGE_SIZE;
- size_t len;
- void *priv;
- bool stop = true;
-
- folio_batch_init(&fbatch);
-
- do {
- /* Firstly, we gather up a batch of contiguous dirty pages
- * under the RCU read lock - but we can't clear the dirty flags
- * there if any of those pages are mapped.
- */
- rcu_read_lock();
-
- xas_for_each(xas, folio, ULONG_MAX) {
- stop = true;
- if (xas_retry(xas, folio))
- continue;
- if (xa_is_value(folio))
- break;
- if (folio->index != index) {
- xas_reset(xas);
- break;
- }
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(xas);
- continue;
- }
-
- /* Has the folio moved or been split? */
- if (unlikely(folio != xas_reload(xas))) {
- folio_put(folio);
- xas_reset(xas);
- break;
- }
-
- if (!folio_trylock(folio)) {
- folio_put(folio);
- xas_reset(xas);
- break;
- }
- if (!folio_test_dirty(folio) ||
- folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- xas_reset(xas);
- break;
- }
-
- stop = false;
- len = folio_size(folio);
- priv = folio_get_private(folio);
- if ((const struct netfs_group *)priv != group) {
- stop = true;
- finfo = netfs_folio_info(folio);
- if (finfo->netfs_group != group ||
- finfo->dirty_offset > 0) {
- folio_unlock(folio);
- folio_put(folio);
- xas_reset(xas);
- break;
- }
- len = finfo->dirty_len;
- }
-
- *_top += folio_size(folio);
- index += folio_nr_pages(folio);
- *_count -= folio_nr_pages(folio);
- *_len += len;
- if (*_len >= max_len || *_count <= 0)
- stop = true;
-
- if (!folio_batch_add(&fbatch, folio))
- break;
- if (stop)
- break;
- }
-
- xas_pause(xas);
- rcu_read_unlock();
-
- /* Now, if we obtained any folios, we can shift them to being
- * writable and mark them for caching.
- */
- if (!folio_batch_count(&fbatch))
- break;
-
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- folio = fbatch.folios[i];
- trace_netfs_folio(folio, netfs_folio_trace_store_plus);
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- folio_start_writeback(folio);
- netfs_folio_start_fscache(caching, folio);
- folio_unlock(folio);
- }
-
- folio_batch_release(&fbatch);
- cond_resched();
- } while (!stop);
-}
-
-/*
- * Synchronously write back the locked page and any subsequent non-locked dirty
- * pages.
- */
-static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
- struct writeback_control *wbc,
- struct netfs_group *group,
- struct xa_state *xas,
- struct folio *folio,
- unsigned long long start,
- unsigned long long end)
-{
- struct netfs_io_request *wreq;
- struct netfs_folio *finfo;
- struct netfs_inode *ctx = netfs_inode(mapping->host);
- unsigned long long i_size = i_size_read(&ctx->inode);
- size_t len, max_len;
- bool caching = netfs_is_cache_enabled(ctx);
- long count = wbc->nr_to_write;
- int ret;
-
- _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching);
-
- wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
- NETFS_WRITEBACK);
- if (IS_ERR(wreq)) {
- folio_unlock(folio);
- return PTR_ERR(wreq);
- }
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- folio_start_writeback(folio);
- netfs_folio_start_fscache(caching, folio);
-
- count -= folio_nr_pages(folio);
-
- /* Find all consecutive lockable dirty pages that have contiguous
- * written regions, stopping when we find a page that is not
- * immediately lockable, is not dirty or is missing, or we reach the
- * end of the range.
- */
- trace_netfs_folio(folio, netfs_folio_trace_store);
-
- len = wreq->len;
- finfo = netfs_folio_info(folio);
- if (finfo) {
- start += finfo->dirty_offset;
- if (finfo->dirty_offset + finfo->dirty_len != len) {
- len = finfo->dirty_len;
- goto cant_expand;
- }
- len = finfo->dirty_len;
- }
-
- if (start < i_size) {
- /* Trim the write to the EOF; the extra data is ignored. Also
- * put an upper limit on the size of a single storedata op.
- */
- max_len = 65536 * 4096;
- max_len = min_t(unsigned long long, max_len, end - start + 1);
- max_len = min_t(unsigned long long, max_len, i_size - start);
-
- if (len < max_len)
- netfs_extend_writeback(mapping, group, xas, &count, start,
- max_len, caching, &len, &wreq->upper_len);
- }
-
-cant_expand:
- len = min_t(unsigned long long, len, i_size - start);
-
- /* We now have a contiguous set of dirty pages, each with writeback
- * set; the first page is still locked at this point, but all the rest
- * have been unlocked.
- */
- folio_unlock(folio);
- wreq->start = start;
- wreq->len = len;
-
- if (start < i_size) {
- _debug("write back %zx @%llx [%llx]", len, start, i_size);
-
- /* Speculatively write to the cache. We have to fix this up
- * later if the store fails.
- */
- wreq->cleanup = netfs_cleanup_buffered_write;
-
- iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
- wreq->upper_len);
- __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
- ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
- if (ret == 0 || ret == -EIOCBQUEUED)
- wbc->nr_to_write -= len / PAGE_SIZE;
- } else {
- _debug("write discard %zx @%llx [%llx]", len, start, i_size);
-
- /* The dirty region was entirely beyond the EOF. */
- fscache_clear_page_bits(mapping, start, len, caching);
- netfs_pages_written_back(wreq);
- ret = 0;
- }
-
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
- _leave(" = 1");
- return 1;
-}
-
-/*
- * Write a region of pages back to the server
- */
-static ssize_t netfs_writepages_begin(struct address_space *mapping,
- struct writeback_control *wbc,
- struct netfs_group *group,
- struct xa_state *xas,
- unsigned long long *_start,
- unsigned long long end)
-{
- const struct netfs_folio *finfo;
- struct folio *folio;
- unsigned long long start = *_start;
- ssize_t ret;
- void *priv;
- int skips = 0;
-
- _enter("%llx,%llx,", start, end);
-
-search_again:
- /* Find the first dirty page in the group. */
- rcu_read_lock();
-
- for (;;) {
- folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
- if (xas_retry(xas, folio) || xa_is_value(folio))
- continue;
- if (!folio)
- break;
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(xas);
- continue;
- }
-
- if (unlikely(folio != xas_reload(xas))) {
- folio_put(folio);
- xas_reset(xas);
- continue;
- }
-
- /* Skip any dirty folio that's not in the group of interest. */
- priv = folio_get_private(folio);
- if ((const struct netfs_group *)priv != group) {
- finfo = netfs_folio_info(folio);
- if (finfo->netfs_group != group) {
- folio_put(folio);
- continue;
- }
- }
-
- xas_pause(xas);
- break;
- }
- rcu_read_unlock();
- if (!folio)
- return 0;
-
- start = folio_pos(folio); /* May regress with THPs */
-
- _debug("wback %lx", folio->index);
-
- /* At this point we hold neither the i_pages lock nor the page lock:
- * the page may be truncated or invalidated (changing page->mapping to
- * NULL), or even swizzled back from swapper_space to tmpfs file
- * mapping
- */
-lock_again:
- if (wbc->sync_mode != WB_SYNC_NONE) {
- ret = folio_lock_killable(folio);
- if (ret < 0)
- return ret;
- } else {
- if (!folio_trylock(folio))
- goto search_again;
- }
-
- if (folio->mapping != mapping ||
- !folio_test_dirty(folio)) {
- start += folio_size(folio);
- folio_unlock(folio);
- goto search_again;
- }
-
- if (folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- if (wbc->sync_mode != WB_SYNC_NONE) {
- folio_wait_writeback(folio);
-#ifdef CONFIG_FSCACHE
- folio_wait_fscache(folio);
-#endif
- goto lock_again;
- }
-
- start += folio_size(folio);
- if (wbc->sync_mode == WB_SYNC_NONE) {
- if (skips >= 5 || need_resched()) {
- ret = 0;
- goto out;
- }
- skips++;
- }
- goto search_again;
- }
-
- ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
- folio, start, end);
-out:
- if (ret > 0)
- *_start = start + ret;
- _leave(" = %zd [%llx]", ret, *_start);
- return ret;
-}
-
-/*
- * Write a region of pages back to the server
- */
-static int netfs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- struct netfs_group *group,
- unsigned long long *_start,
- unsigned long long end)
-{
- ssize_t ret;
-
- XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
-
- do {
- ret = netfs_writepages_begin(mapping, wbc, group, &xas,
- _start, end);
- if (ret > 0 && wbc->nr_to_write > 0)
- cond_resched();
- } while (ret > 0 && wbc->nr_to_write > 0);
-
- return ret > 0 ? 0 : ret;
-}
-
-/*
- * write some of the pending data back to the server
- */
-int netfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct netfs_group *group = NULL;
- loff_t start, end;
- int ret;
-
- _enter("");
-
- /* We have to be careful as we can end up racing with setattr()
- * truncating the pagecache since the caller doesn't take a lock here
- * to prevent it.
- */
-
- if (wbc->range_cyclic && mapping->writeback_index) {
- start = mapping->writeback_index * PAGE_SIZE;
- ret = netfs_writepages_region(mapping, wbc, group,
- &start, LLONG_MAX);
- if (ret < 0)
- goto out;
-
- if (wbc->nr_to_write <= 0) {
- mapping->writeback_index = start / PAGE_SIZE;
- goto out;
- }
-
- start = 0;
- end = mapping->writeback_index * PAGE_SIZE;
- mapping->writeback_index = 0;
- ret = netfs_writepages_region(mapping, wbc, group, &start, end);
- if (ret == 0)
- mapping->writeback_index = start / PAGE_SIZE;
- } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- start = 0;
- ret = netfs_writepages_region(mapping, wbc, group,
- &start, LLONG_MAX);
- if (wbc->nr_to_write > 0 && ret == 0)
- mapping->writeback_index = start / PAGE_SIZE;
- } else {
- start = wbc->range_start;
- ret = netfs_writepages_region(mapping, wbc, group,
- &start, wbc->range_end);
- }
-
-out:
- _leave(" = %d", ret);
- return ret;
-}
-EXPORT_SYMBOL(netfs_writepages);
-
-/*
- * Deal with the disposition of a laundered folio.
- */
-static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
-{
- if (wreq->error) {
- pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
- mapping_set_error(wreq->mapping, wreq->error);
- }
-}
-
-/**
- * netfs_launder_folio - Clean up a dirty folio that's being invalidated
- * @folio: The folio to clean
- *
- * This is called to write back a folio that's being invalidated when an inode
- * is getting torn down. Ideally, writepages would be used instead.
- */
-int netfs_launder_folio(struct folio *folio)
-{
- struct netfs_io_request *wreq;
- struct address_space *mapping = folio->mapping;
- struct netfs_folio *finfo = netfs_folio_info(folio);
- struct netfs_group *group = netfs_folio_group(folio);
- struct bio_vec bvec;
- unsigned long long i_size = i_size_read(mapping->host);
- unsigned long long start = folio_pos(folio);
- size_t offset = 0, len;
- int ret = 0;
-
- if (finfo) {
- offset = finfo->dirty_offset;
- start += offset;
- len = finfo->dirty_len;
- } else {
- len = folio_size(folio);
- }
- len = min_t(unsigned long long, len, i_size - start);
-
- wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
- if (IS_ERR(wreq)) {
- ret = PTR_ERR(wreq);
- goto out;
- }
-
- if (!folio_clear_dirty_for_io(folio))
- goto out_put;
-
- trace_netfs_folio(folio, netfs_folio_trace_launder);
-
- _debug("launder %llx-%llx", start, start + len - 1);
-
- /* Speculatively write to the cache. We have to fix this up later if
- * the store fails.
- */
- wreq->cleanup = netfs_cleanup_launder_folio;
-
- bvec_set_folio(&bvec, folio, len, offset);
- iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
- __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
- ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);
-
-out_put:
- folio_detach_private(folio);
- netfs_put_group(group);
- kfree(finfo);
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
-out:
- folio_wait_fscache(folio);
- _leave(" = %d", ret);
- return ret;
-}
-EXPORT_SYMBOL(netfs_launder_folio);
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index bee047e20f5d..608ba6416919 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -34,6 +34,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
unsigned long long start = iocb->ki_pos;
unsigned long long end = start + iov_iter_count(iter);
ssize_t ret, n;
+ size_t len = iov_iter_count(iter);
bool async = !is_sync_kiocb(iocb);
_enter("");
@@ -46,13 +47,17 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
_debug("uw %llx-%llx", start, end);
- wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
- start, end - start,
- iocb->ki_flags & IOCB_DIRECT ?
- NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
+ wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start,
+ iocb->ki_flags & IOCB_DIRECT ?
+ NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
if (IS_ERR(wreq))
return PTR_ERR(wreq);
+ wreq->io_streams[0].avail = true;
+ trace_netfs_write(wreq, (iocb->ki_flags & IOCB_DIRECT ?
+ netfs_write_trace_dio_write :
+ netfs_write_trace_unbuffered_write));
+
{
/* If this is an async op and we're not using a bounce buffer,
* we have to save the source buffer as the iterator is only
@@ -63,7 +68,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
* request.
*/
if (async || user_backed_iter(iter)) {
- n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0);
+ n = netfs_extract_user_iter(iter, len, &wreq->iter, 0);
if (n < 0) {
ret = n;
goto out;
@@ -71,7 +76,6 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec;
wreq->direct_bv_count = n;
wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
- wreq->len = iov_iter_count(&wreq->iter);
} else {
wreq->iter = *iter;
}
@@ -79,6 +83,8 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
wreq->io_iter = wreq->iter;
}
+ __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);
+
/* Copy the data into the bounce buffer and encrypt it. */
// TODO
@@ -87,10 +93,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
if (async)
wreq->iocb = iocb;
wreq->cleanup = netfs_cleanup_dio_write;
- ret = netfs_begin_write(wreq, is_sync_kiocb(iocb),
- iocb->ki_flags & IOCB_DIRECT ?
- netfs_write_trace_dio_write :
- netfs_write_trace_unbuffered_write);
+ ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), iov_iter_count(&wreq->io_iter));
if (ret < 0) {
_debug("begin = %zd", ret);
goto out;
@@ -100,9 +103,8 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
TASK_UNINTERRUPTIBLE);
-
+ smp_rmb(); /* Read error/transferred after RIP flag */
ret = wreq->error;
- _debug("waited = %zd", ret);
if (ret == 0) {
ret = wreq->transferred;
iocb->ki_pos += ret;
@@ -132,18 +134,20 @@ out:
ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
struct netfs_inode *ictx = netfs_inode(inode);
- unsigned long long end;
ssize_t ret;
+ loff_t pos = iocb->ki_pos;
+ unsigned long long end = pos + iov_iter_count(from) - 1;
- _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+ _enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));
if (!iov_iter_count(from))
return 0;
trace_netfs_write_iter(iocb, from);
- netfs_stat(&netfs_n_rh_dio_write);
+ netfs_stat(&netfs_n_wh_dio_write);
ret = netfs_start_io_direct(inode);
if (ret < 0)
@@ -157,7 +161,25 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = file_update_time(file);
if (ret < 0)
goto out;
- ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* We could block if there are any pages in the range. */
+ ret = -EAGAIN;
+ if (filemap_range_has_page(mapping, pos, end))
+ if (filemap_invalidate_inode(inode, true, pos, end))
+ goto out;
+ } else {
+ ret = filemap_write_and_wait_range(mapping, pos, end);
+ if (ret < 0)
+ goto out;
+ }
+
+ /*
+ * After a write we want buffered reads to be sure to go to disk to get
+ * the new data. We invalidate clean cached page from the region we're
+ * about to write. We do this *before* the write so that we can return
+ * without clobbering -EIOCBQUEUED from ->direct_IO().
+ */
+ ret = filemap_invalidate_inode(inode, true, pos, end);
if (ret < 0)
goto out;
end = iocb->ki_pos + iov_iter_count(from);
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index 43a651ed8264..38637e5c9b57 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -166,6 +166,7 @@ struct fscache_write_request {
loff_t start;
size_t len;
bool set_bits;
+ bool using_pgpriv2;
netfs_io_terminated_t term_func;
void *term_func_priv;
};
@@ -182,7 +183,7 @@ void __fscache_clear_page_bits(struct address_space *mapping,
rcu_read_lock();
xas_for_each(&xas, page, last) {
- end_page_fscache(page);
+ folio_end_private_2(page_folio(page));
}
rcu_read_unlock();
}
@@ -197,8 +198,9 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error,
{
struct fscache_write_request *wreq = priv;
- fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len,
- wreq->set_bits);
+ if (wreq->using_pgpriv2)
+ fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len,
+ wreq->set_bits);
if (wreq->term_func)
wreq->term_func(wreq->term_func_priv, transferred_or_error,
@@ -212,7 +214,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
loff_t start, size_t len, loff_t i_size,
netfs_io_terminated_t term_func,
void *term_func_priv,
- bool cond)
+ bool using_pgpriv2, bool cond)
{
struct fscache_write_request *wreq;
struct netfs_cache_resources *cres;
@@ -230,6 +232,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
wreq->mapping = mapping;
wreq->start = start;
wreq->len = len;
+ wreq->using_pgpriv2 = using_pgpriv2;
wreq->set_bits = cond;
wreq->term_func = term_func;
wreq->term_func_priv = term_func_priv;
@@ -257,7 +260,8 @@ abandon_end:
abandon_free:
kfree(wreq);
abandon:
- fscache_clear_page_bits(mapping, start, len, cond);
+ if (using_pgpriv2)
+ fscache_clear_page_bits(mapping, start, len, cond);
if (term_func)
term_func(term_func_priv, ret, false);
}
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index ec7045d24400..95e281a8af78 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -37,6 +37,8 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
extern unsigned int netfs_debug;
extern struct list_head netfs_io_requests;
extern spinlock_t netfs_proc_lock;
+extern mempool_t netfs_request_pool;
+extern mempool_t netfs_subrequest_pool;
#ifdef CONFIG_PROC_FS
static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq)
@@ -91,22 +93,12 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
}
/*
- * output.c
- */
-int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
- enum netfs_write_trace what);
-struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
-int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end);
-int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb);
-
-/*
* stats.c
*/
#ifdef CONFIG_NETFS_STATS
extern atomic_t netfs_n_rh_dio_read;
-extern atomic_t netfs_n_rh_dio_write;
extern atomic_t netfs_n_rh_readahead;
-extern atomic_t netfs_n_rh_readpage;
+extern atomic_t netfs_n_rh_read_folio;
extern atomic_t netfs_n_rh_rreq;
extern atomic_t netfs_n_rh_sreq;
extern atomic_t netfs_n_rh_download;
@@ -123,6 +115,10 @@ extern atomic_t netfs_n_rh_write_begin;
extern atomic_t netfs_n_rh_write_done;
extern atomic_t netfs_n_rh_write_failed;
extern atomic_t netfs_n_rh_write_zskip;
+extern atomic_t netfs_n_wh_buffered_write;
+extern atomic_t netfs_n_wh_writethrough;
+extern atomic_t netfs_n_wh_dio_write;
+extern atomic_t netfs_n_wh_writepages;
extern atomic_t netfs_n_wh_wstream_conflict;
extern atomic_t netfs_n_wh_upload;
extern atomic_t netfs_n_wh_upload_done;
@@ -149,6 +145,33 @@ static inline void netfs_stat_d(atomic_t *stat)
#endif
/*
+ * write_collect.c
+ */
+int netfs_folio_written_back(struct folio *folio);
+void netfs_write_collection_worker(struct work_struct *work);
+void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async);
+
+/*
+ * write_issue.c
+ */
+struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
+ struct file *file,
+ loff_t start,
+ enum netfs_io_origin origin);
+void netfs_reissue_write(struct netfs_io_stream *stream,
+ struct netfs_io_subrequest *subreq);
+int netfs_advance_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream,
+ loff_t start, size_t len, bool to_eof);
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
+int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *folio, size_t copied, bool to_page_end,
+ struct folio **writethrough_cache);
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *writethrough_cache);
+int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len);
+
+/*
* Miscellaneous functions.
*/
static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
@@ -168,7 +191,7 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
*/
static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
{
- if (netfs_group)
+ if (netfs_group && netfs_group != NETFS_FOLIO_COPY_TO_CACHE)
refcount_inc(&netfs_group->ref);
return netfs_group;
}
@@ -178,7 +201,9 @@ static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_grou
*/
static inline void netfs_put_group(struct netfs_group *netfs_group)
{
- if (netfs_group && refcount_dec_and_test(&netfs_group->ref))
+ if (netfs_group &&
+ netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
+ refcount_dec_and_test(&netfs_group->ref))
netfs_group->free(netfs_group);
}
@@ -187,7 +212,9 @@ static inline void netfs_put_group(struct netfs_group *netfs_group)
*/
static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
{
- if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref))
+ if (netfs_group &&
+ netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
+ refcount_sub_and_test(nr, &netfs_group->ref))
netfs_group->free(netfs_group);
}
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 4261ad6c55b6..c93851b98368 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -99,145 +99,6 @@ static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async)
}
/*
- * Deal with the completion of writing the data to the cache. We have to clear
- * the PG_fscache bits on the folios involved and release the caller's ref.
- *
- * May be called in softirq mode and we inherit a ref from the caller.
- */
-static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
- bool was_async)
-{
- struct netfs_io_subrequest *subreq;
- struct folio *folio;
- pgoff_t unlocked = 0;
- bool have_unlocked = false;
-
- rcu_read_lock();
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
-
- xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
- if (xas_retry(&xas, folio))
- continue;
-
- /* We might have multiple writes from the same huge
- * folio, but we mustn't unlock a folio more than once.
- */
- if (have_unlocked && folio->index <= unlocked)
- continue;
- unlocked = folio_next_index(folio) - 1;
- trace_netfs_folio(folio, netfs_folio_trace_end_copy);
- folio_end_fscache(folio);
- have_unlocked = true;
- }
- }
-
- rcu_read_unlock();
- netfs_rreq_completed(rreq, was_async);
-}
-
-static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_io_subrequest *subreq = priv;
- struct netfs_io_request *rreq = subreq->rreq;
-
- if (IS_ERR_VALUE(transferred_or_error)) {
- netfs_stat(&netfs_n_rh_write_failed);
- trace_netfs_failure(rreq, subreq, transferred_or_error,
- netfs_fail_copy_to_cache);
- } else {
- netfs_stat(&netfs_n_rh_write_done);
- }
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_write_term);
-
- /* If we decrement nr_copy_ops to 0, the ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_copy_ops))
- netfs_rreq_unmark_after_write(rreq, was_async);
-
- netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
-}
-
-/*
- * Perform any outstanding writes to the cache. We inherit a ref from the
- * caller.
- */
-static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
-{
- struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct netfs_io_subrequest *subreq, *next, *p;
- struct iov_iter iter;
- int ret;
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_copy);
-
- /* We don't want terminating writes trying to wake us up whilst we're
- * still going through the list.
- */
- atomic_inc(&rreq->nr_copy_ops);
-
- list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) {
- if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
- list_del_init(&subreq->rreq_link);
- netfs_put_subrequest(subreq, false,
- netfs_sreq_trace_put_no_copy);
- }
- }
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- /* Amalgamate adjacent writes */
- while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
- next = list_next_entry(subreq, rreq_link);
- if (next->start != subreq->start + subreq->len)
- break;
- subreq->len += next->len;
- list_del_init(&next->rreq_link);
- netfs_put_subrequest(next, false,
- netfs_sreq_trace_put_merged);
- }
-
- ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
- subreq->len, rreq->i_size, true);
- if (ret < 0) {
- trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
- trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
- continue;
- }
-
- iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages,
- subreq->start, subreq->len);
-
- atomic_inc(&rreq->nr_copy_ops);
- netfs_stat(&netfs_n_rh_write);
- netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache);
- trace_netfs_sreq(subreq, netfs_sreq_trace_write);
- cres->ops->write(cres, subreq->start, &iter,
- netfs_rreq_copy_terminated, subreq);
- }
-
- /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_copy_ops))
- netfs_rreq_unmark_after_write(rreq, false);
-}
-
-static void netfs_rreq_write_to_cache_work(struct work_struct *work)
-{
- struct netfs_io_request *rreq =
- container_of(work, struct netfs_io_request, work);
-
- netfs_rreq_do_write_to_cache(rreq);
-}
-
-static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq)
-{
- rreq->work.func = netfs_rreq_write_to_cache_work;
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
-}
-
-/*
* Handle a short read.
*/
static void netfs_rreq_short_read(struct netfs_io_request *rreq,
@@ -352,8 +213,13 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
unsigned int i;
size_t transferred = 0;
- for (i = 0; i < rreq->direct_bv_count; i++)
+ for (i = 0; i < rreq->direct_bv_count; i++) {
flush_dcache_page(rreq->direct_bv[i].bv_page);
+ // TODO: cifs marks pages in the destination buffer
+ // dirty under some circumstances after a read. Do we
+ // need to do that too?
+ set_page_dirty(rreq->direct_bv[i].bv_page);
+ }
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
if (subreq->error || subreq->transferred == 0)
@@ -409,9 +275,6 @@ again:
clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
- if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags))
- return netfs_rreq_write_to_cache(rreq);
-
netfs_rreq_completed(rreq, was_async);
}
@@ -618,7 +481,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
set:
if (subreq->len > rreq->len)
- pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n",
+ pr_warn("R=%08x[%u] SREQ>RREQ %zx > %llx\n",
rreq->debug_id, subreq->debug_index,
subreq->len, rreq->len);
@@ -643,8 +506,7 @@ out:
* Slice off a piece of a read request and submit an I/O request for it.
*/
static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
- struct iov_iter *io_iter,
- unsigned int *_debug_index)
+ struct iov_iter *io_iter)
{
struct netfs_io_subrequest *subreq;
enum netfs_io_source source;
@@ -653,11 +515,10 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
if (!subreq)
return false;
- subreq->debug_index = (*_debug_index)++;
subreq->start = rreq->start + rreq->submitted;
subreq->len = io_iter->count;
- _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
+ _debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
/* Call out to the cache to find out what it can do with the remaining
@@ -707,7 +568,6 @@ subreq_failed:
int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
{
struct iov_iter io_iter;
- unsigned int debug_index = 0;
int ret;
_enter("R=%x %llx-%llx",
@@ -733,12 +593,12 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
atomic_set(&rreq->nr_outstanding, 1);
io_iter = rreq->io_iter;
do {
- _debug("submit %llx + %zx >= %llx",
+ _debug("submit %llx + %llx >= %llx",
rreq->start, rreq->submitted, rreq->i_size);
if (rreq->origin == NETFS_DIO_READ &&
rreq->start + rreq->submitted >= rreq->i_size)
break;
- if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index))
+ if (!netfs_rreq_submit_slice(rreq, &io_iter))
break;
if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 5e77618a7940..5f0f438e5d21 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -7,6 +7,7 @@
#include <linux/module.h>
#include <linux/export.h>
+#include <linux/mempool.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "internal.h"
@@ -23,6 +24,11 @@ unsigned netfs_debug;
module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
+static struct kmem_cache *netfs_request_slab;
+static struct kmem_cache *netfs_subrequest_slab;
+mempool_t netfs_request_pool;
+mempool_t netfs_subrequest_pool;
+
#ifdef CONFIG_PROC_FS
LIST_HEAD(netfs_io_requests);
DEFINE_SPINLOCK(netfs_proc_lock);
@@ -31,9 +37,9 @@ static const char *netfs_origins[nr__netfs_io_origin] = {
[NETFS_READAHEAD] = "RA",
[NETFS_READPAGE] = "RP",
[NETFS_READ_FOR_WRITE] = "RW",
+ [NETFS_COPY_TO_CACHE] = "CC",
[NETFS_WRITEBACK] = "WB",
[NETFS_WRITETHROUGH] = "WT",
- [NETFS_LAUNDER_WRITE] = "LW",
[NETFS_UNBUFFERED_WRITE] = "UW",
[NETFS_DIO_READ] = "DR",
[NETFS_DIO_WRITE] = "DW",
@@ -56,7 +62,7 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v)
rreq = list_entry(v, struct netfs_io_request, proc_link);
seq_printf(m,
- "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx",
+ "%08x %s %3d %2lx %4d %3d @%04llx %llx/%llx",
rreq->debug_id,
netfs_origins[rreq->origin],
refcount_read(&rreq->ref),
@@ -98,25 +104,54 @@ static int __init netfs_init(void)
{
int ret = -ENOMEM;
+ netfs_request_slab = kmem_cache_create("netfs_request",
+ sizeof(struct netfs_io_request), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
+ NULL);
+ if (!netfs_request_slab)
+ goto error_req;
+
+ if (mempool_init_slab_pool(&netfs_request_pool, 100, netfs_request_slab) < 0)
+ goto error_reqpool;
+
+ netfs_subrequest_slab = kmem_cache_create("netfs_subrequest",
+ sizeof(struct netfs_io_subrequest), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
+ NULL);
+ if (!netfs_subrequest_slab)
+ goto error_subreq;
+
+ if (mempool_init_slab_pool(&netfs_subrequest_pool, 100, netfs_subrequest_slab) < 0)
+ goto error_subreqpool;
+
if (!proc_mkdir("fs/netfs", NULL))
- goto error;
+ goto error_proc;
if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,
&netfs_requests_seq_ops))
- goto error_proc;
+ goto error_procfile;
#ifdef CONFIG_FSCACHE_STATS
if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
netfs_stats_show))
- goto error_proc;
+ goto error_procfile;
#endif
ret = fscache_init();
if (ret < 0)
- goto error_proc;
+ goto error_fscache;
return 0;
-error_proc:
+error_fscache:
+error_procfile:
remove_proc_entry("fs/netfs", NULL);
-error:
+error_proc:
+ mempool_exit(&netfs_subrequest_pool);
+error_subreqpool:
+ kmem_cache_destroy(netfs_subrequest_slab);
+error_subreq:
+ mempool_exit(&netfs_request_pool);
+error_reqpool:
+ kmem_cache_destroy(netfs_request_slab);
+error_req:
return ret;
}
fs_initcall(netfs_init);
@@ -125,5 +160,9 @@ static void __exit netfs_exit(void)
{
fscache_exit();
remove_proc_entry("fs/netfs", NULL);
+ mempool_exit(&netfs_subrequest_pool);
+ kmem_cache_destroy(netfs_subrequest_slab);
+ mempool_exit(&netfs_request_pool);
+ kmem_cache_destroy(netfs_request_slab);
}
module_exit(netfs_exit);
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 90051ced8e2a..bc1fc54fb724 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -177,13 +177,11 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback);
*/
void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
- struct netfs_folio *finfo = NULL;
+ struct netfs_folio *finfo;
size_t flen = folio_size(folio);
_enter("{%lx},%zx,%zx", folio->index, offset, length);
- folio_wait_fscache(folio);
-
if (!folio_test_private(folio))
return;
@@ -248,12 +246,6 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp)
if (folio_test_private(folio))
return false;
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
-
fscache_note_page_release(netfs_i_cookie(ctx));
return true;
}
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index 610ceb5bd86c..c90d482b1650 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -6,6 +6,8 @@
*/
#include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/delay.h>
#include "internal.h"
/*
@@ -20,17 +22,22 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
struct inode *inode = file ? file_inode(file) : mapping->host;
struct netfs_inode *ctx = netfs_inode(inode);
struct netfs_io_request *rreq;
+ mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool;
+ struct kmem_cache *cache = mempool->pool_data;
bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE ||
origin == NETFS_DIO_READ ||
origin == NETFS_DIO_WRITE);
bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx);
int ret;
- rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
- GFP_KERNEL);
- if (!rreq)
- return ERR_PTR(-ENOMEM);
+ for (;;) {
+ rreq = mempool_alloc(mempool, GFP_KERNEL);
+ if (rreq)
+ break;
+ msleep(10);
+ }
+ memset(rreq, 0, kmem_cache_size(cache));
rreq->start = start;
rreq->len = len;
rreq->upper_len = len;
@@ -40,19 +47,27 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
rreq->inode = inode;
rreq->i_size = i_size_read(inode);
rreq->debug_id = atomic_inc_return(&debug_ids);
+ rreq->wsize = INT_MAX;
+ spin_lock_init(&rreq->lock);
+ INIT_LIST_HEAD(&rreq->io_streams[0].subrequests);
+ INIT_LIST_HEAD(&rreq->io_streams[1].subrequests);
INIT_LIST_HEAD(&rreq->subrequests);
INIT_WORK(&rreq->work, NULL);
refcount_set(&rreq->ref, 1);
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
- if (cached)
+ if (cached) {
__set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+ if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags))
+ /* Filesystem uses deprecated PG_private_2 marking. */
+ __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
+ }
if (file && file->f_flags & O_NONBLOCK)
__set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
if (rreq->netfs_ops->init_request) {
ret = rreq->netfs_ops->init_request(rreq, file);
if (ret < 0) {
- kfree(rreq);
+ mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool);
return ERR_PTR(ret);
}
}
@@ -74,6 +89,8 @@ void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace
void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
{
struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream;
+ int s;
while (!list_empty(&rreq->subrequests)) {
subreq = list_first_entry(&rreq->subrequests,
@@ -82,6 +99,25 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
netfs_put_subrequest(subreq, was_async,
netfs_sreq_trace_put_clear);
}
+
+ for (s = 0; s < ARRAY_SIZE(rreq->io_streams); s++) {
+ stream = &rreq->io_streams[s];
+ while (!list_empty(&stream->subrequests)) {
+ subreq = list_first_entry(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ list_del(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, was_async,
+ netfs_sreq_trace_put_clear);
+ }
+ }
+}
+
+static void netfs_free_request_rcu(struct rcu_head *rcu)
+{
+ struct netfs_io_request *rreq = container_of(rcu, struct netfs_io_request, rcu);
+
+ mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool);
+ netfs_stat_d(&netfs_n_rh_rreq);
}
static void netfs_free_request(struct work_struct *work)
@@ -106,8 +142,7 @@ static void netfs_free_request(struct work_struct *work)
}
kvfree(rreq->direct_bv);
}
- kfree_rcu(rreq, rcu);
- netfs_stat_d(&netfs_n_rh_rreq);
+ call_rcu(&rreq->rcu, netfs_free_request_rcu);
}
void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
@@ -139,19 +174,25 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
-
- subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?:
- sizeof(struct netfs_io_subrequest),
- GFP_KERNEL);
- if (subreq) {
- INIT_WORK(&subreq->work, NULL);
- INIT_LIST_HEAD(&subreq->rreq_link);
- refcount_set(&subreq->ref, 2);
- subreq->rreq = rreq;
- netfs_get_request(rreq, netfs_rreq_trace_get_subreq);
- netfs_stat(&netfs_n_rh_sreq);
+ mempool_t *mempool = rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool;
+ struct kmem_cache *cache = mempool->pool_data;
+
+ for (;;) {
+ subreq = mempool_alloc(rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool,
+ GFP_KERNEL);
+ if (subreq)
+ break;
+ msleep(10);
}
+ memset(subreq, 0, kmem_cache_size(cache));
+ INIT_WORK(&subreq->work, NULL);
+ INIT_LIST_HEAD(&subreq->rreq_link);
+ refcount_set(&subreq->ref, 2);
+ subreq->rreq = rreq;
+ subreq->debug_index = atomic_inc_return(&rreq->subreq_counter);
+ netfs_get_request(rreq, netfs_rreq_trace_get_subreq);
+ netfs_stat(&netfs_n_rh_sreq);
return subreq;
}
@@ -173,7 +214,7 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
trace_netfs_sreq(subreq, netfs_sreq_trace_free);
if (rreq->netfs_ops->free_subrequest)
rreq->netfs_ops->free_subrequest(subreq);
- kfree(subreq);
+ mempool_free(subreq, rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool);
netfs_stat_d(&netfs_n_rh_sreq);
netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
}
diff --git a/fs/netfs/output.c b/fs/netfs/output.c
deleted file mode 100644
index 625eb68f3e5a..000000000000
--- a/fs/netfs/output.c
+++ /dev/null
@@ -1,478 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* Network filesystem high-level write support.
- *
- * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/writeback.h>
-#include <linux/pagevec.h>
-#include "internal.h"
-
-/**
- * netfs_create_write_request - Create a write operation.
- * @wreq: The write request this is storing from.
- * @dest: The destination type
- * @start: Start of the region this write will modify
- * @len: Length of the modification
- * @worker: The worker function to handle the write(s)
- *
- * Allocate a write operation, set it up and add it to the list on a write
- * request.
- */
-struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq,
- enum netfs_io_source dest,
- loff_t start, size_t len,
- work_func_t worker)
-{
- struct netfs_io_subrequest *subreq;
-
- subreq = netfs_alloc_subrequest(wreq);
- if (subreq) {
- INIT_WORK(&subreq->work, worker);
- subreq->source = dest;
- subreq->start = start;
- subreq->len = len;
- subreq->debug_index = wreq->subreq_counter++;
-
- switch (subreq->source) {
- case NETFS_UPLOAD_TO_SERVER:
- netfs_stat(&netfs_n_wh_upload);
- break;
- case NETFS_WRITE_TO_CACHE:
- netfs_stat(&netfs_n_wh_write);
- break;
- default:
- BUG();
- }
-
- subreq->io_iter = wreq->io_iter;
- iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start);
- iov_iter_truncate(&subreq->io_iter, subreq->len);
-
- trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
- refcount_read(&subreq->ref),
- netfs_sreq_trace_new);
- atomic_inc(&wreq->nr_outstanding);
- list_add_tail(&subreq->rreq_link, &wreq->subrequests);
- trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
- }
-
- return subreq;
-}
-EXPORT_SYMBOL(netfs_create_write_request);
-
-/*
- * Process a completed write request once all the component operations have
- * been completed.
- */
-static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async)
-{
- struct netfs_io_subrequest *subreq;
- struct netfs_inode *ctx = netfs_inode(wreq->inode);
- size_t transferred = 0;
-
- _enter("R=%x[]", wreq->debug_id);
-
- trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
-
- list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
- if (subreq->error || subreq->transferred == 0)
- break;
- transferred += subreq->transferred;
- if (subreq->transferred < subreq->len)
- break;
- }
- wreq->transferred = transferred;
-
- list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
- if (!subreq->error)
- continue;
- switch (subreq->source) {
- case NETFS_UPLOAD_TO_SERVER:
- /* Depending on the type of failure, this may prevent
- * writeback completion unless we're in disconnected
- * mode.
- */
- if (!wreq->error)
- wreq->error = subreq->error;
- break;
-
- case NETFS_WRITE_TO_CACHE:
- /* Failure doesn't prevent writeback completion unless
- * we're in disconnected mode.
- */
- if (subreq->error != -ENOBUFS)
- ctx->ops->invalidate_cache(wreq);
- break;
-
- default:
- WARN_ON_ONCE(1);
- if (!wreq->error)
- wreq->error = -EIO;
- return;
- }
- }
-
- wreq->cleanup(wreq);
-
- if (wreq->origin == NETFS_DIO_WRITE &&
- wreq->mapping->nrpages) {
- pgoff_t first = wreq->start >> PAGE_SHIFT;
- pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
- invalidate_inode_pages2_range(wreq->mapping, first, last);
- }
-
- if (wreq->origin == NETFS_DIO_WRITE)
- inode_dio_end(wreq->inode);
-
- _debug("finished");
- trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
- clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
- wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
-
- if (wreq->iocb) {
- wreq->iocb->ki_pos += transferred;
- if (wreq->iocb->ki_complete)
- wreq->iocb->ki_complete(
- wreq->iocb, wreq->error ? wreq->error : transferred);
- }
-
- netfs_clear_subrequests(wreq, was_async);
- netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete);
-}
-
-/*
- * Deal with the completion of writing the data to the cache.
- */
-void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_io_subrequest *subreq = _op;
- struct netfs_io_request *wreq = subreq->rreq;
- unsigned int u;
-
- _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
-
- switch (subreq->source) {
- case NETFS_UPLOAD_TO_SERVER:
- netfs_stat(&netfs_n_wh_upload_done);
- break;
- case NETFS_WRITE_TO_CACHE:
- netfs_stat(&netfs_n_wh_write_done);
- break;
- case NETFS_INVALID_WRITE:
- break;
- default:
- BUG();
- }
-
- if (IS_ERR_VALUE(transferred_or_error)) {
- subreq->error = transferred_or_error;
- trace_netfs_failure(wreq, subreq, transferred_or_error,
- netfs_fail_write);
- goto failed;
- }
-
- if (WARN(transferred_or_error > subreq->len - subreq->transferred,
- "Subreq excess write: R%x[%x] %zd > %zu - %zu",
- wreq->debug_id, subreq->debug_index,
- transferred_or_error, subreq->len, subreq->transferred))
- transferred_or_error = subreq->len - subreq->transferred;
-
- subreq->error = 0;
- subreq->transferred += transferred_or_error;
-
- if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
- pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n",
- wreq->debug_id, subreq->debug_index,
- iov_iter_count(&subreq->io_iter), subreq->len,
- subreq->transferred, subreq->io_iter.iter_type);
-
- if (subreq->transferred < subreq->len)
- goto incomplete;
-
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
-out:
- trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
-
- /* If we decrement nr_outstanding to 0, the ref belongs to us. */
- u = atomic_dec_return(&wreq->nr_outstanding);
- if (u == 0)
- netfs_write_terminated(wreq, was_async);
- else if (u == 1)
- wake_up_var(&wreq->nr_outstanding);
-
- netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
- return;
-
-incomplete:
- if (transferred_or_error == 0) {
- if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
- subreq->error = -ENODATA;
- goto failed;
- }
- } else {
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
- }
-
- __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
- goto out;
-
-failed:
- switch (subreq->source) {
- case NETFS_WRITE_TO_CACHE:
- netfs_stat(&netfs_n_wh_write_failed);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
- break;
- case NETFS_UPLOAD_TO_SERVER:
- netfs_stat(&netfs_n_wh_upload_failed);
- set_bit(NETFS_RREQ_FAILED, &wreq->flags);
- wreq->error = subreq->error;
- break;
- default:
- break;
- }
- goto out;
-}
-EXPORT_SYMBOL(netfs_write_subrequest_terminated);
-
-static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq)
-{
- struct netfs_io_request *wreq = subreq->rreq;
- struct netfs_cache_resources *cres = &wreq->cache_resources;
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
-
- cres->ops->write(cres, subreq->start, &subreq->io_iter,
- netfs_write_subrequest_terminated, subreq);
-}
-
-static void netfs_write_to_cache_op_worker(struct work_struct *work)
-{
- struct netfs_io_subrequest *subreq =
- container_of(work, struct netfs_io_subrequest, work);
-
- netfs_write_to_cache_op(subreq);
-}
-
-/**
- * netfs_queue_write_request - Queue a write request for attention
- * @subreq: The write request to be queued
- *
- * Queue the specified write request for processing by a worker thread. We
- * pass the caller's ref on the request to the worker thread.
- */
-void netfs_queue_write_request(struct netfs_io_subrequest *subreq)
-{
- if (!queue_work(system_unbound_wq, &subreq->work))
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip);
-}
-EXPORT_SYMBOL(netfs_queue_write_request);
-
-/*
- * Set up a op for writing to the cache.
- */
-static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq)
-{
- struct netfs_cache_resources *cres = &wreq->cache_resources;
- struct netfs_io_subrequest *subreq;
- struct netfs_inode *ctx = netfs_inode(wreq->inode);
- struct fscache_cookie *cookie = netfs_i_cookie(ctx);
- loff_t start = wreq->start;
- size_t len = wreq->len;
- int ret;
-
- if (!fscache_cookie_enabled(cookie)) {
- clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags);
- return;
- }
-
- _debug("write to cache");
- ret = fscache_begin_write_operation(cres, cookie);
- if (ret < 0)
- return;
-
- ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len,
- i_size_read(wreq->inode), true);
- if (ret < 0)
- return;
-
- subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len,
- netfs_write_to_cache_op_worker);
- if (!subreq)
- return;
-
- netfs_write_to_cache_op(subreq);
-}
-
-/*
- * Begin the process of writing out a chunk of data.
- *
- * We are given a write request that holds a series of dirty regions and
- * (partially) covers a sequence of folios, all of which are present. The
- * pages must have been marked as writeback as appropriate.
- *
- * We need to perform the following steps:
- *
- * (1) If encrypting, create an output buffer and encrypt each block of the
- * data into it, otherwise the output buffer will point to the original
- * folios.
- *
- * (2) If the data is to be cached, set up a write op for the entire output
- * buffer to the cache, if the cache wants to accept it.
- *
- * (3) If the data is to be uploaded (ie. not merely cached):
- *
- * (a) If the data is to be compressed, create a compression buffer and
- * compress the data into it.
- *
- * (b) For each destination we want to upload to, set up write ops to write
- * to that destination. We may need multiple writes if the data is not
- * contiguous or the span exceeds wsize for a server.
- */
-int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
- enum netfs_write_trace what)
-{
- struct netfs_inode *ctx = netfs_inode(wreq->inode);
-
- _enter("R=%x %llx-%llx f=%lx",
- wreq->debug_id, wreq->start, wreq->start + wreq->len - 1,
- wreq->flags);
-
- trace_netfs_write(wreq, what);
- if (wreq->len == 0 || wreq->iter.count == 0) {
- pr_err("Zero-sized write [R=%x]\n", wreq->debug_id);
- return -EIO;
- }
-
- if (wreq->origin == NETFS_DIO_WRITE)
- inode_dio_begin(wreq->inode);
-
- wreq->io_iter = wreq->iter;
-
- /* ->outstanding > 0 carries a ref */
- netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
- atomic_set(&wreq->nr_outstanding, 1);
-
- /* Start the encryption/compression going. We can do that in the
- * background whilst we generate a list of write ops that we want to
- * perform.
- */
- // TODO: Encrypt or compress the region as appropriate
-
- /* We need to write all of the region to the cache */
- if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
- netfs_set_up_write_to_cache(wreq);
-
- /* However, we don't necessarily write all of the region to the server.
- * Caching of reads is being managed this way also.
- */
- if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
- ctx->ops->create_write_requests(wreq, wreq->start, wreq->len);
-
- if (atomic_dec_and_test(&wreq->nr_outstanding))
- netfs_write_terminated(wreq, false);
-
- if (!may_wait)
- return -EIOCBQUEUED;
-
- wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
- return wreq->error;
-}
-
-/*
- * Begin a write operation for writing through the pagecache.
- */
-struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
-{
- struct netfs_io_request *wreq;
- struct file *file = iocb->ki_filp;
-
- wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len,
- NETFS_WRITETHROUGH);
- if (IS_ERR(wreq))
- return wreq;
-
- trace_netfs_write(wreq, netfs_write_trace_writethrough);
-
- __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
- iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0);
- wreq->io_iter = wreq->iter;
-
- /* ->outstanding > 0 carries a ref */
- netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
- atomic_set(&wreq->nr_outstanding, 1);
- return wreq;
-}
-
-static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final)
-{
- struct netfs_inode *ictx = netfs_inode(wreq->inode);
- unsigned long long start;
- size_t len;
-
- if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
- return;
-
- start = wreq->start + wreq->submitted;
- len = wreq->iter.count - wreq->submitted;
- if (!final) {
- len /= wreq->wsize; /* Round to number of maximum packets */
- len *= wreq->wsize;
- }
-
- ictx->ops->create_write_requests(wreq, start, len);
- wreq->submitted += len;
-}
-
-/*
- * Advance the state of the write operation used when writing through the
- * pagecache. Data has been copied into the pagecache that we need to append
- * to the request. If we've added more than wsize then we need to create a new
- * subrequest.
- */
-int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end)
-{
- _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u",
- wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end);
-
- wreq->iter.count += copied;
- wreq->io_iter.count += copied;
- if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize)
- netfs_submit_writethrough(wreq, false);
-
- return wreq->error;
-}
-
-/*
- * End a write operation used when writing through the pagecache.
- */
-int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb)
-{
- int ret = -EIOCBQUEUED;
-
- _enter("ic=%zu sb=%zu ws=%u",
- wreq->iter.count, wreq->submitted, wreq->wsize);
-
- if (wreq->submitted < wreq->io_iter.count)
- netfs_submit_writethrough(wreq, true);
-
- if (atomic_dec_and_test(&wreq->nr_outstanding))
- netfs_write_terminated(wreq, false);
-
- if (is_sync_kiocb(iocb)) {
- wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
- ret = wreq->error;
- }
-
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
- return ret;
-}
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index deeba9f9dcf5..0892768eea32 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -10,9 +10,8 @@
#include "internal.h"
atomic_t netfs_n_rh_dio_read;
-atomic_t netfs_n_rh_dio_write;
atomic_t netfs_n_rh_readahead;
-atomic_t netfs_n_rh_readpage;
+atomic_t netfs_n_rh_read_folio;
atomic_t netfs_n_rh_rreq;
atomic_t netfs_n_rh_sreq;
atomic_t netfs_n_rh_download;
@@ -29,6 +28,10 @@ atomic_t netfs_n_rh_write_begin;
atomic_t netfs_n_rh_write_done;
atomic_t netfs_n_rh_write_failed;
atomic_t netfs_n_rh_write_zskip;
+atomic_t netfs_n_wh_buffered_write;
+atomic_t netfs_n_wh_writethrough;
+atomic_t netfs_n_wh_dio_write;
+atomic_t netfs_n_wh_writepages;
atomic_t netfs_n_wh_wstream_conflict;
atomic_t netfs_n_wh_upload;
atomic_t netfs_n_wh_upload_done;
@@ -39,13 +42,17 @@ atomic_t netfs_n_wh_write_failed;
int netfs_stats_show(struct seq_file *m, void *v)
{
- seq_printf(m, "Netfs : DR=%u DW=%u RA=%u RP=%u WB=%u WBZ=%u\n",
+ seq_printf(m, "Netfs : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n",
atomic_read(&netfs_n_rh_dio_read),
- atomic_read(&netfs_n_rh_dio_write),
atomic_read(&netfs_n_rh_readahead),
- atomic_read(&netfs_n_rh_readpage),
+ atomic_read(&netfs_n_rh_read_folio),
atomic_read(&netfs_n_rh_write_begin),
atomic_read(&netfs_n_rh_write_zskip));
+ seq_printf(m, "Netfs : BW=%u WT=%u DW=%u WP=%u\n",
+ atomic_read(&netfs_n_wh_buffered_write),
+ atomic_read(&netfs_n_wh_writethrough),
+ atomic_read(&netfs_n_wh_dio_write),
+ atomic_read(&netfs_n_wh_writepages));
seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n",
atomic_read(&netfs_n_rh_zero),
atomic_read(&netfs_n_rh_short_read),
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
new file mode 100644
index 000000000000..60112e4b2c5e
--- /dev/null
+++ b/fs/netfs/write_collect.c
@@ -0,0 +1,808 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem write subrequest result collection, assessment
+ * and retrying.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+/* Notes made in the collector */
+#define HIT_PENDING 0x01 /* A front op was still pending */
+#define SOME_EMPTY 0x02 /* One of more streams are empty */
+#define ALL_EMPTY 0x04 /* All streams are empty */
+#define MAYBE_DISCONTIG 0x08 /* A front op may be discontiguous (rounded to PAGE_SIZE) */
+#define NEED_REASSESS 0x10 /* Need to loop round and reassess */
+#define REASSESS_DISCONTIG 0x20 /* Reassess discontiguity if contiguity advances */
+#define MADE_PROGRESS 0x40 /* Made progress cleaning up a stream or the folio set */
+#define BUFFERED 0x80 /* The pagecache needs cleaning up */
+#define NEED_RETRY 0x100 /* A front op requests retrying */
+#define SAW_FAILURE 0x200 /* One stream or hit a permanent failure */
+
+/*
+ * Successful completion of write of a folio to the server and/or cache. Note
+ * that we are not allowed to lock the folio here on pain of deadlocking with
+ * truncate.
+ */
+int netfs_folio_written_back(struct folio *folio)
+{
+ enum netfs_folio_trace why = netfs_folio_trace_clear;
+ struct netfs_folio *finfo;
+ struct netfs_group *group = NULL;
+ int gcount = 0;
+
+ if ((finfo = netfs_folio_info(folio))) {
+ /* Streaming writes cannot be redirtied whilst under writeback,
+ * so discard the streaming record.
+ */
+ folio_detach_private(folio);
+ group = finfo->netfs_group;
+ gcount++;
+ kfree(finfo);
+ why = netfs_folio_trace_clear_s;
+ goto end_wb;
+ }
+
+ if ((group = netfs_folio_group(folio))) {
+ if (group == NETFS_FOLIO_COPY_TO_CACHE) {
+ why = netfs_folio_trace_clear_cc;
+ folio_detach_private(folio);
+ goto end_wb;
+ }
+
+ /* Need to detach the group pointer if the page didn't get
+ * redirtied. If it has been redirtied, then it must be within
+ * the same group.
+ */
+ why = netfs_folio_trace_redirtied;
+ if (!folio_test_dirty(folio)) {
+ folio_detach_private(folio);
+ gcount++;
+ why = netfs_folio_trace_clear_g;
+ }
+ }
+
+end_wb:
+ trace_netfs_folio(folio, why);
+ folio_end_writeback(folio);
+ return gcount;
+}
+
+/*
+ * Get hold of a folio we have under writeback. We don't want to get the
+ * refcount on it.
+ */
+static struct folio *netfs_writeback_lookup_folio(struct netfs_io_request *wreq, loff_t pos)
+{
+ XA_STATE(xas, &wreq->mapping->i_pages, pos / PAGE_SIZE);
+ struct folio *folio;
+
+ rcu_read_lock();
+
+ for (;;) {
+ xas_reset(&xas);
+ folio = xas_load(&xas);
+ if (xas_retry(&xas, folio))
+ continue;
+
+ if (!folio || xa_is_value(folio))
+ kdebug("R=%08x: folio %lx (%llx) not present",
+ wreq->debug_id, xas.xa_index, pos / PAGE_SIZE);
+ BUG_ON(!folio || xa_is_value(folio));
+
+ if (folio == xas_reload(&xas))
+ break;
+ }
+
+ rcu_read_unlock();
+
+ if (WARN_ONCE(!folio_test_writeback(folio),
+ "R=%08x: folio %lx is not under writeback\n",
+ wreq->debug_id, folio->index)) {
+ trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);
+ }
+ return folio;
+}
+
+/*
+ * Unlock any folios we've finished with.
+ */
+static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
+ unsigned long long collected_to,
+ unsigned int *notes)
+{
+ for (;;) {
+ struct folio *folio;
+ struct netfs_folio *finfo;
+ unsigned long long fpos, fend;
+ size_t fsize, flen;
+
+ folio = netfs_writeback_lookup_folio(wreq, wreq->cleaned_to);
+
+ fpos = folio_pos(folio);
+ fsize = folio_size(folio);
+ finfo = netfs_folio_info(folio);
+ flen = finfo ? finfo->dirty_offset + finfo->dirty_len : fsize;
+
+ fend = min_t(unsigned long long, fpos + flen, wreq->i_size);
+
+ trace_netfs_collect_folio(wreq, folio, fend, collected_to);
+
+ if (fpos + fsize > wreq->contiguity) {
+ trace_netfs_collect_contig(wreq, fpos + fsize,
+ netfs_contig_trace_unlock);
+ wreq->contiguity = fpos + fsize;
+ }
+
+ /* Unlock any folio we've transferred all of. */
+ if (collected_to < fend)
+ break;
+
+ wreq->nr_group_rel += netfs_folio_written_back(folio);
+ wreq->cleaned_to = fpos + fsize;
+ *notes |= MADE_PROGRESS;
+
+ if (fpos + fsize >= collected_to)
+ break;
+ }
+}
+
+/*
+ * Perform retries on the streams that need it.
+ */
+static void netfs_retry_write_stream(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream)
+{
+ struct list_head *next;
+
+ _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr);
+
+ if (list_empty(&stream->subrequests))
+ return;
+
+ if (stream->source == NETFS_UPLOAD_TO_SERVER &&
+ wreq->netfs_ops->retry_request)
+ wreq->netfs_ops->retry_request(wreq, stream);
+
+ if (unlikely(stream->failed))
+ return;
+
+ /* If there's no renegotiation to do, just resend each failed subreq. */
+ if (!stream->prepare_write) {
+ struct netfs_io_subrequest *subreq;
+
+ list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+ if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
+ break;
+ if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
+ __set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ netfs_reissue_write(stream, subreq);
+ }
+ }
+ return;
+ }
+
+ next = stream->subrequests.next;
+
+ do {
+ struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp;
+ unsigned long long start, len;
+ size_t part;
+ bool boundary = false;
+
+ /* Go through the stream and find the next span of contiguous
+ * data that we then rejig (cifs, for example, needs the wsize
+ * renegotiating) and reissue.
+ */
+ from = list_entry(next, struct netfs_io_subrequest, rreq_link);
+ to = from;
+ start = from->start + from->transferred;
+ len = from->len - from->transferred;
+
+ if (test_bit(NETFS_SREQ_FAILED, &from->flags) ||
+ !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags))
+ return;
+
+ list_for_each_continue(next, &stream->subrequests) {
+ subreq = list_entry(next, struct netfs_io_subrequest, rreq_link);
+ if (subreq->start + subreq->transferred != start + len ||
+ test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) ||
+ !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))
+ break;
+ to = subreq;
+ len += to->len;
+ }
+
+ /* Work through the sublist. */
+ subreq = from;
+ list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
+ if (!len)
+ break;
+ /* Renegotiate max_len (wsize) */
+ trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+ __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ __set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+ stream->prepare_write(subreq);
+
+ part = min(len, subreq->max_len);
+ subreq->len = part;
+ subreq->start = start;
+ subreq->transferred = 0;
+ len -= part;
+ start += part;
+ if (len && subreq == to &&
+ __test_and_clear_bit(NETFS_SREQ_BOUNDARY, &to->flags))
+ boundary = true;
+
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ netfs_reissue_write(stream, subreq);
+ if (subreq == to)
+ break;
+ }
+
+ /* If we managed to use fewer subreqs, we can discard the
+ * excess; if we used the same number, then we're done.
+ */
+ if (!len) {
+ if (subreq == to)
+ continue;
+ list_for_each_entry_safe_from(subreq, tmp,
+ &stream->subrequests, rreq_link) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
+ list_del(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
+ if (subreq == to)
+ break;
+ }
+ continue;
+ }
+
+ /* We ran out of subrequests, so we need to allocate some more
+ * and insert them after.
+ */
+ do {
+ subreq = netfs_alloc_subrequest(wreq);
+ subreq->source = to->source;
+ subreq->start = start;
+ subreq->max_len = len;
+ subreq->max_nr_segs = INT_MAX;
+ subreq->debug_index = atomic_inc_return(&wreq->subreq_counter);
+ subreq->stream_nr = to->stream_nr;
+ __set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+
+ trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+ refcount_read(&subreq->ref),
+ netfs_sreq_trace_new);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+
+ list_add(&subreq->rreq_link, &to->rreq_link);
+ to = list_next_entry(to, rreq_link);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+
+ switch (stream->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload);
+ subreq->max_len = min(len, wreq->wsize);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ stream->prepare_write(subreq);
+
+ part = min(len, subreq->max_len);
+ subreq->len = subreq->transferred + part;
+ len -= part;
+ start += part;
+ if (!len && boundary) {
+ __set_bit(NETFS_SREQ_BOUNDARY, &to->flags);
+ boundary = false;
+ }
+
+ netfs_reissue_write(stream, subreq);
+ if (!len)
+ break;
+
+ } while (len);
+
+ } while (!list_is_head(next, &stream->subrequests));
+}
+
+/*
+ * Perform retries on the streams that need it. If we're doing content
+ * encryption and the server copy changed due to a third-party write, we may
+ * need to do an RMW cycle and also rewrite the data to the cache.
+ */
+static void netfs_retry_writes(struct netfs_io_request *wreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream;
+ int s;
+
+ /* Wait for all outstanding I/O to quiesce before performing retries as
+ * we may need to renegotiate the I/O sizes.
+ */
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (!stream->active)
+ continue;
+
+ list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+ wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+ }
+ }
+
+ // TODO: Enc: Fetch changed partial pages
+ // TODO: Enc: Reencrypt content if needed.
+ // TODO: Enc: Wind back transferred point.
+ // TODO: Enc: Mark cache pages for retry.
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->need_retry) {
+ stream->need_retry = false;
+ netfs_retry_write_stream(wreq, stream);
+ }
+ }
+}
+
+/*
+ * Collect and assess the results of various write subrequests. We may need to
+ * retry some of the results - or even do an RMW cycle for content crypto.
+ *
+ * Note that we have a number of parallel, overlapping lists of subrequests,
+ * one to the server and one to the local cache for example, which may not be
+ * the same size or starting position and may not even correspond in boundary
+ * alignment.
+ */
+static void netfs_collect_write_results(struct netfs_io_request *wreq)
+{
+ struct netfs_io_subrequest *front, *remove;
+ struct netfs_io_stream *stream;
+ unsigned long long collected_to;
+ unsigned int notes;
+ int s;
+
+ _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
+ trace_netfs_collect(wreq);
+ trace_netfs_rreq(wreq, netfs_rreq_trace_collect);
+
+reassess_streams:
+ smp_rmb();
+ collected_to = ULLONG_MAX;
+ if (wreq->origin == NETFS_WRITEBACK)
+ notes = ALL_EMPTY | BUFFERED | MAYBE_DISCONTIG;
+ else if (wreq->origin == NETFS_WRITETHROUGH)
+ notes = ALL_EMPTY | BUFFERED;
+ else
+ notes = ALL_EMPTY;
+
+ /* Remove completed subrequests from the front of the streams and
+ * advance the completion point on each stream. We stop when we hit
+ * something that's in progress. The issuer thread may be adding stuff
+ * to the tail whilst we're doing this.
+ *
+ * We must not, however, merge in discontiguities that span whole
+ * folios that aren't under writeback. This is made more complicated
+ * by the folios in the gap being of unpredictable sizes - if they even
+ * exist - but we don't want to look them up.
+ */
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ loff_t rstart, rend;
+
+ stream = &wreq->io_streams[s];
+ /* Read active flag before list pointers */
+ if (!smp_load_acquire(&stream->active))
+ continue;
+
+ front = stream->front;
+ while (front) {
+ trace_netfs_collect_sreq(wreq, front);
+ //_debug("sreq [%x] %llx %zx/%zx",
+ // front->debug_index, front->start, front->transferred, front->len);
+
+ /* Stall if there may be a discontinuity. */
+ rstart = round_down(front->start, PAGE_SIZE);
+ if (rstart > wreq->contiguity) {
+ if (wreq->contiguity > stream->collected_to) {
+ trace_netfs_collect_gap(wreq, stream,
+ wreq->contiguity, 'D');
+ stream->collected_to = wreq->contiguity;
+ }
+ notes |= REASSESS_DISCONTIG;
+ break;
+ }
+ rend = round_up(front->start + front->len, PAGE_SIZE);
+ if (rend > wreq->contiguity) {
+ trace_netfs_collect_contig(wreq, rend,
+ netfs_contig_trace_collect);
+ wreq->contiguity = rend;
+ if (notes & REASSESS_DISCONTIG)
+ notes |= NEED_REASSESS;
+ }
+ notes &= ~MAYBE_DISCONTIG;
+
+ /* Stall if the front is still undergoing I/O. */
+ if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) {
+ notes |= HIT_PENDING;
+ break;
+ }
+ smp_rmb(); /* Read counters after I-P flag. */
+
+ if (stream->failed) {
+ stream->collected_to = front->start + front->len;
+ notes |= MADE_PROGRESS | SAW_FAILURE;
+ goto cancel;
+ }
+ if (front->start + front->transferred > stream->collected_to) {
+ stream->collected_to = front->start + front->transferred;
+ stream->transferred = stream->collected_to - wreq->start;
+ notes |= MADE_PROGRESS;
+ }
+ if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
+ stream->failed = true;
+ stream->error = front->error;
+ if (stream->source == NETFS_UPLOAD_TO_SERVER)
+ mapping_set_error(wreq->mapping, front->error);
+ notes |= NEED_REASSESS | SAW_FAILURE;
+ break;
+ }
+ if (front->transferred < front->len) {
+ stream->need_retry = true;
+ notes |= NEED_RETRY | MADE_PROGRESS;
+ break;
+ }
+
+ cancel:
+ /* Remove if completely consumed. */
+ spin_lock(&wreq->lock);
+
+ remove = front;
+ list_del_init(&front->rreq_link);
+ front = list_first_entry_or_null(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ stream->front = front;
+ if (!front) {
+ unsigned long long jump_to = atomic64_read(&wreq->issued_to);
+
+ if (stream->collected_to < jump_to) {
+ trace_netfs_collect_gap(wreq, stream, jump_to, 'A');
+ stream->collected_to = jump_to;
+ }
+ }
+
+ spin_unlock(&wreq->lock);
+ netfs_put_subrequest(remove, false,
+ notes & SAW_FAILURE ?
+ netfs_sreq_trace_put_cancel :
+ netfs_sreq_trace_put_done);
+ }
+
+ if (front)
+ notes &= ~ALL_EMPTY;
+ else
+ notes |= SOME_EMPTY;
+
+ if (stream->collected_to < collected_to)
+ collected_to = stream->collected_to;
+ }
+
+ if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to)
+ wreq->collected_to = collected_to;
+
+ /* If we have an empty stream, we need to jump it forward over any gap
+ * otherwise the collection point will never advance.
+ *
+ * Note that the issuer always adds to the stream with the lowest
+ * so-far submitted start, so if we see two consecutive subreqs in one
+ * stream with nothing between then in another stream, then the second
+ * stream has a gap that can be jumped.
+ */
+ if (notes & SOME_EMPTY) {
+ unsigned long long jump_to = wreq->start + wreq->len;
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->active &&
+ stream->front &&
+ stream->front->start < jump_to)
+ jump_to = stream->front->start;
+ }
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->active &&
+ !stream->front &&
+ stream->collected_to < jump_to) {
+ trace_netfs_collect_gap(wreq, stream, jump_to, 'B');
+ stream->collected_to = jump_to;
+ }
+ }
+ }
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->active)
+ trace_netfs_collect_stream(wreq, stream);
+ }
+
+ trace_netfs_collect_state(wreq, wreq->collected_to, notes);
+
+ /* Unlock any folios that we have now finished with. */
+ if (notes & BUFFERED) {
+ unsigned long long clean_to = min(wreq->collected_to, wreq->contiguity);
+
+ if (wreq->cleaned_to < clean_to)
+ netfs_writeback_unlock_folios(wreq, clean_to, &notes);
+ } else {
+ wreq->cleaned_to = wreq->collected_to;
+ }
+
+ // TODO: Discard encryption buffers
+
+ /* If all streams are discontiguous with the last folio we cleared, we
+ * may need to skip a set of folios.
+ */
+ if ((notes & (MAYBE_DISCONTIG | ALL_EMPTY)) == MAYBE_DISCONTIG) {
+ unsigned long long jump_to = ULLONG_MAX;
+
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->active && stream->front &&
+ stream->front->start < jump_to)
+ jump_to = stream->front->start;
+ }
+
+ trace_netfs_collect_contig(wreq, jump_to, netfs_contig_trace_jump);
+ wreq->contiguity = jump_to;
+ wreq->cleaned_to = jump_to;
+ wreq->collected_to = jump_to;
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->collected_to < jump_to)
+ stream->collected_to = jump_to;
+ }
+ //cond_resched();
+ notes |= MADE_PROGRESS;
+ goto reassess_streams;
+ }
+
+ if (notes & NEED_RETRY)
+ goto need_retry;
+ if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_unpause);
+ clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags);
+ wake_up_bit(&wreq->flags, NETFS_RREQ_PAUSE);
+ }
+
+ if (notes & NEED_REASSESS) {
+ //cond_resched();
+ goto reassess_streams;
+ }
+ if (notes & MADE_PROGRESS) {
+ //cond_resched();
+ goto reassess_streams;
+ }
+
+out:
+ netfs_put_group_many(wreq->group, wreq->nr_group_rel);
+ wreq->nr_group_rel = 0;
+ _leave(" = %x", notes);
+ return;
+
+need_retry:
+ /* Okay... We're going to have to retry one or both streams. Note
+ * that any partially completed op will have had any wholly transferred
+ * folios removed from it.
+ */
+ _debug("retry");
+ netfs_retry_writes(wreq);
+ goto out;
+}
+
+/*
+ * Perform the collection of subrequests, folios and encryption buffers.
+ */
+void netfs_write_collection_worker(struct work_struct *work)
+{
+ struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);
+ struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ size_t transferred;
+ int s;
+
+ _enter("R=%x", wreq->debug_id);
+
+ netfs_see_request(wreq, netfs_rreq_trace_see_work);
+ if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) {
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+ return;
+ }
+
+ netfs_collect_write_results(wreq);
+
+ /* We're done when the app thread has finished posting subreqs and all
+ * the queues in all the streams are empty.
+ */
+ if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) {
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+ return;
+ }
+ smp_rmb(); /* Read ALL_QUEUED before lists. */
+
+ transferred = LONG_MAX;
+ for (s = 0; s < NR_IO_STREAMS; s++) {
+ struct netfs_io_stream *stream = &wreq->io_streams[s];
+ if (!stream->active)
+ continue;
+ if (!list_empty(&stream->subrequests)) {
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+ return;
+ }
+ if (stream->transferred < transferred)
+ transferred = stream->transferred;
+ }
+
+ /* Okay, declare that all I/O is complete. */
+ wreq->transferred = transferred;
+ trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+ if (wreq->io_streams[1].active &&
+ wreq->io_streams[1].failed) {
+ /* Cache write failure doesn't prevent writeback completion
+ * unless we're in disconnected mode.
+ */
+ ictx->ops->invalidate_cache(wreq);
+ }
+
+ if (wreq->cleanup)
+ wreq->cleanup(wreq);
+
+ if (wreq->origin == NETFS_DIO_WRITE &&
+ wreq->mapping->nrpages) {
+ /* mmap may have got underfoot and we may now have folios
+ * locally covering the region we just wrote. Attempt to
+ * discard the folios, but leave in place any modified locally.
+ * ->write_iter() is prevented from interfering by the DIO
+ * counter.
+ */
+ pgoff_t first = wreq->start >> PAGE_SHIFT;
+ pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+ invalidate_inode_pages2_range(wreq->mapping, first, last);
+ }
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_end(wreq->inode);
+
+ _debug("finished");
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
+ clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+ wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+ if (wreq->iocb) {
+ wreq->iocb->ki_pos += wreq->transferred;
+ if (wreq->iocb->ki_complete)
+ wreq->iocb->ki_complete(
+ wreq->iocb, wreq->error ? wreq->error : wreq->transferred);
+ wreq->iocb = VFS_PTR_POISON;
+ }
+
+ netfs_clear_subrequests(wreq, false);
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_work_complete);
+}
+
+/*
+ * Wake the collection work item.
+ */
+void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
+{
+ if (!work_pending(&wreq->work)) {
+ netfs_get_request(wreq, netfs_rreq_trace_get_work);
+ if (!queue_work(system_unbound_wq, &wreq->work))
+ netfs_put_request(wreq, was_async, netfs_rreq_trace_put_work_nq);
+ }
+}
+
+/**
+ * netfs_write_subrequest_terminated - Note the termination of a write operation.
+ * @_op: The I/O request that has terminated.
+ * @transferred_or_error: The amount of data transferred or an error code.
+ * @was_async: The termination was asynchronous
+ *
+ * This tells the library that a contributory write I/O operation has
+ * terminated, one way or another, and that it should collect the results.
+ *
+ * The caller indicates in @transferred_or_error the outcome of the operation,
+ * supplying a positive value to indicate the number of bytes transferred or a
+ * negative error code. The library will look after reissuing I/O operations
+ * as appropriate and writing downloaded data to the cache.
+ *
+ * If @was_async is true, the caller might be running in softirq or interrupt
+ * context and we can't sleep.
+ *
+ * When this is called, ownership of the subrequest is transferred back to the
+ * library, along with a ref.
+ *
+ * Note that %_op is a void* so that the function can be passed to
+ * kiocb::term_func without the need for a casting wrapper.
+ */
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_io_subrequest *subreq = _op;
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];
+
+ _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
+
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_done);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_done);
+ break;
+ case NETFS_INVALID_WRITE:
+ break;
+ default:
+ BUG();
+ }
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ subreq->error = transferred_or_error;
+ if (subreq->error == -EAGAIN)
+ set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ else
+ set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ trace_netfs_failure(wreq, subreq, transferred_or_error, netfs_fail_write);
+
+ switch (subreq->source) {
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_failed);
+ break;
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_failed);
+ break;
+ default:
+ break;
+ }
+ trace_netfs_rreq(wreq, netfs_rreq_trace_set_pause);
+ set_bit(NETFS_RREQ_PAUSE, &wreq->flags);
+ } else {
+ if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+ "Subreq excess write: R=%x[%x] %zd > %zu - %zu",
+ wreq->debug_id, subreq->debug_index,
+ transferred_or_error, subreq->len, subreq->transferred))
+ transferred_or_error = subreq->len - subreq->transferred;
+
+ subreq->error = 0;
+ subreq->transferred += transferred_or_error;
+
+ if (subreq->transferred < subreq->len)
+ set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ }
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ wake_up_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS);
+
+ /* If we are at the head of the queue, wake up the collector,
+ * transferring a ref to it if we were the ones to do so.
+ */
+ if (list_is_first(&subreq->rreq_link, &stream->subrequests))
+ netfs_wake_write_collector(wreq, was_async);
+
+ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+}
+EXPORT_SYMBOL(netfs_write_subrequest_terminated);
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
new file mode 100644
index 000000000000..e190043bc0da
--- /dev/null
+++ b/fs/netfs/write_issue.c
@@ -0,0 +1,684 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level (buffered) writeback.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ *
+ * To support network filesystems with local caching, we manage a situation
+ * that can be envisioned like the following:
+ *
+ * +---+---+-----+-----+---+----------+
+ * Folios: | | | | | | |
+ * +---+---+-----+-----+---+----------+
+ *
+ * +------+------+ +----+----+
+ * Upload: | | |.....| | |
+ * (Stream 0) +------+------+ +----+----+
+ *
+ * +------+------+------+------+------+
+ * Cache: | | | | | |
+ * (Stream 1) +------+------+------+------+------+
+ *
+ * Where we have a sequence of folios of varying sizes that we need to overlay
+ * with multiple parallel streams of I/O requests, where the I/O requests in a
+ * stream may also be of various sizes (in cifs, for example, the sizes are
+ * negotiated with the server; in something like ceph, they may represent the
+ * sizes of storage objects).
+ *
+ * The sequence in each stream may contain gaps and noncontiguous subrequests
+ * may be glued together into single vectored write RPCs.
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include "internal.h"
+
+/*
+ * Kill all dirty folios in the event of an unrecoverable error, starting with
+ * a locked folio we've already obtained from writeback_iter().
+ */
+static void netfs_kill_dirty_pages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct folio *folio)
+{
+ int error = 0;
+
+ do {
+ enum netfs_folio_trace why = netfs_folio_trace_kill;
+ struct netfs_group *group = NULL;
+ struct netfs_folio *finfo = NULL;
+ void *priv;
+
+ priv = folio_detach_private(folio);
+ if (priv) {
+ finfo = __netfs_folio_info(priv);
+ if (finfo) {
+ /* Kill folio from streaming write. */
+ group = finfo->netfs_group;
+ why = netfs_folio_trace_kill_s;
+ } else {
+ group = priv;
+ if (group == NETFS_FOLIO_COPY_TO_CACHE) {
+ /* Kill copy-to-cache folio */
+ why = netfs_folio_trace_kill_cc;
+ group = NULL;
+ } else {
+ /* Kill folio with group */
+ why = netfs_folio_trace_kill_g;
+ }
+ }
+ }
+
+ trace_netfs_folio(folio, why);
+
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+ folio_end_writeback(folio);
+
+ netfs_put_group(group);
+ kfree(finfo);
+
+ } while ((folio = writeback_iter(mapping, wbc, folio, &error)));
+}
+
+/*
+ * Create a write request and set it up appropriately for the origin type.
+ */
+struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
+ struct file *file,
+ loff_t start,
+ enum netfs_io_origin origin)
+{
+ struct netfs_io_request *wreq;
+ struct netfs_inode *ictx;
+
+ wreq = netfs_alloc_request(mapping, file, start, 0, origin);
+ if (IS_ERR(wreq))
+ return wreq;
+
+ _enter("R=%x", wreq->debug_id);
+
+ ictx = netfs_inode(wreq->inode);
+ if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
+ fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));
+
+ wreq->contiguity = wreq->start;
+ wreq->cleaned_to = wreq->start;
+ INIT_WORK(&wreq->work, netfs_write_collection_worker);
+
+ wreq->io_streams[0].stream_nr = 0;
+ wreq->io_streams[0].source = NETFS_UPLOAD_TO_SERVER;
+ wreq->io_streams[0].prepare_write = ictx->ops->prepare_write;
+ wreq->io_streams[0].issue_write = ictx->ops->issue_write;
+ wreq->io_streams[0].collected_to = start;
+ wreq->io_streams[0].transferred = LONG_MAX;
+
+ wreq->io_streams[1].stream_nr = 1;
+ wreq->io_streams[1].source = NETFS_WRITE_TO_CACHE;
+ wreq->io_streams[1].collected_to = start;
+ wreq->io_streams[1].transferred = LONG_MAX;
+ if (fscache_resources_valid(&wreq->cache_resources)) {
+ wreq->io_streams[1].avail = true;
+ wreq->io_streams[1].prepare_write = wreq->cache_resources.ops->prepare_write_subreq;
+ wreq->io_streams[1].issue_write = wreq->cache_resources.ops->issue_write;
+ }
+
+ return wreq;
+}
+
+/**
+ * netfs_prepare_write_failed - Note write preparation failed
+ * @subreq: The subrequest to mark
+ *
+ * Mark a subrequest to note that preparation for write failed.
+ */
+void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq)
+{
+ __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prep_failed);
+}
+EXPORT_SYMBOL(netfs_prepare_write_failed);
+
+/*
+ * Prepare a write subrequest. We need to allocate a new subrequest
+ * if we don't have one.
+ */
+static void netfs_prepare_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream,
+ loff_t start)
+{
+ struct netfs_io_subrequest *subreq;
+
+ subreq = netfs_alloc_subrequest(wreq);
+ subreq->source = stream->source;
+ subreq->start = start;
+ subreq->max_len = ULONG_MAX;
+ subreq->max_nr_segs = INT_MAX;
+ subreq->stream_nr = stream->stream_nr;
+
+ _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
+
+ trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+ refcount_read(&subreq->ref),
+ netfs_sreq_trace_new);
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
+ switch (stream->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload);
+ subreq->max_len = wreq->wsize;
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ if (stream->prepare_write)
+ stream->prepare_write(subreq);
+
+ __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+
+ /* We add to the end of the list whilst the collector may be walking
+ * the list. The collector only goes nextwards and uses the lock to
+ * remove entries off of the front.
+ */
+ spin_lock(&wreq->lock);
+ list_add_tail(&subreq->rreq_link, &stream->subrequests);
+ if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
+ stream->front = subreq;
+ if (!stream->active) {
+ stream->collected_to = stream->front->start;
+ /* Write list pointers before active flag */
+ smp_store_release(&stream->active, true);
+ }
+ }
+
+ spin_unlock(&wreq->lock);
+
+ stream->construct = subreq;
+}
+
+/*
+ * Set the I/O iterator for the filesystem/cache to use and dispatch the I/O
+ * operation. The operation may be asynchronous and should call
+ * netfs_write_subrequest_terminated() when complete.
+ */
+static void netfs_do_issue_write(struct netfs_io_stream *stream,
+ struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+
+ _enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);
+
+ if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
+ return netfs_write_subrequest_terminated(subreq, subreq->error, false);
+
+ // TODO: Use encrypted buffer
+ if (test_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags)) {
+ subreq->io_iter = wreq->io_iter;
+ iov_iter_advance(&subreq->io_iter,
+ subreq->start + subreq->transferred - wreq->start);
+ iov_iter_truncate(&subreq->io_iter,
+ subreq->len - subreq->transferred);
+ } else {
+ iov_iter_xarray(&subreq->io_iter, ITER_SOURCE, &wreq->mapping->i_pages,
+ subreq->start + subreq->transferred,
+ subreq->len - subreq->transferred);
+ }
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ stream->issue_write(subreq);
+}
+
+void netfs_reissue_write(struct netfs_io_stream *stream,
+ struct netfs_io_subrequest *subreq)
+{
+ __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ netfs_do_issue_write(stream, subreq);
+}
+
+static void netfs_issue_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream)
+{
+ struct netfs_io_subrequest *subreq = stream->construct;
+
+ if (!subreq)
+ return;
+ stream->construct = NULL;
+
+ if (subreq->start + subreq->len > wreq->start + wreq->submitted)
+ wreq->len = wreq->submitted = subreq->start + subreq->len - wreq->start;
+ netfs_do_issue_write(stream, subreq);
+}
+
+/*
+ * Add data to the write subrequest, dispatching each as we fill it up or if it
+ * is discontiguous with the previous. We only fill one part at a time so that
+ * we can avoid overrunning the credits obtained (cifs) and try to parallelise
+ * content-crypto preparation with network writes.
+ */
+int netfs_advance_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream,
+ loff_t start, size_t len, bool to_eof)
+{
+ struct netfs_io_subrequest *subreq = stream->construct;
+ size_t part;
+
+ if (!stream->avail) {
+ _leave("no write");
+ return len;
+ }
+
+ _enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0);
+
+ if (subreq && start != subreq->start + subreq->len) {
+ netfs_issue_write(wreq, stream);
+ subreq = NULL;
+ }
+
+ if (!stream->construct)
+ netfs_prepare_write(wreq, stream, start);
+ subreq = stream->construct;
+
+ part = min(subreq->max_len - subreq->len, len);
+ _debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len);
+ subreq->len += part;
+ subreq->nr_segs++;
+
+ if (subreq->len >= subreq->max_len ||
+ subreq->nr_segs >= subreq->max_nr_segs ||
+ to_eof) {
+ netfs_issue_write(wreq, stream);
+ subreq = NULL;
+ }
+
+ return part;
+}
+
+/*
+ * Write some of a pending folio data back to the server.
+ */
+static int netfs_write_folio(struct netfs_io_request *wreq,
+ struct writeback_control *wbc,
+ struct folio *folio)
+{
+ struct netfs_io_stream *upload = &wreq->io_streams[0];
+ struct netfs_io_stream *cache = &wreq->io_streams[1];
+ struct netfs_io_stream *stream;
+ struct netfs_group *fgroup; /* TODO: Use this with ceph */
+ struct netfs_folio *finfo;
+ size_t fsize = folio_size(folio), flen = fsize, foff = 0;
+ loff_t fpos = folio_pos(folio), i_size;
+ bool to_eof = false, streamw = false;
+ bool debug = false;
+
+ _enter("");
+
+ /* netfs_perform_write() may shift i_size around the page or from out
+ * of the page to beyond it, but cannot move i_size into or through the
+ * page since we have it locked.
+ */
+ i_size = i_size_read(wreq->inode);
+
+ if (fpos >= i_size) {
+ /* mmap beyond eof. */
+ _debug("beyond eof");
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+ wreq->nr_group_rel += netfs_folio_written_back(folio);
+ netfs_put_group_many(wreq->group, wreq->nr_group_rel);
+ wreq->nr_group_rel = 0;
+ return 0;
+ }
+
+ if (fpos + fsize > wreq->i_size)
+ wreq->i_size = i_size;
+
+ fgroup = netfs_folio_group(folio);
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ foff = finfo->dirty_offset;
+ flen = foff + finfo->dirty_len;
+ streamw = true;
+ }
+
+ if (wreq->origin == NETFS_WRITETHROUGH) {
+ to_eof = false;
+ if (flen > i_size - fpos)
+ flen = i_size - fpos;
+ } else if (flen > i_size - fpos) {
+ flen = i_size - fpos;
+ if (!streamw)
+ folio_zero_segment(folio, flen, fsize);
+ to_eof = true;
+ } else if (flen == i_size - fpos) {
+ to_eof = true;
+ }
+ flen -= foff;
+
+ _debug("folio %zx %zx %zx", foff, flen, fsize);
+
+ /* Deal with discontinuities in the stream of dirty pages. These can
+ * arise from a number of sources:
+ *
+ * (1) Intervening non-dirty pages from random-access writes, multiple
+ * flushers writing back different parts simultaneously and manual
+ * syncing.
+ *
+ * (2) Partially-written pages from write-streaming.
+ *
+ * (3) Pages that belong to a different write-back group (eg. Ceph
+ * snapshots).
+ *
+ * (4) Actually-clean pages that were marked for write to the cache
+ * when they were read. Note that these appear as a special
+ * write-back group.
+ */
+ if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
+ netfs_issue_write(wreq, upload);
+ } else if (fgroup != wreq->group) {
+ /* We can't write this page to the server yet. */
+ kdebug("wrong group");
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
+ netfs_issue_write(wreq, upload);
+ netfs_issue_write(wreq, cache);
+ return 0;
+ }
+
+ if (foff > 0)
+ netfs_issue_write(wreq, upload);
+ if (streamw)
+ netfs_issue_write(wreq, cache);
+
+ /* Flip the page to the writeback state and unlock. If we're called
+ * from write-through, then the page has already been put into the wb
+ * state.
+ */
+ if (wreq->origin == NETFS_WRITEBACK)
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+
+ if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
+ if (!fscache_resources_valid(&wreq->cache_resources)) {
+ trace_netfs_folio(folio, netfs_folio_trace_cancel_copy);
+ netfs_issue_write(wreq, upload);
+ netfs_folio_written_back(folio);
+ return 0;
+ }
+ trace_netfs_folio(folio, netfs_folio_trace_store_copy);
+ } else if (!upload->construct) {
+ trace_netfs_folio(folio, netfs_folio_trace_store);
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_store_plus);
+ }
+
+ /* Move the submission point forward to allow for write-streaming data
+ * not starting at the front of the page. We don't do write-streaming
+ * with the cache as the cache requires DIO alignment.
+ *
+ * Also skip uploading for data that's been read and just needs copying
+ * to the cache.
+ */
+ for (int s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ stream->submit_max_len = fsize;
+ stream->submit_off = foff;
+ stream->submit_len = flen;
+ if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) ||
+ (stream->source == NETFS_UPLOAD_TO_SERVER &&
+ fgroup == NETFS_FOLIO_COPY_TO_CACHE)) {
+ stream->submit_off = UINT_MAX;
+ stream->submit_len = 0;
+ stream->submit_max_len = 0;
+ }
+ }
+
+ /* Attach the folio to one or more subrequests. For a big folio, we
+ * could end up with thousands of subrequests if the wsize is small -
+ * but we might need to wait during the creation of subrequests for
+ * network resources (eg. SMB credits).
+ */
+ for (;;) {
+ ssize_t part;
+ size_t lowest_off = ULONG_MAX;
+ int choose_s = -1;
+
+ /* Always add to the lowest-submitted stream first. */
+ for (int s = 0; s < NR_IO_STREAMS; s++) {
+ stream = &wreq->io_streams[s];
+ if (stream->submit_len > 0 &&
+ stream->submit_off < lowest_off) {
+ lowest_off = stream->submit_off;
+ choose_s = s;
+ }
+ }
+
+ if (choose_s < 0)
+ break;
+ stream = &wreq->io_streams[choose_s];
+
+ part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
+ stream->submit_len, to_eof);
+ atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
+ stream->submit_off += part;
+ stream->submit_max_len -= part;
+ if (part > stream->submit_len)
+ stream->submit_len = 0;
+ else
+ stream->submit_len -= part;
+ if (part > 0)
+ debug = true;
+ }
+
+ atomic64_set(&wreq->issued_to, fpos + fsize);
+
+ if (!debug)
+ kdebug("R=%x: No submit", wreq->debug_id);
+
+ if (flen < fsize)
+ for (int s = 0; s < NR_IO_STREAMS; s++)
+ netfs_issue_write(wreq, &wreq->io_streams[s]);
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * Write some of the pending data back to the server
+ */
+int netfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct netfs_inode *ictx = netfs_inode(mapping->host);
+ struct netfs_io_request *wreq = NULL;
+ struct folio *folio;
+ int error = 0;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ mutex_lock(&ictx->wb_lock);
+ else if (!mutex_trylock(&ictx->wb_lock))
+ return 0;
+
+ /* Need the first folio to be able to set up the op. */
+ folio = writeback_iter(mapping, wbc, NULL, &error);
+ if (!folio)
+ goto out;
+
+ wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK);
+ if (IS_ERR(wreq)) {
+ error = PTR_ERR(wreq);
+ goto couldnt_start;
+ }
+
+ trace_netfs_write(wreq, netfs_write_trace_writeback);
+ netfs_stat(&netfs_n_wh_writepages);
+
+ do {
+ _debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted);
+
+ /* It appears we don't have to handle cyclic writeback wrapping. */
+ WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted);
+
+ if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE &&
+ unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) {
+ set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ wreq->netfs_ops->begin_writeback(wreq);
+ }
+
+ error = netfs_write_folio(wreq, wbc, folio);
+ if (error < 0)
+ break;
+ } while ((folio = writeback_iter(mapping, wbc, folio, &error)));
+
+ for (int s = 0; s < NR_IO_STREAMS; s++)
+ netfs_issue_write(wreq, &wreq->io_streams[s]);
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+
+ mutex_unlock(&ictx->wb_lock);
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ _leave(" = %d", error);
+ return error;
+
+couldnt_start:
+ netfs_kill_dirty_pages(mapping, wbc, folio);
+out:
+ mutex_unlock(&ictx->wb_lock);
+ _leave(" = %d", error);
+ return error;
+}
+EXPORT_SYMBOL(netfs_writepages);
+
+/*
+ * Begin a write operation for writing through the pagecache.
+ */
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
+{
+ struct netfs_io_request *wreq = NULL;
+ struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp));
+
+ mutex_lock(&ictx->wb_lock);
+
+ wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp,
+ iocb->ki_pos, NETFS_WRITETHROUGH);
+ if (IS_ERR(wreq)) {
+ mutex_unlock(&ictx->wb_lock);
+ return wreq;
+ }
+
+ wreq->io_streams[0].avail = true;
+ trace_netfs_write(wreq, netfs_write_trace_writethrough);
+ return wreq;
+}
+
+/*
+ * Advance the state of the write operation used when writing through the
+ * pagecache. Data has been copied into the pagecache that we need to append
+ * to the request. If we've added more than wsize then we need to create a new
+ * subrequest.
+ */
+int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *folio, size_t copied, bool to_page_end,
+ struct folio **writethrough_cache)
+{
+ _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
+ wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end);
+
+ if (!*writethrough_cache) {
+ if (folio_test_dirty(folio))
+ /* Sigh. mmap. */
+ folio_clear_dirty_for_io(folio);
+
+ /* We can make multiple writes to the folio... */
+ folio_start_writeback(folio);
+ if (wreq->len == 0)
+ trace_netfs_folio(folio, netfs_folio_trace_wthru);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
+ *writethrough_cache = folio;
+ }
+
+ wreq->len += copied;
+ if (!to_page_end)
+ return 0;
+
+ *writethrough_cache = NULL;
+ return netfs_write_folio(wreq, wbc, folio);
+}
+
+/*
+ * End a write operation used when writing through the pagecache.
+ */
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *writethrough_cache)
+{
+ struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ int ret;
+
+ _enter("R=%x", wreq->debug_id);
+
+ if (writethrough_cache)
+ netfs_write_folio(wreq, wbc, writethrough_cache);
+
+ netfs_issue_write(wreq, &wreq->io_streams[0]);
+ netfs_issue_write(wreq, &wreq->io_streams[1]);
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+
+ mutex_unlock(&ictx->wb_lock);
+
+ ret = wreq->error;
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ return ret;
+}
+
+/*
+ * Write data to the server without going through the pagecache and without
+ * writing it to the local cache.
+ */
+int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len)
+{
+ struct netfs_io_stream *upload = &wreq->io_streams[0];
+ ssize_t part;
+ loff_t start = wreq->start;
+ int error = 0;
+
+ _enter("%zx", len);
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_begin(wreq->inode);
+
+ while (len) {
+ // TODO: Prepare content encryption
+
+ _debug("unbuffered %zx", len);
+ part = netfs_advance_write(wreq, upload, start, len, false);
+ start += part;
+ len -= part;
+ if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
+ wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE);
+ }
+ if (test_bit(NETFS_RREQ_FAILED, &wreq->flags))
+ break;
+ }
+
+ netfs_issue_write(wreq, upload);
+
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+ if (list_empty(&upload->subrequests))
+ netfs_wake_write_collector(wreq, false);
+
+ _leave(" = %d", error);
+ return error;
+}