summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2024-01-26 01:00:54 +0300
committerJakub Kicinski <kuba@kernel.org>2024-01-26 01:20:08 +0300
commit06f609b3119876b42f19fdb690b10896d3c648e6 (patch)
tree2f42b892152af80d299133ef6c902261a1fd0b98 /fs
parent91374ba537bd60caa9ae052c9f1c0fe055b39149 (diff)
parentecb1b8288dc7ccbdcb3b9df005fa1c0e0c0388a7 (diff)
downloadlinux-06f609b3119876b42f19fdb690b10896d3c648e6.tar.xz
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR. No conflicts or adjacent changes. Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs_vfs.h1
-rw-r--r--fs/9p/vfs_addr.c353
-rw-r--r--fs/9p/vfs_file.c89
-rw-r--r--fs/9p/vfs_inode.c16
-rw-r--r--fs/9p/vfs_inode_dotl.c8
-rw-r--r--fs/9p/vfs_super.c14
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/dir.c30
-rw-r--r--fs/afs/dynroot.c11
-rw-r--r--fs/afs/file.c213
-rw-r--r--fs/afs/inode.c28
-rw-r--r--fs/afs/internal.h72
-rw-r--r--fs/afs/proc.c5
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/afs/write.c826
-rw-r--r--fs/bcachefs/Makefile2
-rw-r--r--fs/bcachefs/alloc_background.c89
-rw-r--r--fs/bcachefs/alloc_background_format.h92
-rw-r--r--fs/bcachefs/alloc_foreground.c7
-rw-r--r--fs/bcachefs/backpointers.c100
-rw-r--r--fs/bcachefs/backpointers.h1
-rw-r--r--fs/bcachefs/bcachefs.h5
-rw-r--r--fs/bcachefs/bcachefs_format.h888
-rw-r--r--fs/bcachefs/bkey.c2
-rw-r--r--fs/bcachefs/bkey_methods.c9
-rw-r--r--fs/bcachefs/bkey_methods.h10
-rw-r--r--fs/bcachefs/bset.c7
-rw-r--r--fs/bcachefs/bset.h3
-rw-r--r--fs/bcachefs/btree_cache.c12
-rw-r--r--fs/bcachefs/btree_cache.h19
-rw-r--r--fs/bcachefs/btree_gc.c36
-rw-r--r--fs/bcachefs/btree_io.c38
-rw-r--r--fs/bcachefs/btree_iter.c2
-rw-r--r--fs/bcachefs/btree_iter.h5
-rw-r--r--fs/bcachefs/btree_locking.c40
-rw-r--r--fs/bcachefs/btree_locking.h9
-rw-r--r--fs/bcachefs/btree_trans_commit.c35
-rw-r--r--fs/bcachefs/btree_types.h12
-rw-r--r--fs/bcachefs/btree_update_interior.c8
-rw-r--r--fs/bcachefs/btree_update_interior.h42
-rw-r--r--fs/bcachefs/btree_write_buffer.c7
-rw-r--r--fs/bcachefs/buckets.c148
-rw-r--r--fs/bcachefs/buckets.h17
-rw-r--r--fs/bcachefs/buckets_types.h15
-rw-r--r--fs/bcachefs/clock.c4
-rw-r--r--fs/bcachefs/compress.h8
-rw-r--r--fs/bcachefs/data_update.c6
-rw-r--r--fs/bcachefs/debug.c16
-rw-r--r--fs/bcachefs/dirent_format.h42
-rw-r--r--fs/bcachefs/ec.c6
-rw-r--r--fs/bcachefs/ec_format.h19
-rw-r--r--fs/bcachefs/extents.c11
-rw-r--r--fs/bcachefs/extents.h2
-rw-r--r--fs/bcachefs/extents_format.h295
-rw-r--r--fs/bcachefs/eytzinger.h4
-rw-r--r--fs/bcachefs/fs-io-direct.c4
-rw-r--r--fs/bcachefs/fs-io-pagecache.c37
-rw-r--r--fs/bcachefs/fs-io-pagecache.h2
-rw-r--r--fs/bcachefs/fs-io.c7
-rw-r--r--fs/bcachefs/fs-ioctl.c11
-rw-r--r--fs/bcachefs/inode.c29
-rw-r--r--fs/bcachefs/inode_format.h166
-rw-r--r--fs/bcachefs/io_misc.c4
-rw-r--r--fs/bcachefs/io_write.c13
-rw-r--r--fs/bcachefs/journal.c111
-rw-r--r--fs/bcachefs/journal_io.c5
-rw-r--r--fs/bcachefs/logged_ops_format.h30
-rw-r--r--fs/bcachefs/move.c65
-rw-r--r--fs/bcachefs/opts.c4
-rw-r--r--fs/bcachefs/opts.h9
-rw-r--r--fs/bcachefs/quota_format.h47
-rw-r--r--fs/bcachefs/rebalance.c13
-rw-r--r--fs/bcachefs/recovery.c2
-rw-r--r--fs/bcachefs/reflink.c21
-rw-r--r--fs/bcachefs/reflink.h8
-rw-r--r--fs/bcachefs/reflink_format.h33
-rw-r--r--fs/bcachefs/replicas.c28
-rw-r--r--fs/bcachefs/sb-clean.c2
-rw-r--r--fs/bcachefs/sb-counters.c (renamed from fs/bcachefs/counters.c)2
-rw-r--r--fs/bcachefs/sb-counters.h (renamed from fs/bcachefs/counters.h)7
-rw-r--r--fs/bcachefs/sb-counters_format.h98
-rw-r--r--fs/bcachefs/sb-members.c4
-rw-r--r--fs/bcachefs/snapshot.c4
-rw-r--r--fs/bcachefs/snapshot_format.h36
-rw-r--r--fs/bcachefs/subvolume_format.h35
-rw-r--r--fs/bcachefs/super-io.c6
-rw-r--r--fs/bcachefs/super.c10
-rw-r--r--fs/bcachefs/sysfs.c15
-rw-r--r--fs/bcachefs/trace.h76
-rw-r--r--fs/bcachefs/util.c15
-rw-r--r--fs/bcachefs/util.h3
-rw-r--r--fs/bcachefs/xattr.c5
-rw-r--r--fs/bcachefs/xattr_format.h19
-rw-r--r--fs/btrfs/compression.c23
-rw-r--r--fs/btrfs/compression.h4
-rw-r--r--fs/btrfs/extent-tree.c53
-rw-r--r--fs/btrfs/inode.c22
-rw-r--r--fs/btrfs/ioctl.c7
-rw-r--r--fs/btrfs/lzo.c34
-rw-r--r--fs/btrfs/ref-verify.c6
-rw-r--r--fs/btrfs/scrub.c36
-rw-r--r--fs/btrfs/send.c4
-rw-r--r--fs/btrfs/subpage.c3
-rw-r--r--fs/btrfs/super.c8
-rw-r--r--fs/btrfs/tree-checker.c2
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/btrfs/zlib.c73
-rw-r--r--fs/btrfs/zoned.c8
-rw-r--r--fs/cachefiles/Kconfig2
-rw-r--r--fs/cachefiles/internal.h2
-rw-r--r--fs/cachefiles/io.c34
-rw-r--r--fs/cachefiles/ondemand.c5
-rw-r--r--fs/ceph/Kconfig1
-rw-r--r--fs/ceph/addr.c33
-rw-r--r--fs/ceph/cache.h45
-rw-r--r--fs/ceph/caps.c9
-rw-r--r--fs/ceph/dir.c21
-rw-r--r--fs/ceph/export.c2
-rw-r--r--fs/ceph/file.c8
-rw-r--r--fs/ceph/inode.c4
-rw-r--r--fs/ceph/mds_client.c35
-rw-r--r--fs/ceph/quota.c39
-rw-r--r--fs/ceph/super.h14
-rw-r--r--fs/erofs/Kconfig7
-rw-r--r--fs/erofs/decompressor.c2
-rw-r--r--fs/erofs/fscache.c6
-rw-r--r--fs/erofs/zmap.c23
-rw-r--r--fs/exec.c106
-rw-r--r--fs/fs-writeback.c10
-rw-r--r--fs/fscache/Kconfig40
-rw-r--r--fs/fscache/Makefile16
-rw-r--r--fs/fscache/internal.h277
-rw-r--r--fs/netfs/Kconfig39
-rw-r--r--fs/netfs/Makefile22
-rw-r--r--fs/netfs/buffered_read.c237
-rw-r--r--fs/netfs/buffered_write.c1254
-rw-r--r--fs/netfs/direct_read.c125
-rw-r--r--fs/netfs/direct_write.c171
-rw-r--r--fs/netfs/fscache_cache.c (renamed from fs/fscache/cache.c)3
-rw-r--r--fs/netfs/fscache_cookie.c (renamed from fs/fscache/cookie.c)0
-rw-r--r--fs/netfs/fscache_internal.h14
-rw-r--r--fs/netfs/fscache_io.c (renamed from fs/fscache/io.c)42
-rw-r--r--fs/netfs/fscache_main.c (renamed from fs/fscache/main.c)25
-rw-r--r--fs/netfs/fscache_proc.c (renamed from fs/fscache/proc.c)23
-rw-r--r--fs/netfs/fscache_stats.c (renamed from fs/fscache/stats.c)13
-rw-r--r--fs/netfs/fscache_volume.c (renamed from fs/fscache/volume.c)0
-rw-r--r--fs/netfs/internal.h284
-rw-r--r--fs/netfs/io.c215
-rw-r--r--fs/netfs/iterator.c97
-rw-r--r--fs/netfs/locking.c216
-rw-r--r--fs/netfs/main.c109
-rw-r--r--fs/netfs/misc.c260
-rw-r--r--fs/netfs/objects.c59
-rw-r--r--fs/netfs/output.c478
-rw-r--r--fs/netfs/stats.c42
-rw-r--r--fs/nfs/Kconfig4
-rw-r--r--fs/nfs/fscache.c7
-rw-r--r--fs/nfs/fscache.h2
-rw-r--r--fs/nfsd/nfs4state.c26
-rw-r--r--fs/overlayfs/namei.c43
-rw-r--r--fs/overlayfs/overlayfs.h23
-rw-r--r--fs/overlayfs/ovl_entry.h4
-rw-r--r--fs/overlayfs/readdir.c7
-rw-r--r--fs/overlayfs/super.c15
-rw-r--r--fs/overlayfs/util.c53
-rw-r--r--fs/smb/client/cached_dir.c2
-rw-r--r--fs/smb/client/cifs_debug.c6
-rw-r--r--fs/smb/client/cifsfs.c11
-rw-r--r--fs/smb/client/cifsglob.h4
-rw-r--r--fs/smb/client/connect.c4
-rw-r--r--fs/smb/client/file.c28
-rw-r--r--fs/smb/client/fs_context.c6
-rw-r--r--fs/smb/client/fs_context.h2
-rw-r--r--fs/smb/client/fscache.c2
-rw-r--r--fs/smb/client/inode.c29
-rw-r--r--fs/smb/client/misc.c1
-rw-r--r--fs/smb/client/readdir.c12
-rw-r--r--fs/smb/client/smb2inode.c234
-rw-r--r--fs/smb/client/smb2maperror.c2
-rw-r--r--fs/smb/client/smb2ops.c10
-rw-r--r--fs/smb/client/smb2pdu.c127
-rw-r--r--fs/smb/client/smb2proto.h4
-rw-r--r--fs/smb/client/smb2status.h2
-rw-r--r--fs/smb/server/asn1.c5
-rw-r--r--fs/smb/server/connection.c6
-rw-r--r--fs/smb/server/connection.h2
-rw-r--r--fs/smb/server/oplock.c6
-rw-r--r--fs/smb/server/smb2pdu.c22
-rw-r--r--fs/smb/server/transport_rdma.c11
-rw-r--r--fs/smb/server/transport_tcp.c13
-rw-r--r--fs/tracefs/event_inode.c14
-rw-r--r--fs/tracefs/internal.h7
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c2
194 files changed, 6181 insertions, 4060 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 731e3d14b67d..0e8418066a48 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -42,6 +42,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb);
void v9fs_free_inode(struct inode *inode);
struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode,
dev_t rdev);
+void v9fs_set_netfs_context(struct inode *inode);
int v9fs_init_inode(struct v9fs_session_info *v9ses,
struct inode *inode, umode_t mode, dev_t rdev);
void v9fs_evict_inode(struct inode *inode);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 8a635999a7d6..047855033d32 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -19,12 +19,45 @@
#include <linux/netfs.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
+#include <trace/events/netfs.h>
#include "v9fs.h"
#include "v9fs_vfs.h"
#include "cache.h"
#include "fid.h"
+static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq)
+{
+ struct p9_fid *fid = subreq->rreq->netfs_priv;
+ int err, len;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
+ netfs_write_subrequest_terminated(subreq, len ?: err, false);
+}
+
+static void v9fs_upload_to_server_worker(struct work_struct *work)
+{
+ struct netfs_io_subrequest *subreq =
+ container_of(work, struct netfs_io_subrequest, work);
+
+ v9fs_upload_to_server(subreq);
+}
+
+/*
+ * Set up write requests for a writeback slice. We need to add a write request
+ * for each write we want to make.
+ */
+static void v9fs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len)
+{
+ struct netfs_io_subrequest *subreq;
+
+ subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER,
+ start, len, v9fs_upload_to_server_worker);
+ if (subreq)
+ netfs_queue_write_request(subreq);
+}
+
/**
* v9fs_issue_read - Issue a read from 9P
* @subreq: The read to make
@@ -33,14 +66,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct p9_fid *fid = rreq->netfs_priv;
- struct iov_iter to;
- loff_t pos = subreq->start + subreq->transferred;
- size_t len = subreq->len - subreq->transferred;
int total, err;
- iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len);
-
- total = p9_client_read(fid, pos, &to, &err);
+ total = p9_client_read(fid, subreq->start + subreq->transferred,
+ &subreq->io_iter, &err);
/* if we just extended the file size, any portion not in
* cache won't be on server and is zeroes */
@@ -50,25 +79,42 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
}
/**
- * v9fs_init_request - Initialise a read request
+ * v9fs_init_request - Initialise a request
* @rreq: The read request
* @file: The file being read from
*/
static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
{
- struct p9_fid *fid = file->private_data;
-
- BUG_ON(!fid);
+ struct p9_fid *fid;
+ bool writing = (rreq->origin == NETFS_READ_FOR_WRITE ||
+ rreq->origin == NETFS_WRITEBACK ||
+ rreq->origin == NETFS_WRITETHROUGH ||
+ rreq->origin == NETFS_LAUNDER_WRITE ||
+ rreq->origin == NETFS_UNBUFFERED_WRITE ||
+ rreq->origin == NETFS_DIO_WRITE);
+
+ if (file) {
+ fid = file->private_data;
+ if (!fid)
+ goto no_fid;
+ p9_fid_get(fid);
+ } else {
+ fid = v9fs_fid_find_inode(rreq->inode, writing, INVALID_UID, true);
+ if (!fid)
+ goto no_fid;
+ }
/* we might need to read from a fid that was opened write-only
* for read-modify-write of page cache, use the writeback fid
* for that */
- WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE &&
- !(fid->mode & P9_ORDWR));
-
- p9_fid_get(fid);
+ WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && !(fid->mode & P9_ORDWR));
rreq->netfs_priv = fid;
return 0;
+
+no_fid:
+ WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n",
+ rreq->inode->i_ino);
+ return -EINVAL;
}
/**
@@ -82,281 +128,20 @@ static void v9fs_free_request(struct netfs_io_request *rreq)
p9_fid_put(fid);
}
-/**
- * v9fs_begin_cache_operation - Begin a cache operation for a read
- * @rreq: The read request
- */
-static int v9fs_begin_cache_operation(struct netfs_io_request *rreq)
-{
-#ifdef CONFIG_9P_FSCACHE
- struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode));
-
- return fscache_begin_read_operation(&rreq->cache_resources, cookie);
-#else
- return -ENOBUFS;
-#endif
-}
-
const struct netfs_request_ops v9fs_req_ops = {
.init_request = v9fs_init_request,
.free_request = v9fs_free_request,
- .begin_cache_operation = v9fs_begin_cache_operation,
.issue_read = v9fs_issue_read,
+ .create_write_requests = v9fs_create_write_requests,
};
-/**
- * v9fs_release_folio - release the private state associated with a folio
- * @folio: The folio to be released
- * @gfp: The caller's allocation restrictions
- *
- * Returns true if the page can be released, false otherwise.
- */
-
-static bool v9fs_release_folio(struct folio *folio, gfp_t gfp)
-{
- if (folio_test_private(folio))
- return false;
-#ifdef CONFIG_9P_FSCACHE
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
- fscache_note_page_release(v9fs_inode_cookie(V9FS_I(folio_inode(folio))));
-#endif
- return true;
-}
-
-static void v9fs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length)
-{
- folio_wait_fscache(folio);
-}
-
-#ifdef CONFIG_9P_FSCACHE
-static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct v9fs_inode *v9inode = priv;
- __le32 version;
-
- if (IS_ERR_VALUE(transferred_or_error) &&
- transferred_or_error != -ENOBUFS) {
- version = cpu_to_le32(v9inode->qid.version);
- fscache_invalidate(v9fs_inode_cookie(v9inode), &version,
- i_size_read(&v9inode->netfs.inode), 0);
- }
-}
-#endif
-
-static int v9fs_vfs_write_folio_locked(struct folio *folio)
-{
- struct inode *inode = folio_inode(folio);
- loff_t start = folio_pos(folio);
- loff_t i_size = i_size_read(inode);
- struct iov_iter from;
- size_t len = folio_size(folio);
- struct p9_fid *writeback_fid;
- int err;
- struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode);
- struct fscache_cookie __maybe_unused *cookie = v9fs_inode_cookie(v9inode);
-
- if (start >= i_size)
- return 0; /* Simultaneous truncation occurred */
-
- len = min_t(loff_t, i_size - start, len);
-
- iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len);
-
- writeback_fid = v9fs_fid_find_inode(inode, true, INVALID_UID, true);
- if (!writeback_fid) {
- WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n",
- inode->i_private);
- return -EINVAL;
- }
-
- folio_wait_fscache(folio);
- folio_start_writeback(folio);
-
- p9_client_write(writeback_fid, start, &from, &err);
-
-#ifdef CONFIG_9P_FSCACHE
- if (err == 0 &&
- fscache_cookie_enabled(cookie) &&
- test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) {
- folio_start_fscache(folio);
- fscache_write_to_cache(v9fs_inode_cookie(v9inode),
- folio_mapping(folio), start, len, i_size,
- v9fs_write_to_cache_done, v9inode,
- true);
- }
-#endif
-
- folio_end_writeback(folio);
- p9_fid_put(writeback_fid);
-
- return err;
-}
-
-static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct folio *folio = page_folio(page);
- int retval;
-
- p9_debug(P9_DEBUG_VFS, "folio %p\n", folio);
-
- retval = v9fs_vfs_write_folio_locked(folio);
- if (retval < 0) {
- if (retval == -EAGAIN) {
- folio_redirty_for_writepage(wbc, folio);
- retval = 0;
- } else {
- mapping_set_error(folio_mapping(folio), retval);
- }
- } else
- retval = 0;
-
- folio_unlock(folio);
- return retval;
-}
-
-static int v9fs_launder_folio(struct folio *folio)
-{
- int retval;
-
- if (folio_clear_dirty_for_io(folio)) {
- retval = v9fs_vfs_write_folio_locked(folio);
- if (retval)
- return retval;
- }
- folio_wait_fscache(folio);
- return 0;
-}
-
-/**
- * v9fs_direct_IO - 9P address space operation for direct I/O
- * @iocb: target I/O control block
- * @iter: The data/buffer to use
- *
- * The presence of v9fs_direct_IO() in the address space ops vector
- * allowes open() O_DIRECT flags which would have failed otherwise.
- *
- * In the non-cached mode, we shunt off direct read and write requests before
- * the VFS gets them, so this method should never be called.
- *
- * Direct IO is not 'yet' supported in the cached mode. Hence when
- * this routine is called through generic_file_aio_read(), the read/write fails
- * with an error.
- *
- */
-static ssize_t
-v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- loff_t pos = iocb->ki_pos;
- ssize_t n;
- int err = 0;
-
- if (iov_iter_rw(iter) == WRITE) {
- n = p9_client_write(file->private_data, pos, iter, &err);
- if (n) {
- struct inode *inode = file_inode(file);
- loff_t i_size = i_size_read(inode);
-
- if (pos + n > i_size)
- inode_add_bytes(inode, pos + n - i_size);
- }
- } else {
- n = p9_client_read(file->private_data, pos, iter, &err);
- }
- return n ? n : err;
-}
-
-static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned int len,
- struct page **subpagep, void **fsdata)
-{
- int retval;
- struct folio *folio;
- struct v9fs_inode *v9inode = V9FS_I(mapping->host);
-
- p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
-
- /* Prefetch area to be written into the cache if we're caching this
- * file. We need to do this before we get a lock on the page in case
- * there's more than one writer competing for the same cache block.
- */
- retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata);
- if (retval < 0)
- return retval;
-
- *subpagep = &folio->page;
- return retval;
-}
-
-static int v9fs_write_end(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int copied,
- struct page *subpage, void *fsdata)
-{
- loff_t last_pos = pos + copied;
- struct folio *folio = page_folio(subpage);
- struct inode *inode = mapping->host;
-
- p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
-
- if (!folio_test_uptodate(folio)) {
- if (unlikely(copied < len)) {
- copied = 0;
- goto out;
- }
-
- folio_mark_uptodate(folio);
- }
-
- /*
- * No need to use i_size_read() here, the i_size
- * cannot change under us because we hold the i_mutex.
- */
- if (last_pos > inode->i_size) {
- inode_add_bytes(inode, last_pos - inode->i_size);
- i_size_write(inode, last_pos);
-#ifdef CONFIG_9P_FSCACHE
- fscache_update_cookie(v9fs_inode_cookie(V9FS_I(inode)), NULL,
- &last_pos);
-#endif
- }
- folio_mark_dirty(folio);
-out:
- folio_unlock(folio);
- folio_put(folio);
-
- return copied;
-}
-
-#ifdef CONFIG_9P_FSCACHE
-/*
- * Mark a page as having been made dirty and thus needing writeback. We also
- * need to pin the cache object to write back to.
- */
-static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
- struct v9fs_inode *v9inode = V9FS_I(mapping->host);
-
- return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode));
-}
-#else
-#define v9fs_dirty_folio filemap_dirty_folio
-#endif
-
const struct address_space_operations v9fs_addr_operations = {
- .read_folio = netfs_read_folio,
- .readahead = netfs_readahead,
- .dirty_folio = v9fs_dirty_folio,
- .writepage = v9fs_vfs_writepage,
- .write_begin = v9fs_write_begin,
- .write_end = v9fs_write_end,
- .release_folio = v9fs_release_folio,
- .invalidate_folio = v9fs_invalidate_folio,
- .launder_folio = v9fs_launder_folio,
- .direct_IO = v9fs_direct_IO,
+ .read_folio = netfs_read_folio,
+ .readahead = netfs_readahead,
+ .dirty_folio = netfs_dirty_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
+ .launder_folio = netfs_launder_folio,
+ .direct_IO = noop_direct_IO,
+ .writepages = netfs_writepages,
};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 11cd8d23f6f2..bae330c2f0cf 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -353,25 +353,15 @@ static ssize_t
v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct p9_fid *fid = iocb->ki_filp->private_data;
- int ret, err = 0;
p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n",
fid->fid, iov_iter_count(to), iocb->ki_pos);
- if (!(fid->mode & P9L_DIRECT)) {
- p9_debug(P9_DEBUG_VFS, "(cached)\n");
- return generic_file_read_iter(iocb, to);
- }
-
- if (iocb->ki_filp->f_flags & O_NONBLOCK)
- ret = p9_client_read_once(fid, iocb->ki_pos, to, &err);
- else
- ret = p9_client_read(fid, iocb->ki_pos, to, &err);
- if (!ret)
- return err;
+ if (fid->mode & P9L_DIRECT)
+ return netfs_unbuffered_read_iter(iocb, to);
- iocb->ki_pos += ret;
- return ret;
+ p9_debug(P9_DEBUG_VFS, "(cached)\n");
+ return netfs_file_read_iter(iocb, to);
}
/*
@@ -407,46 +397,14 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct p9_fid *fid = file->private_data;
- ssize_t retval;
- loff_t origin;
- int err = 0;
p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid);
- if (!(fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))) {
- p9_debug(P9_DEBUG_CACHE, "(cached)\n");
- return generic_file_write_iter(iocb, from);
- }
+ if (fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))
+ return netfs_unbuffered_write_iter(iocb, from);
- retval = generic_write_checks(iocb, from);
- if (retval <= 0)
- return retval;
-
- origin = iocb->ki_pos;
- retval = p9_client_write(file->private_data, iocb->ki_pos, from, &err);
- if (retval > 0) {
- struct inode *inode = file_inode(file);
- loff_t i_size;
- unsigned long pg_start, pg_end;
-
- pg_start = origin >> PAGE_SHIFT;
- pg_end = (origin + retval - 1) >> PAGE_SHIFT;
- if (inode->i_mapping && inode->i_mapping->nrpages)
- invalidate_inode_pages2_range(inode->i_mapping,
- pg_start, pg_end);
- iocb->ki_pos += retval;
- i_size = i_size_read(inode);
- if (iocb->ki_pos > i_size) {
- inode_add_bytes(inode, iocb->ki_pos - i_size);
- /*
- * Need to serialize against i_size_write() in
- * v9fs_stat2inode()
- */
- v9fs_i_size_write(inode, iocb->ki_pos);
- }
- return retval;
- }
- return err;
+ p9_debug(P9_DEBUG_CACHE, "(cached)\n");
+ return netfs_file_write_iter(iocb, from);
}
static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
@@ -519,36 +477,7 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
static vm_fault_t
v9fs_vm_page_mkwrite(struct vm_fault *vmf)
{
- struct folio *folio = page_folio(vmf->page);
- struct file *filp = vmf->vma->vm_file;
- struct inode *inode = file_inode(filp);
-
-
- p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n",
- folio, (unsigned long)filp->private_data);
-
- /* Wait for the page to be written to the cache before we allow it to
- * be modified. We then assume the entire page will need writing back.
- */
-#ifdef CONFIG_9P_FSCACHE
- if (folio_test_fscache(folio) &&
- folio_wait_fscache_killable(folio) < 0)
- return VM_FAULT_NOPAGE;
-#endif
-
- /* Update file times before taking page lock */
- file_update_time(filp);
-
- if (folio_lock_killable(folio) < 0)
- return VM_FAULT_RETRY;
- if (folio_mapping(folio) != inode->i_mapping)
- goto out_unlock;
- folio_wait_stable(folio);
-
- return VM_FAULT_LOCKED;
-out_unlock:
- folio_unlock(folio);
- return VM_FAULT_NOPAGE;
+ return netfs_page_mkwrite(vmf, NULL);
}
static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b845ee18a80b..32572982f72e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -246,10 +246,10 @@ void v9fs_free_inode(struct inode *inode)
/*
* Set parameters for the netfs library
*/
-static void v9fs_set_netfs_context(struct inode *inode)
+void v9fs_set_netfs_context(struct inode *inode)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
- netfs_inode_init(&v9inode->netfs, &v9fs_req_ops);
+ netfs_inode_init(&v9inode->netfs, &v9fs_req_ops, true);
}
int v9fs_init_inode(struct v9fs_session_info *v9ses,
@@ -326,8 +326,6 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
err = -EINVAL;
goto error;
}
-
- v9fs_set_netfs_context(inode);
error:
return err;
@@ -359,6 +357,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
iput(inode);
return ERR_PTR(err);
}
+ v9fs_set_netfs_context(inode);
return inode;
}
@@ -374,11 +373,8 @@ void v9fs_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
-#ifdef CONFIG_9P_FSCACHE
version = cpu_to_le32(v9inode->qid.version);
- fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode,
- &version);
-#endif
+ netfs_clear_inode_writeback(inode, &version);
clear_inode(inode);
filemap_fdatawrite(&inode->i_data);
@@ -464,6 +460,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
goto error;
v9fs_stat2inode(st, inode, sb, 0);
+ v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
return inode;
@@ -1113,7 +1110,7 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
if ((iattr->ia_valid & ATTR_SIZE) &&
iattr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, iattr->ia_size);
- truncate_pagecache(inode, iattr->ia_size);
+ netfs_resize_file(netfs_inode(inode), iattr->ia_size, true);
#ifdef CONFIG_9P_FSCACHE
if (v9ses->cache & CACHE_FSCACHE) {
@@ -1181,6 +1178,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
+ v9inode->netfs.remote_i_size = stat->length;
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
v9fs_i_size_write(inode, stat->length);
/* not real number of blocks, but 512 byte ones ... */
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index c7319af2f471..3505227e1704 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -128,6 +128,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
goto error;
v9fs_stat2inode_dotl(st, inode, 0);
+ v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
retval = v9fs_get_acl(inode, fid);
if (retval)
@@ -598,7 +599,7 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size !=
i_size_read(inode)) {
truncate_setsize(inode, iattr->ia_size);
- truncate_pagecache(inode, iattr->ia_size);
+ netfs_resize_file(netfs_inode(inode), iattr->ia_size, true);
#ifdef CONFIG_9P_FSCACHE
if (v9ses->cache & CACHE_FSCACHE)
@@ -655,6 +656,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
+ v9inode->netfs.remote_i_size = stat->st_size;
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
v9fs_i_size_write(inode, stat->st_size);
inode->i_blocks = stat->st_blocks;
@@ -683,8 +685,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
inode->i_mode = mode;
}
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
- stat->st_result_mask & P9_STATS_SIZE)
+ stat->st_result_mask & P9_STATS_SIZE) {
+ v9inode->netfs.remote_i_size = stat->st_size;
v9fs_i_size_write(inode, stat->st_size);
+ }
if (stat->st_result_mask & P9_STATS_BLOCKS)
inode->i_blocks = stat->st_blocks;
}
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 73db55c050bf..941f7d0e0bfa 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -289,31 +289,21 @@ static int v9fs_drop_inode(struct inode *inode)
static int v9fs_write_inode(struct inode *inode,
struct writeback_control *wbc)
{
- struct v9fs_inode *v9inode;
-
/*
* send an fsync request to server irrespective of
* wbc->sync_mode.
*/
p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
-
- v9inode = V9FS_I(inode);
- fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
-
- return 0;
+ return netfs_unpin_writeback(inode, wbc);
}
static int v9fs_write_inode_dotl(struct inode *inode,
struct writeback_control *wbc)
{
- struct v9fs_inode *v9inode;
- v9inode = V9FS_I(inode);
p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
- fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
-
- return 0;
+ return netfs_unpin_writeback(inode, wbc);
}
static const struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index a3159831ba98..89fdbefd1075 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -144,7 +144,6 @@ source "fs/overlayfs/Kconfig"
menu "Caches"
source "fs/netfs/Kconfig"
-source "fs/fscache/Kconfig"
source "fs/cachefiles/Kconfig"
endmenu
diff --git a/fs/Makefile b/fs/Makefile
index a6962c588962..c09016257f05 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -61,7 +61,6 @@ obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_NETFS_SUPPORT) += netfs/
-obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
obj-$(CONFIG_EXT4_FS) += ext4/
# We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index c14533ef108f..b5b8de521f99 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -124,7 +124,7 @@ static void afs_dir_read_cleanup(struct afs_read *req)
if (xas_retry(&xas, folio))
continue;
BUG_ON(xa_is_value(folio));
- ASSERTCMP(folio_file_mapping(folio), ==, mapping);
+ ASSERTCMP(folio->mapping, ==, mapping);
folio_put(folio);
}
@@ -202,12 +202,12 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
if (xas_retry(&xas, folio))
continue;
- BUG_ON(folio_file_mapping(folio) != mapping);
+ BUG_ON(folio->mapping != mapping);
size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio));
for (offset = 0; offset < size; offset += sizeof(*block)) {
block = kmap_local_folio(folio, offset);
- pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block);
+ pr_warn("[%02lx] %32phN\n", folio->index + offset, block);
kunmap_local(block);
}
}
@@ -233,7 +233,7 @@ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
if (xas_retry(&xas, folio))
continue;
- BUG_ON(folio_file_mapping(folio) != mapping);
+ BUG_ON(folio->mapping != mapping);
if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) {
afs_dir_dump(dvnode, req);
@@ -474,6 +474,14 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
continue;
}
+ /* Don't expose silly rename entries to userspace. */
+ if (nlen > 6 &&
+ dire->u.name[0] == '.' &&
+ ctx->actor != afs_lookup_filldir &&
+ ctx->actor != afs_lookup_one_filldir &&
+ memcmp(dire->u.name, ".__afs", 6) == 0)
+ continue;
+
/* found the next entry */
if (!dir_emit(ctx, dire->u.name, nlen,
ntohl(dire->u.vnode),
@@ -708,6 +716,8 @@ static void afs_do_lookup_success(struct afs_operation *op)
break;
}
+ if (vp->scb.status.abort_code)
+ trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code);
if (!vp->scb.have_status && !vp->scb.have_error)
continue;
@@ -897,12 +907,16 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
afs_begin_vnode_operation(op);
afs_wait_for_operation(op);
}
- inode = ERR_PTR(afs_op_error(op));
out_op:
if (!afs_op_error(op)) {
- inode = &op->file[1].vnode->netfs.inode;
- op->file[1].vnode = NULL;
+ if (op->file[1].scb.status.abort_code) {
+ afs_op_accumulate_error(op, -ECONNABORTED,
+ op->file[1].scb.status.abort_code);
+ } else {
+ inode = &op->file[1].vnode->netfs.inode;
+ op->file[1].vnode = NULL;
+ }
}
if (op->file[0].scb.have_status)
@@ -2022,7 +2036,7 @@ static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags)
{
struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
- _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio));
+ _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio->index);
folio_detach_private(folio);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 2cd40ba601f1..c4d2711e20ad 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
/* there shouldn't be an existing inode */
BUG_ON(!(inode->i_state & I_NEW));
- netfs_inode_init(&vnode->netfs, NULL);
+ netfs_inode_init(&vnode->netfs, NULL, false);
inode->i_size = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
if (root) {
@@ -258,16 +258,7 @@ const struct inode_operations afs_dynroot_inode_operations = {
.lookup = afs_dynroot_lookup,
};
-/*
- * Dirs in the dynamic root don't need revalidation.
- */
-static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags)
-{
- return 1;
-}
-
const struct dentry_operations afs_dynroot_dentry_operations = {
- .d_revalidate = afs_dynroot_d_revalidate,
.d_delete = always_delete_dentry,
.d_release = afs_d_release,
.d_automount = afs_d_automount,
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 30914e0d9cb2..3d33b221d9ca 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -20,9 +20,6 @@
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
static int afs_symlink_read_folio(struct file *file, struct folio *folio);
-static void afs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length);
-static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags);
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
@@ -37,7 +34,7 @@ const struct file_operations afs_file_operations = {
.release = afs_release,
.llseek = generic_file_llseek,
.read_iter = afs_file_read_iter,
- .write_iter = afs_file_write,
+ .write_iter = netfs_file_write_iter,
.mmap = afs_file_mmap,
.splice_read = afs_file_splice_read,
.splice_write = iter_file_splice_write,
@@ -53,22 +50,21 @@ const struct inode_operations afs_file_inode_operations = {
};
const struct address_space_operations afs_file_aops = {
+ .direct_IO = noop_direct_IO,
.read_folio = netfs_read_folio,
.readahead = netfs_readahead,
- .dirty_folio = afs_dirty_folio,
- .launder_folio = afs_launder_folio,
- .release_folio = afs_release_folio,
- .invalidate_folio = afs_invalidate_folio,
- .write_begin = afs_write_begin,
- .write_end = afs_write_end,
- .writepages = afs_writepages,
+ .dirty_folio = netfs_dirty_folio,
+ .launder_folio = netfs_launder_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
+ .writepages = afs_writepages,
};
const struct address_space_operations afs_symlink_aops = {
.read_folio = afs_symlink_read_folio,
- .release_folio = afs_release_folio,
- .invalidate_folio = afs_invalidate_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
};
@@ -323,11 +319,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)
fsreq->len = subreq->len - subreq->transferred;
fsreq->key = key_get(subreq->rreq->netfs_priv);
fsreq->vnode = vnode;
- fsreq->iter = &fsreq->def_iter;
-
- iov_iter_xarray(&fsreq->def_iter, ITER_DEST,
- &fsreq->vnode->netfs.inode.i_mapping->i_pages,
- fsreq->pos, fsreq->len);
+ fsreq->iter = &subreq->io_iter;
afs_fetch_data(fsreq->vnode, fsreq);
afs_put_read(fsreq);
@@ -359,22 +351,13 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio)
static int afs_init_request(struct netfs_io_request *rreq, struct file *file)
{
- rreq->netfs_priv = key_get(afs_file_key(file));
+ if (file)
+ rreq->netfs_priv = key_get(afs_file_key(file));
+ rreq->rsize = 256 * 1024;
+ rreq->wsize = 256 * 1024;
return 0;
}
-static int afs_begin_cache_operation(struct netfs_io_request *rreq)
-{
-#ifdef CONFIG_AFS_FSCACHE
- struct afs_vnode *vnode = AFS_FS_I(rreq->inode);
-
- return fscache_begin_read_operation(&rreq->cache_resources,
- afs_vnode_cache(vnode));
-#else
- return -ENOBUFS;
-#endif
-}
-
static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
struct folio **foliop, void **_fsdata)
{
@@ -388,128 +371,37 @@ static void afs_free_request(struct netfs_io_request *rreq)
key_put(rreq->netfs_priv);
}
-const struct netfs_request_ops afs_req_ops = {
- .init_request = afs_init_request,
- .free_request = afs_free_request,
- .begin_cache_operation = afs_begin_cache_operation,
- .check_write_begin = afs_check_write_begin,
- .issue_read = afs_issue_read,
-};
-
-int afs_write_inode(struct inode *inode, struct writeback_control *wbc)
+static void afs_update_i_size(struct inode *inode, loff_t new_i_size)
{
- fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode)));
- return 0;
-}
-
-/*
- * Adjust the dirty region of the page on truncation or full invalidation,
- * getting rid of the markers altogether if the region is entirely invalidated.
- */
-static void afs_invalidate_dirty(struct folio *folio, size_t offset,
- size_t length)
-{
- struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
- unsigned long priv;
- unsigned int f, t, end = offset + length;
-
- priv = (unsigned long)folio_get_private(folio);
-
- /* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == folio_size(folio))
- goto full_invalidate;
-
- /* If the page was dirtied by page_mkwrite(), the PTE stays writable
- * and we don't get another notification to tell us to expand it
- * again.
- */
- if (afs_is_folio_dirty_mmapped(priv))
- return;
-
- /* We may need to shorten the dirty region */
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
-
- if (t <= offset || f >= end)
- return; /* Doesn't overlap */
-
- if (f < offset && t > end)
- return; /* Splits the dirty region - just absorb it */
-
- if (f >= offset && t <= end)
- goto undirty;
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ loff_t i_size;
- if (f < offset)
- t = offset;
- else
- f = end;
- if (f == t)
- goto undirty;
-
- priv = afs_folio_dirty(folio, f, t);
- folio_change_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("trunc"), folio);
- return;
-
-undirty:
- trace_afs_folio_dirty(vnode, tracepoint_string("undirty"), folio);
- folio_clear_dirty_for_io(folio);
-full_invalidate:
- trace_afs_folio_dirty(vnode, tracepoint_string("inval"), folio);
- folio_detach_private(folio);
+ write_seqlock(&vnode->cb_lock);
+ i_size = i_size_read(&vnode->netfs.inode);
+ if (new_i_size > i_size) {
+ i_size_write(&vnode->netfs.inode, new_i_size);
+ inode_set_bytes(&vnode->netfs.inode, new_i_size);
+ }
+ write_sequnlock(&vnode->cb_lock);
+ fscache_update_cookie(afs_vnode_cache(vnode), NULL, &new_i_size);
}
-/*
- * invalidate part or all of a page
- * - release a page and clean up its private data if offset is 0 (indicating
- * the entire page)
- */
-static void afs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length)
+static void afs_netfs_invalidate_cache(struct netfs_io_request *wreq)
{
- _enter("{%lu},%zu,%zu", folio->index, offset, length);
-
- BUG_ON(!folio_test_locked(folio));
+ struct afs_vnode *vnode = AFS_FS_I(wreq->inode);
- if (folio_get_private(folio))
- afs_invalidate_dirty(folio, offset, length);
-
- folio_wait_fscache(folio);
- _leave("");
+ afs_invalidate_cache(vnode, 0);
}
-/*
- * release a page and clean up its private state if it's not busy
- * - return true if the page can now be released, false if not
- */
-static bool afs_release_folio(struct folio *folio, gfp_t gfp)
-{
- struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
-
- _enter("{{%llx:%llu}[%lu],%lx},%x",
- vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags,
- gfp);
-
- /* deny if folio is being written to the cache and the caller hasn't
- * elected to wait */
-#ifdef CONFIG_AFS_FSCACHE
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
- fscache_note_page_release(afs_vnode_cache(vnode));
-#endif
-
- if (folio_test_private(folio)) {
- trace_afs_folio_dirty(vnode, tracepoint_string("rel"), folio);
- folio_detach_private(folio);
- }
-
- /* Indicate that the folio can be released */
- _leave(" = T");
- return true;
-}
+const struct netfs_request_ops afs_req_ops = {
+ .init_request = afs_init_request,
+ .free_request = afs_free_request,
+ .check_write_begin = afs_check_write_begin,
+ .issue_read = afs_issue_read,
+ .update_i_size = afs_update_i_size,
+ .invalidate_cache = afs_netfs_invalidate_cache,
+ .create_write_requests = afs_create_write_requests,
+};
static void afs_add_open_mmap(struct afs_vnode *vnode)
{
@@ -576,28 +468,39 @@ static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pg
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct afs_vnode *vnode = AFS_FS_I(inode);
struct afs_file *af = iocb->ki_filp->private_data;
- int ret;
+ ssize_t ret;
- ret = afs_validate(vnode, af->key);
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return netfs_unbuffered_read_iter(iocb, iter);
+
+ ret = netfs_start_io_read(inode);
if (ret < 0)
return ret;
-
- return generic_file_read_iter(iocb, iter);
+ ret = afs_validate(vnode, af->key);
+ if (ret == 0)
+ ret = filemap_read(iocb, iter, 0);
+ netfs_end_io_read(inode);
+ return ret;
}
static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len, unsigned int flags)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(in));
+ struct inode *inode = file_inode(in);
+ struct afs_vnode *vnode = AFS_FS_I(inode);
struct afs_file *af = in->private_data;
- int ret;
+ ssize_t ret;
- ret = afs_validate(vnode, af->key);
+ ret = netfs_start_io_read(inode);
if (ret < 0)
return ret;
-
- return filemap_splice_read(in, ppos, pipe, len, flags);
+ ret = afs_validate(vnode, af->key);
+ if (ret == 0)
+ ret = filemap_splice_read(in, ppos, pipe, len, flags);
+ netfs_end_io_read(inode);
+ return ret;
}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 4f04f6f33f46..94fc049aff58 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
*/
static void afs_set_netfs_context(struct afs_vnode *vnode)
{
- netfs_inode_init(&vnode->netfs, &afs_req_ops);
+ netfs_inode_init(&vnode->netfs, &afs_req_ops, true);
}
/*
@@ -166,6 +166,7 @@ static void afs_apply_status(struct afs_operation *op,
struct inode *inode = &vnode->netfs.inode;
struct timespec64 t;
umode_t mode;
+ bool unexpected_jump = false;
bool data_changed = false;
bool change_size = vp->set_size;
@@ -230,6 +231,7 @@ static void afs_apply_status(struct afs_operation *op,
}
change_size = true;
data_changed = true;
+ unexpected_jump = true;
} else if (vnode->status.type == AFS_FTYPE_DIR) {
/* Expected directory change is handled elsewhere so
* that we can locally edit the directory and save on a
@@ -249,8 +251,10 @@ static void afs_apply_status(struct afs_operation *op,
* what's on the server.
*/
vnode->netfs.remote_i_size = status->size;
- if (change_size) {
+ if (change_size || status->size > i_size_read(inode)) {
afs_set_i_size(vnode, status->size);
+ if (unexpected_jump)
+ vnode->netfs.zero_point = status->size;
inode_set_ctime_to_ts(inode, t);
inode_set_atime_to_ts(inode, t);
}
@@ -647,7 +651,7 @@ void afs_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
afs_set_cache_aux(vnode, &aux);
- fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux);
+ netfs_clear_inode_writeback(inode, &aux);
clear_inode(inode);
while (!list_empty(&vnode->wb_keys)) {
@@ -689,17 +693,17 @@ static void afs_setattr_success(struct afs_operation *op)
static void afs_setattr_edit_file(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
- struct inode *inode = &vp->vnode->netfs.inode;
+ struct afs_vnode *vnode = vp->vnode;
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
loff_t size = op->setattr.attr->ia_size;
loff_t i_size = op->setattr.old_i_size;
- if (size < i_size)
- truncate_pagecache(inode, size);
- if (size != i_size)
- fscache_resize_cookie(afs_vnode_cache(vp->vnode),
- vp->scb.status.size);
+ if (size != i_size) {
+ truncate_setsize(&vnode->netfs.inode, size);
+ netfs_resize_file(&vnode->netfs, size, true);
+ fscache_resize_cookie(afs_vnode_cache(vnode), size);
+ }
}
}
@@ -767,11 +771,11 @@ int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
*/
if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) &&
attr->ia_size < i_size &&
- attr->ia_size > vnode->status.size) {
- truncate_pagecache(inode, attr->ia_size);
+ attr->ia_size > vnode->netfs.remote_i_size) {
+ truncate_setsize(inode, attr->ia_size);
+ netfs_resize_file(&vnode->netfs, size, false);
fscache_resize_cookie(afs_vnode_cache(vnode),
attr->ia_size);
- i_size_write(inode, attr->ia_size);
ret = 0;
goto out_unlock;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 55aa0679d8ce..9c03fcf7ffaa 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -985,62 +985,6 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl
i_size_read(&vnode->netfs.inode), flags);
}
-/*
- * We use folio->private to hold the amount of the folio that we've written to,
- * splitting the field into two parts. However, we need to represent a range
- * 0...FOLIO_SIZE, so we reduce the resolution if the size of the folio
- * exceeds what we can encode.
- */
-#ifdef CONFIG_64BIT
-#define __AFS_FOLIO_PRIV_MASK 0x7fffffffUL
-#define __AFS_FOLIO_PRIV_SHIFT 32
-#define __AFS_FOLIO_PRIV_MMAPPED 0x80000000UL
-#else
-#define __AFS_FOLIO_PRIV_MASK 0x7fffUL
-#define __AFS_FOLIO_PRIV_SHIFT 16
-#define __AFS_FOLIO_PRIV_MMAPPED 0x8000UL
-#endif
-
-static inline unsigned int afs_folio_dirty_resolution(struct folio *folio)
-{
- int shift = folio_shift(folio) - (__AFS_FOLIO_PRIV_SHIFT - 1);
- return (shift > 0) ? shift : 0;
-}
-
-static inline size_t afs_folio_dirty_from(struct folio *folio, unsigned long priv)
-{
- unsigned long x = priv & __AFS_FOLIO_PRIV_MASK;
-
- /* The lower bound is inclusive */
- return x << afs_folio_dirty_resolution(folio);
-}
-
-static inline size_t afs_folio_dirty_to(struct folio *folio, unsigned long priv)
-{
- unsigned long x = (priv >> __AFS_FOLIO_PRIV_SHIFT) & __AFS_FOLIO_PRIV_MASK;
-
- /* The upper bound is immediately beyond the region */
- return (x + 1) << afs_folio_dirty_resolution(folio);
-}
-
-static inline unsigned long afs_folio_dirty(struct folio *folio, size_t from, size_t to)
-{
- unsigned int res = afs_folio_dirty_resolution(folio);
- from >>= res;
- to = (to - 1) >> res;
- return (to << __AFS_FOLIO_PRIV_SHIFT) | from;
-}
-
-static inline unsigned long afs_folio_dirty_mmapped(unsigned long priv)
-{
- return priv | __AFS_FOLIO_PRIV_MMAPPED;
-}
-
-static inline bool afs_is_folio_dirty_mmapped(unsigned long priv)
-{
- return priv & __AFS_FOLIO_PRIV_MMAPPED;
-}
-
#include <trace/events/afs.h>
/*****************************************************************************/
@@ -1167,7 +1111,6 @@ extern int afs_release(struct inode *, struct file *);
extern int afs_fetch_data(struct afs_vnode *, struct afs_read *);
extern struct afs_read *afs_alloc_read(gfp_t);
extern void afs_put_read(struct afs_read *);
-extern int afs_write_inode(struct inode *, struct writeback_control *);
static inline struct afs_read *afs_get_read(struct afs_read *req)
{
@@ -1658,24 +1601,11 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
/*
* write.c
*/
-#ifdef CONFIG_AFS_FSCACHE
-bool afs_dirty_folio(struct address_space *, struct folio *);
-#else
-#define afs_dirty_folio filemap_dirty_folio
-#endif
-extern int afs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct page **pagep, void **fsdata);
-extern int afs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata);
-extern int afs_writepage(struct page *, struct writeback_control *);
extern int afs_writepages(struct address_space *, struct writeback_control *);
-extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf);
extern void afs_prune_wb_keys(struct afs_vnode *);
-int afs_launder_folio(struct folio *);
+void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len);
/*
* xattr.c
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 3bd02571f30d..15eab053af6d 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -166,7 +166,7 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v)
if (!preflist) {
seq_puts(m, "NO PREFS\n");
- return 0;
+ goto out;
}
seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n",
@@ -191,7 +191,8 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v)
}
}
- rcu_read_lock();
+out:
+ rcu_read_unlock();
return 0;
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index ae2d66a52add..f3ba1c3e72f5 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -55,7 +55,7 @@ int afs_net_id;
static const struct super_operations afs_super_ops = {
.statfs = afs_statfs,
.alloc_inode = afs_alloc_inode,
- .write_inode = afs_write_inode,
+ .write_inode = netfs_unpin_writeback,
.drop_inode = afs_drop_inode,
.destroy_inode = afs_destroy_inode,
.free_inode = afs_free_inode,
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 61d34ad2ca7d..74402d95a884 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -12,309 +12,17 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/netfs.h>
+#include <trace/events/netfs.h>
#include "internal.h"
-static int afs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- loff_t start, loff_t end, loff_t *_next,
- bool max_one_loop);
-
-static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len,
- loff_t i_size, bool caching);
-
-#ifdef CONFIG_AFS_FSCACHE
-/*
- * Mark a page as having been made dirty and thus needing writeback. We also
- * need to pin the cache object to write back to.
- */
-bool afs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
- return fscache_dirty_folio(mapping, folio,
- afs_vnode_cache(AFS_FS_I(mapping->host)));
-}
-static void afs_folio_start_fscache(bool caching, struct folio *folio)
-{
- if (caching)
- folio_start_fscache(folio);
-}
-#else
-static void afs_folio_start_fscache(bool caching, struct folio *folio)
-{
-}
-#endif
-
-/*
- * Flush out a conflicting write. This may extend the write to the surrounding
- * pages if also dirty and contiguous to the conflicting region..
- */
-static int afs_flush_conflicting_write(struct address_space *mapping,
- struct folio *folio)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .range_start = folio_pos(folio),
- .range_end = LLONG_MAX,
- };
- loff_t next;
-
- return afs_writepages_region(mapping, &wbc, folio_pos(folio), LLONG_MAX,
- &next, true);
-}
-
-/*
- * prepare to perform part of a write to a page
- */
-int afs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct page **_page, void **fsdata)
-{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
- struct folio *folio;
- unsigned long priv;
- unsigned f, from;
- unsigned t, to;
- pgoff_t index;
- int ret;
-
- _enter("{%llx:%llu},%llx,%x",
- vnode->fid.vid, vnode->fid.vnode, pos, len);
-
- /* Prefetch area to be written into the cache if we're caching this
- * file. We need to do this before we get a lock on the page in case
- * there's more than one writer competing for the same cache block.
- */
- ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata);
- if (ret < 0)
- return ret;
-
- index = folio_index(folio);
- from = pos - index * PAGE_SIZE;
- to = from + len;
-
-try_again:
- /* See if this page is already partially written in a way that we can
- * merge the new write with.
- */
- if (folio_test_private(folio)) {
- priv = (unsigned long)folio_get_private(folio);
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- ASSERTCMP(f, <=, t);
-
- if (folio_test_writeback(folio)) {
- trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio);
- folio_unlock(folio);
- goto wait_for_writeback;
- }
- /* If the file is being filled locally, allow inter-write
- * spaces to be merged into writes. If it's not, only write
- * back what the user gives us.
- */
- if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) &&
- (to < f || from > t))
- goto flush_conflicting_write;
- }
-
- *_page = folio_file_page(folio, pos / PAGE_SIZE);
- _leave(" = 0");
- return 0;
-
- /* The previous write and this write aren't adjacent or overlapping, so
- * flush the page out.
- */
-flush_conflicting_write:
- trace_afs_folio_dirty(vnode, tracepoint_string("confl"), folio);
- folio_unlock(folio);
-
- ret = afs_flush_conflicting_write(mapping, folio);
- if (ret < 0)
- goto error;
-
-wait_for_writeback:
- ret = folio_wait_writeback_killable(folio);
- if (ret < 0)
- goto error;
-
- ret = folio_lock_killable(folio);
- if (ret < 0)
- goto error;
- goto try_again;
-
-error:
- folio_put(folio);
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
- * finalise part of a write to a page
- */
-int afs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *subpage, void *fsdata)
-{
- struct folio *folio = page_folio(subpage);
- struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
- unsigned long priv;
- unsigned int f, from = offset_in_folio(folio, pos);
- unsigned int t, to = from + copied;
- loff_t i_size, write_end_pos;
-
- _enter("{%llx:%llu},{%lx}",
- vnode->fid.vid, vnode->fid.vnode, folio_index(folio));
-
- if (!folio_test_uptodate(folio)) {
- if (copied < len) {
- copied = 0;
- goto out;
- }
-
- folio_mark_uptodate(folio);
- }
-
- if (copied == 0)
- goto out;
-
- write_end_pos = pos + copied;
-
- i_size = i_size_read(&vnode->netfs.inode);
- if (write_end_pos > i_size) {
- write_seqlock(&vnode->cb_lock);
- i_size = i_size_read(&vnode->netfs.inode);
- if (write_end_pos > i_size)
- afs_set_i_size(vnode, write_end_pos);
- write_sequnlock(&vnode->cb_lock);
- fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos);
- }
-
- if (folio_test_private(folio)) {
- priv = (unsigned long)folio_get_private(folio);
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- if (from < f)
- f = from;
- if (to > t)
- t = to;
- priv = afs_folio_dirty(folio, f, t);
- folio_change_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("dirty+"), folio);
- } else {
- priv = afs_folio_dirty(folio, from, to);
- folio_attach_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("dirty"), folio);
- }
-
- if (folio_mark_dirty(folio))
- _debug("dirtied %lx", folio_index(folio));
-
-out:
- folio_unlock(folio);
- folio_put(folio);
- return copied;
-}
-
-/*
- * kill all the pages in the given range
- */
-static void afs_kill_pages(struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("{%llx:%llu},%llx @%llx",
- vnode->fid.vid, vnode->fid.vnode, len, start);
-
- do {
- _debug("kill %lx (to %lx)", index, last);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = folio_next_index(folio);
-
- folio_clear_uptodate(folio);
- folio_end_writeback(folio);
- folio_lock(folio);
- generic_error_remove_folio(mapping, folio);
- folio_unlock(folio);
- folio_put(folio);
-
- } while (index = next, index <= last);
-
- _leave("");
-}
-
-/*
- * Redirty all the pages in a given range.
- */
-static void afs_redirty_pages(struct writeback_control *wbc,
- struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("{%llx:%llu},%llx @%llx",
- vnode->fid.vid, vnode->fid.vnode, len, start);
-
- do {
- _debug("redirty %llx @%llx", len, start);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = index + folio_nr_pages(folio);
- folio_redirty_for_writepage(wbc, folio);
- folio_end_writeback(folio);
- folio_put(folio);
- } while (index = next, index <= last);
-
- _leave("");
-}
-
/*
* completion of write to server
*/
static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len)
{
- struct address_space *mapping = vnode->netfs.inode.i_mapping;
- struct folio *folio;
- pgoff_t end;
-
- XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
-
_enter("{%llx:%llu},{%x @%llx}",
vnode->fid.vid, vnode->fid.vnode, len, start);
- rcu_read_lock();
-
- end = (start + len - 1) / PAGE_SIZE;
- xas_for_each(&xas, folio, end) {
- if (!folio_test_writeback(folio)) {
- kdebug("bad %x @%llx page %lx %lx",
- len, start, folio_index(folio), end);
- ASSERT(folio_test_writeback(folio));
- }
-
- trace_afs_folio_dirty(vnode, tracepoint_string("clear"), folio);
- folio_detach_private(folio);
- folio_end_writeback(folio);
- }
-
- rcu_read_unlock();
-
afs_prune_wb_keys(vnode);
_leave("");
}
@@ -451,363 +159,53 @@ try_next_key:
return afs_put_operation(op);
}
-/*
- * Extend the region to be written back to include subsequent contiguously
- * dirty pages if possible, but don't sleep while doing so.
- *
- * If this page holds new content, then we can include filler zeros in the
- * writeback.
- */
-static void afs_extend_writeback(struct address_space *mapping,
- struct afs_vnode *vnode,
- long *_count,
- loff_t start,
- loff_t max_len,
- bool new_content,
- bool caching,
- unsigned int *_len)
+static void afs_upload_to_server(struct netfs_io_subrequest *subreq)
{
- struct folio_batch fbatch;
- struct folio *folio;
- unsigned long priv;
- unsigned int psize, filler = 0;
- unsigned int f, t;
- loff_t len = *_len;
- pgoff_t index = (start + len) / PAGE_SIZE;
- bool stop = true;
- unsigned int i;
-
- XA_STATE(xas, &mapping->i_pages, index);
- folio_batch_init(&fbatch);
-
- do {
- /* Firstly, we gather up a batch of contiguous dirty pages
- * under the RCU read lock - but we can't clear the dirty flags
- * there if any of those pages are mapped.
- */
- rcu_read_lock();
-
- xas_for_each(&xas, folio, ULONG_MAX) {
- stop = true;
- if (xas_retry(&xas, folio))
- continue;
- if (xa_is_value(folio))
- break;
- if (folio_index(folio) != index)
- break;
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(&xas);
- continue;
- }
-
- /* Has the page moved or been split? */
- if (unlikely(folio != xas_reload(&xas))) {
- folio_put(folio);
- break;
- }
-
- if (!folio_trylock(folio)) {
- folio_put(folio);
- break;
- }
- if (!folio_test_dirty(folio) ||
- folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- break;
- }
-
- psize = folio_size(folio);
- priv = (unsigned long)folio_get_private(folio);
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- if (f != 0 && !new_content) {
- folio_unlock(folio);
- folio_put(folio);
- break;
- }
-
- len += filler + t;
- filler = psize - t;
- if (len >= max_len || *_count <= 0)
- stop = true;
- else if (t == psize || new_content)
- stop = false;
-
- index += folio_nr_pages(folio);
- if (!folio_batch_add(&fbatch, folio))
- break;
- if (stop)
- break;
- }
-
- if (!stop)
- xas_pause(&xas);
- rcu_read_unlock();
-
- /* Now, if we obtained any folios, we can shift them to being
- * writable and mark them for caching.
- */
- if (!folio_batch_count(&fbatch))
- break;
-
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- folio = fbatch.folios[i];
- trace_afs_folio_dirty(vnode, tracepoint_string("store+"), folio);
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- folio_start_writeback(folio);
- afs_folio_start_fscache(caching, folio);
-
- *_count -= folio_nr_pages(folio);
- folio_unlock(folio);
- }
+ struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
+ ssize_t ret;
- folio_batch_release(&fbatch);
- cond_resched();
- } while (!stop);
+ _enter("%x[%x],%zx",
+ subreq->rreq->debug_id, subreq->debug_index, subreq->io_iter.count);
- *_len = len;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ ret = afs_store_data(vnode, &subreq->io_iter, subreq->start,
+ subreq->rreq->origin == NETFS_LAUNDER_WRITE);
+ netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len,
+ false);
}
-/*
- * Synchronously write back the locked page and any subsequent non-locked dirty
- * pages.
- */
-static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
- struct writeback_control *wbc,
- struct folio *folio,
- loff_t start, loff_t end)
+static void afs_upload_to_server_worker(struct work_struct *work)
{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct iov_iter iter;
- unsigned long priv;
- unsigned int offset, to, len, max_len;
- loff_t i_size = i_size_read(&vnode->netfs.inode);
- bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
- bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode));
- long count = wbc->nr_to_write;
- int ret;
-
- _enter(",%lx,%llx-%llx", folio_index(folio), start, end);
-
- folio_start_writeback(folio);
- afs_folio_start_fscache(caching, folio);
-
- count -= folio_nr_pages(folio);
-
- /* Find all consecutive lockable dirty pages that have contiguous
- * written regions, stopping when we find a page that is not
- * immediately lockable, is not dirty or is missing, or we reach the
- * end of the range.
- */
- priv = (unsigned long)folio_get_private(folio);
- offset = afs_folio_dirty_from(folio, priv);
- to = afs_folio_dirty_to(folio, priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("store"), folio);
-
- len = to - offset;
- start += offset;
- if (start < i_size) {
- /* Trim the write to the EOF; the extra data is ignored. Also
- * put an upper limit on the size of a single storedata op.
- */
- max_len = 65536 * 4096;
- max_len = min_t(unsigned long long, max_len, end - start + 1);
- max_len = min_t(unsigned long long, max_len, i_size - start);
-
- if (len < max_len &&
- (to == folio_size(folio) || new_content))
- afs_extend_writeback(mapping, vnode, &count,
- start, max_len, new_content,
- caching, &len);
- len = min_t(loff_t, len, max_len);
- }
-
- /* We now have a contiguous set of dirty pages, each with writeback
- * set; the first page is still locked at this point, but all the rest
- * have been unlocked.
- */
- folio_unlock(folio);
-
- if (start < i_size) {
- _debug("write back %x @%llx [%llx]", len, start, i_size);
-
- /* Speculatively write to the cache. We have to fix this up
- * later if the store fails.
- */
- afs_write_to_cache(vnode, start, len, i_size, caching);
-
- iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len);
- ret = afs_store_data(vnode, &iter, start, false);
- } else {
- _debug("write discard %x @%llx [%llx]", len, start, i_size);
-
- /* The dirty region was entirely beyond the EOF. */
- fscache_clear_page_bits(mapping, start, len, caching);
- afs_pages_written_back(vnode, start, len);
- ret = 0;
- }
-
- switch (ret) {
- case 0:
- wbc->nr_to_write = count;
- ret = len;
- break;
+ struct netfs_io_subrequest *subreq =
+ container_of(work, struct netfs_io_subrequest, work);
- default:
- pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret);
- fallthrough;
- case -EACCES:
- case -EPERM:
- case -ENOKEY:
- case -EKEYEXPIRED:
- case -EKEYREJECTED:
- case -EKEYREVOKED:
- case -ENETRESET:
- afs_redirty_pages(wbc, mapping, start, len);
- mapping_set_error(mapping, ret);
- break;
-
- case -EDQUOT:
- case -ENOSPC:
- afs_redirty_pages(wbc, mapping, start, len);
- mapping_set_error(mapping, -ENOSPC);
- break;
-
- case -EROFS:
- case -EIO:
- case -EREMOTEIO:
- case -EFBIG:
- case -ENOENT:
- case -ENOMEDIUM:
- case -ENXIO:
- trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail);
- afs_kill_pages(mapping, start, len);
- mapping_set_error(mapping, ret);
- break;
- }
-
- _leave(" = %d", ret);
- return ret;
+ afs_upload_to_server(subreq);
}
/*
- * write a region of pages back to the server
+ * Set up write requests for a writeback slice. We need to add a write request
+ * for each write we want to make.
*/
-static int afs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- loff_t start, loff_t end, loff_t *_next,
- bool max_one_loop)
+void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len)
{
- struct folio *folio;
- struct folio_batch fbatch;
- ssize_t ret;
- unsigned int i;
- int n, skips = 0;
-
- _enter("%llx,%llx,", start, end);
- folio_batch_init(&fbatch);
-
- do {
- pgoff_t index = start / PAGE_SIZE;
-
- n = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE,
- PAGECACHE_TAG_DIRTY, &fbatch);
-
- if (!n)
- break;
- for (i = 0; i < n; i++) {
- folio = fbatch.folios[i];
- start = folio_pos(folio); /* May regress with THPs */
-
- _debug("wback %lx", folio_index(folio));
-
- /* At this point we hold neither the i_pages lock nor the
- * page lock: the page may be truncated or invalidated
- * (changing page->mapping to NULL), or even swizzled
- * back from swapper_space to tmpfs file mapping
- */
-try_again:
- if (wbc->sync_mode != WB_SYNC_NONE) {
- ret = folio_lock_killable(folio);
- if (ret < 0) {
- folio_batch_release(&fbatch);
- return ret;
- }
- } else {
- if (!folio_trylock(folio))
- continue;
- }
-
- if (folio->mapping != mapping ||
- !folio_test_dirty(folio)) {
- start += folio_size(folio);
- folio_unlock(folio);
- continue;
- }
-
- if (folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- if (wbc->sync_mode != WB_SYNC_NONE) {
- folio_wait_writeback(folio);
-#ifdef CONFIG_AFS_FSCACHE
- folio_wait_fscache(folio);
-#endif
- goto try_again;
- }
-
- start += folio_size(folio);
- if (wbc->sync_mode == WB_SYNC_NONE) {
- if (skips >= 5 || need_resched()) {
- *_next = start;
- folio_batch_release(&fbatch);
- _leave(" = 0 [%llx]", *_next);
- return 0;
- }
- skips++;
- }
- continue;
- }
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- ret = afs_write_back_from_locked_folio(mapping, wbc,
- folio, start, end);
- if (ret < 0) {
- _leave(" = %zd", ret);
- folio_batch_release(&fbatch);
- return ret;
- }
-
- start += ret;
- }
+ struct netfs_io_subrequest *subreq;
- folio_batch_release(&fbatch);
- cond_resched();
- } while (wbc->nr_to_write > 0);
+ _enter("%x,%llx-%llx", wreq->debug_id, start, start + len);
- *_next = start;
- _leave(" = 0 [%llx]", *_next);
- return 0;
+ subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER,
+ start, len, afs_upload_to_server_worker);
+ if (subreq)
+ netfs_queue_write_request(subreq);
}
/*
* write some of the pending data back to the server
*/
-int afs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+int afs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- loff_t start, next;
int ret;
- _enter("");
-
/* We have to be careful as we can end up racing with setattr()
* truncating the pagecache since the caller doesn't take a lock here
* to prevent it.
@@ -817,69 +215,12 @@ int afs_writepages(struct address_space *mapping,
else if (!down_read_trylock(&vnode->validate_lock))
return 0;
- if (wbc->range_cyclic) {
- start = mapping->writeback_index * PAGE_SIZE;
- ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX,
- &next, false);
- if (ret == 0) {
- mapping->writeback_index = next / PAGE_SIZE;
- if (start > 0 && wbc->nr_to_write > 0) {
- ret = afs_writepages_region(mapping, wbc, 0,
- start, &next, false);
- if (ret == 0)
- mapping->writeback_index =
- next / PAGE_SIZE;
- }
- }
- } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX,
- &next, false);
- if (wbc->nr_to_write > 0 && ret == 0)
- mapping->writeback_index = next / PAGE_SIZE;
- } else {
- ret = afs_writepages_region(mapping, wbc,
- wbc->range_start, wbc->range_end,
- &next, false);
- }
-
+ ret = netfs_writepages(mapping, wbc);
up_read(&vnode->validate_lock);
- _leave(" = %d", ret);
return ret;
}
/*
- * write to an AFS file
- */
-ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
-{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
- struct afs_file *af = iocb->ki_filp->private_data;
- ssize_t result;
- size_t count = iov_iter_count(from);
-
- _enter("{%llx:%llu},{%zu},",
- vnode->fid.vid, vnode->fid.vnode, count);
-
- if (IS_SWAPFILE(&vnode->netfs.inode)) {
- printk(KERN_INFO
- "AFS: Attempt to write to active swap file!\n");
- return -EBUSY;
- }
-
- if (!count)
- return 0;
-
- result = afs_validate(vnode, af->key);
- if (result < 0)
- return result;
-
- result = generic_file_write_iter(iocb, from);
-
- _leave(" = %zd", result);
- return result;
-}
-
-/*
* flush any dirty pages for this process, and check for write errors.
* - the return status from this call provides a reliable indication of
* whether any write errors occurred for this process.
@@ -907,59 +248,11 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
{
- struct folio *folio = page_folio(vmf->page);
struct file *file = vmf->vma->vm_file;
- struct inode *inode = file_inode(file);
- struct afs_vnode *vnode = AFS_FS_I(inode);
- struct afs_file *af = file->private_data;
- unsigned long priv;
- vm_fault_t ret = VM_FAULT_RETRY;
-
- _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio));
-
- afs_validate(vnode, af->key);
- sb_start_pagefault(inode->i_sb);
-
- /* Wait for the page to be written to the cache before we allow it to
- * be modified. We then assume the entire page will need writing back.
- */
-#ifdef CONFIG_AFS_FSCACHE
- if (folio_test_fscache(folio) &&
- folio_wait_fscache_killable(folio) < 0)
- goto out;
-#endif
-
- if (folio_wait_writeback_killable(folio))
- goto out;
-
- if (folio_lock_killable(folio) < 0)
- goto out;
-
- /* We mustn't change folio->private until writeback is complete as that
- * details the portion of the page we need to write back and we might
- * need to redirty the page if there's a problem.
- */
- if (folio_wait_writeback_killable(folio) < 0) {
- folio_unlock(folio);
- goto out;
- }
-
- priv = afs_folio_dirty(folio, 0, folio_size(folio));
- priv = afs_folio_dirty_mmapped(priv);
- if (folio_test_private(folio)) {
- folio_change_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite+"), folio);
- } else {
- folio_attach_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite"), folio);
- }
- file_update_time(file);
-
- ret = VM_FAULT_LOCKED;
-out:
- sb_end_pagefault(inode->i_sb);
- return ret;
+ if (afs_validate(AFS_FS_I(file_inode(file)), afs_file_key(file)) < 0)
+ return VM_FAULT_SIGBUS;
+ return netfs_page_mkwrite(vmf, NULL);
}
/*
@@ -989,64 +282,3 @@ void afs_prune_wb_keys(struct afs_vnode *vnode)
afs_put_wb_key(wbk);
}
}
-
-/*
- * Clean up a page during invalidation.
- */
-int afs_launder_folio(struct folio *folio)
-{
- struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
- struct iov_iter iter;
- struct bio_vec bv;
- unsigned long priv;
- unsigned int f, t;
- int ret = 0;
-
- _enter("{%lx}", folio->index);
-
- priv = (unsigned long)folio_get_private(folio);
- if (folio_clear_dirty_for_io(folio)) {
- f = 0;
- t = folio_size(folio);
- if (folio_test_private(folio)) {
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- }
-
- bvec_set_folio(&bv, folio, t - f, f);
- iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, bv.bv_len);
-
- trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio);
- ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true);
- }
-
- trace_afs_folio_dirty(vnode, tracepoint_string("laundered"), folio);
- folio_detach_private(folio);
- folio_wait_fscache(folio);
- return ret;
-}
-
-/*
- * Deal with the completion of writing the data to the cache.
- */
-static void afs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct afs_vnode *vnode = priv;
-
- if (IS_ERR_VALUE(transferred_or_error) &&
- transferred_or_error != -ENOBUFS)
- afs_invalidate_cache(vnode, 0);
-}
-
-/*
- * Save the write to the cache also.
- */
-static void afs_write_to_cache(struct afs_vnode *vnode,
- loff_t start, size_t len, loff_t i_size,
- bool caching)
-{
- fscache_write_to_cache(afs_vnode_cache(vnode),
- vnode->netfs.inode.i_mapping, start, len, i_size,
- afs_write_to_cache_done, vnode, caching);
-}
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 7423a3557c68..1a05cecda7cc 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -27,7 +27,6 @@ bcachefs-y := \
checksum.o \
clock.o \
compress.o \
- counters.o \
darray.o \
debug.o \
dirent.o \
@@ -71,6 +70,7 @@ bcachefs-y := \
reflink.o \
replicas.o \
sb-clean.o \
+ sb-counters.o \
sb-downgrade.o \
sb-errors.o \
sb-members.o \
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index a09b9d00226a..10704f2d3af5 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -273,7 +273,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
c, err, alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0",
- bch2_data_types[a.v->data_type]);
+ bch2_data_type_str(a.v->data_type));
break;
case BCH_DATA_cached:
bkey_fsck_err_on(!a.v->cached_sectors ||
@@ -321,16 +321,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
{
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
- unsigned i;
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_printf(out, "gen %u oldest_gen %u data_type %s",
- a->gen, a->oldest_gen,
- a->data_type < BCH_DATA_NR
- ? bch2_data_types[a->data_type]
- : "(invalid data type)");
+ prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
+ bch2_prt_data_type(out, a->data_type);
prt_newline(out);
prt_printf(out, "journal_seq %llu", a->journal_seq);
prt_newline(out);
@@ -353,23 +349,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, "fragmentation %llu", a->fragmentation_lru);
prt_newline(out);
prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
- prt_newline(out);
-
- if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
- struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
- const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);
-
- prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
- printbuf_indent_add(out, 2);
-
- for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
- prt_newline(out);
- bch2_backpointer_to_text(out, &bps[i]);
- }
-
- printbuf_indent_sub(out, 2);
- }
-
printbuf_indent_sub(out, 2);
}
@@ -839,7 +818,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
}
}
- if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) {
+ if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
u64 journal_seq = trans->journal_res.seq;
u64 bucket_journal_seq = new_a->journal_seq;
@@ -1625,13 +1604,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
return ret;
}
+struct discard_buckets_state {
+ u64 seen;
+ u64 open;
+ u64 need_journal_commit;
+ u64 discarded;
+ struct bch_dev *ca;
+ u64 need_journal_commit_this_dev;
+};
+
+static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
+{
+ if (s->ca == ca)
+ return;
+
+ if (s->ca && s->need_journal_commit_this_dev >
+ bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ if (s->ca)
+ percpu_ref_put(&s->ca->ref);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ s->ca = ca;
+ s->need_journal_commit_this_dev = 0;
+}
+
static int bch2_discard_one_bucket(struct btree_trans *trans,
struct btree_iter *need_discard_iter,
struct bpos *discard_pos_done,
- u64 *seen,
- u64 *open,
- u64 *need_journal_commit,
- u64 *discarded)
+ struct discard_buckets_state *s)
{
struct bch_fs *c = trans->c;
struct bpos pos = need_discard_iter->pos;
@@ -1643,20 +1645,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
int ret = 0;
ca = bch_dev_bkey_exists(c, pos.inode);
+
if (!percpu_ref_tryget(&ca->io_ref)) {
bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
return 0;
}
+ discard_buckets_next_dev(c, s, ca);
+
if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
- (*open)++;
+ s->open++;
goto out;
}
if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
c->journal.flushed_seq_ondisk,
pos.inode, pos.offset)) {
- (*need_journal_commit)++;
+ s->need_journal_commit++;
+ s->need_journal_commit_this_dev++;
goto out;
}
@@ -1732,9 +1738,9 @@ write:
goto out;
count_event(c, bucket_discard);
- (*discarded)++;
+ s->discarded++;
out:
- (*seen)++;
+ s->seen++;
bch2_trans_iter_exit(trans, &iter);
percpu_ref_put(&ca->io_ref);
printbuf_exit(&buf);
@@ -1744,7 +1750,7 @@ out:
static void bch2_do_discards_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
- u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
+ struct discard_buckets_state s = {};
struct bpos discard_pos_done = POS_MAX;
int ret;
@@ -1756,19 +1762,14 @@ static void bch2_do_discards_work(struct work_struct *work)
ret = bch2_trans_run(c,
for_each_btree_key(trans, iter,
BTREE_ID_need_discard, POS_MIN, 0, k,
- bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
- &seen,
- &open,
- &need_journal_commit,
- &discarded)));
-
- if (need_journal_commit * 2 > seen)
- bch2_journal_flush_async(&c->journal, NULL);
+ bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ discard_buckets_next_dev(c, &s, NULL);
- trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
+ trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
void bch2_do_discards(struct bch_fs *c)
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
new file mode 100644
index 000000000000..b4ec20be93b8
--- /dev/null
+++ b/fs/bcachefs/alloc_background_format.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
+#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
+
+struct bch_alloc {
+ struct bch_val v;
+ __u8 fields;
+ __u8 gen;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V1() \
+ x(read_time, 16) \
+ x(write_time, 16) \
+ x(data_type, 8) \
+ x(dirty_sectors, 16) \
+ x(cached_sectors, 16) \
+ x(oldest_gen, 8) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
+struct bch_alloc_v2 {
+ struct bch_val v;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V2() \
+ x(read_time, 64) \
+ x(write_time, 64) \
+ x(dirty_sectors, 32) \
+ x(cached_sectors, 32) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+struct bch_alloc_v3 {
+ struct bch_val v;
+ __le64 journal_seq;
+ __le32 flags;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
+
+struct bch_alloc_v4 {
+ struct bch_val v;
+ __u64 journal_seq;
+ __u32 flags;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 stripe_redundancy;
+ __u32 dirty_sectors;
+ __u32 cached_sectors;
+ __u64 io_time[2];
+ __u32 stripe;
+ __u32 nr_external_backpointers;
+ __u64 fragmentation_lru;
+} __packed __aligned(8);
+
+#define BCH_ALLOC_V4_U64s_V0 6
+#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
+
+BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
+
+#define KEY_TYPE_BUCKET_GENS_BITS 8
+#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
+#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
+
+struct bch_bucket_gens {
+ struct bch_val v;
+ u8 gens[KEY_TYPE_BUCKET_GENS_NR];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index b0ff47998a94..633d3223b353 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -1525,10 +1525,11 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str
unsigned data_type = ob->data_type;
barrier(); /* READ_ONCE() doesn't work on bitfields */
- prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
+ prt_printf(out, "%zu ref %u ",
ob - c->open_buckets,
- atomic_read(&ob->pin),
- data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
+ atomic_read(&ob->pin));
+ bch2_prt_data_type(out, data_type);
+ prt_printf(out, " %u:%llu gen %u allocated %u/%u",
ob->dev, ob->bucket, ob->gen,
ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
if (ob->ec)
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index e358a2ffffde..b4dc319bcb2b 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -400,13 +400,24 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
return ret;
}
+static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
+{
+ return bpos_eq(l.k->p, r.k->p) &&
+ bkey_bytes(l.k) == bkey_bytes(r.k) &&
+ !memcmp(l.v, r.v, bkey_val_bytes(l.k));
+}
+
+struct extents_to_bp_state {
+ struct bpos bucket_start;
+ struct bpos bucket_end;
+ struct bkey_buf last_flushed;
+};
+
static int check_bp_exists(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
struct bpos bucket,
struct bch_backpointer bp,
- struct bkey_s_c orig_k,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bkey_buf *last_flushed)
+ struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
struct btree_iter bp_iter = { NULL };
@@ -417,8 +428,8 @@ static int check_bp_exists(struct btree_trans *trans,
bch2_bkey_buf_init(&tmp);
- if (bpos_lt(bucket, bucket_start) ||
- bpos_gt(bucket, bucket_end))
+ if (bpos_lt(bucket, s->bucket_start) ||
+ bpos_gt(bucket, s->bucket_end))
return 0;
if (!bch2_dev_bucket_exists(c, bucket))
@@ -433,11 +444,9 @@ static int check_bp_exists(struct btree_trans *trans,
if (bp_k.k->type != KEY_TYPE_backpointer ||
memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
- if (!bpos_eq(orig_k.k->p, last_flushed->k->k.p) ||
- bkey_bytes(orig_k.k) != bkey_bytes(&last_flushed->k->k) ||
- memcmp(orig_k.v, &last_flushed->k->v, bkey_val_bytes(orig_k.k))) {
- bch2_bkey_buf_reassemble(&tmp, c, orig_k);
+ bch2_bkey_buf_reassemble(&tmp, c, orig_k);
+ if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) {
if (bp.level) {
bch2_trans_unlock(trans);
bch2_btree_interior_updates_flush(c);
@@ -447,7 +456,7 @@ static int check_bp_exists(struct btree_trans *trans,
if (ret)
goto err;
- bch2_bkey_buf_copy(last_flushed, c, tmp.k);
+ bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k);
ret = -BCH_ERR_transaction_restart_write_buffer_flush;
goto out;
}
@@ -475,10 +484,8 @@ missing:
}
static int check_extent_to_backpointers(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
enum btree_id btree, unsigned level,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bkey_buf *last_flushed,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
@@ -498,9 +505,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
bch2_extent_ptr_to_bp(c, btree, level,
k, p, &bucket_pos, &bp);
- ret = check_bp_exists(trans, bucket_pos, bp, k,
- bucket_start, bucket_end,
- last_flushed);
+ ret = check_bp_exists(trans, s, bucket_pos, bp, k);
if (ret)
return ret;
}
@@ -509,10 +514,8 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
}
static int check_btree_root_to_backpointers(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
enum btree_id btree_id,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bkey_buf *last_flushed,
int *level)
{
struct bch_fs *c = trans->c;
@@ -536,9 +539,7 @@ retry:
*level = b->c.level;
k = bkey_i_to_s_c(&b->key);
- ret = check_extent_to_backpointers(trans, btree_id, b->c.level + 1,
- bucket_start, bucket_end,
- last_flushed, k);
+ ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -559,7 +560,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
si_meminfo(&i);
mem_bytes = i.totalram * i.mem_unit;
- return div_u64(mem_bytes >> 1, btree_bytes(c));
+ return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
}
static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
@@ -610,43 +611,35 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
}
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
- struct bpos bucket_start,
- struct bpos bucket_end)
+ struct extents_to_bp_state *s)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- enum btree_id btree_id;
- struct bkey_s_c k;
- struct bkey_buf last_flushed;
int ret = 0;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
+ for (enum btree_id btree_id = 0;
+ btree_id < btree_id_nr_alive(c);
+ btree_id++) {
int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
- check_btree_root_to_backpointers(trans, btree_id,
- bucket_start, bucket_end,
- &last_flushed, &level));
+ check_btree_root_to_backpointers(trans, s, btree_id, &level));
if (ret)
return ret;
while (level >= depth) {
+ struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
level,
BTREE_ITER_PREFETCH);
while (1) {
bch2_trans_begin(trans);
- k = bch2_btree_iter_peek(&iter);
+
+ struct bkey_s_c k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
ret = bkey_err(k) ?:
- check_extent_to_backpointers(trans, btree_id, level,
- bucket_start, bucket_end,
- &last_flushed, k) ?:
+ check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
@@ -668,7 +661,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
}
}
- bch2_bkey_buf_exit(&last_flushed, c);
return 0;
}
@@ -731,37 +723,43 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct bpos start = POS_MIN, end;
+ struct extents_to_bp_state s = { .bucket_start = POS_MIN };
int ret;
+ bch2_bkey_buf_init(&s.last_flushed);
+ bkey_init(&s.last_flushed.k->k);
+
while (1) {
- ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
+ ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
if (ret)
break;
- if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX))
+ if ( bpos_eq(s.bucket_start, POS_MIN) &&
+ !bpos_eq(s.bucket_end, SPOS_MAX))
bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
__func__, btree_nodes_fit_in_ram(c));
- if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) {
+ if (!bpos_eq(s.bucket_start, POS_MIN) ||
+ !bpos_eq(s.bucket_end, SPOS_MAX)) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "check_extents_to_backpointers(): ");
- bch2_bpos_to_text(&buf, start);
+ bch2_bpos_to_text(&buf, s.bucket_start);
prt_str(&buf, "-");
- bch2_bpos_to_text(&buf, end);
+ bch2_bpos_to_text(&buf, s.bucket_end);
bch_verbose(c, "%s", buf.buf);
printbuf_exit(&buf);
}
- ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
- if (ret || bpos_eq(end, SPOS_MAX))
+ ret = bch2_check_extents_to_backpointers_pass(trans, &s);
+ if (ret || bpos_eq(s.bucket_end, SPOS_MAX))
break;
- start = bpos_successor(end);
+ s.bucket_start = bpos_successor(s.bucket_end);
}
bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&s.last_flushed, c);
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 737e2396ade7..327365a9feac 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index dac383e37181..b80c6c9efd8c 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -1204,11 +1204,6 @@ static inline unsigned block_sectors(const struct bch_fs *c)
return c->opts.block_size >> 9;
}
-static inline size_t btree_sectors(const struct bch_fs *c)
-{
- return c->opts.btree_node_size >> 9;
-}
-
static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
{
return c->btree_key_cache_btrees & (1U << btree);
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 0d5ac4184fbc..0668b682a21c 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -417,600 +417,12 @@ struct bch_set {
struct bch_val v;
};
-/* Extents */
-
-/*
- * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
- * preceded by checksum/compression information (bch_extent_crc32 or
- * bch_extent_crc64).
- *
- * One major determining factor in the format of extents is how we handle and
- * represent extents that have been partially overwritten and thus trimmed:
- *
- * If an extent is not checksummed or compressed, when the extent is trimmed we
- * don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the data that is currently
- * live. The size field in struct bkey records the current (live) size of the
- * extent, and is also used to mean "size of region on disk that we point to" in
- * this case.
- *
- * Thus an extent that is not checksummed or compressed will consist only of a
- * list of bch_extent_ptrs, with none of the fields in
- * bch_extent_crc32/bch_extent_crc64.
- *
- * When an extent is checksummed or compressed, it's not possible to read only
- * the data that is currently live: we have to read the entire extent that was
- * originally written, and then return only the part of the extent that is
- * currently live.
- *
- * Thus, in addition to the current size of the extent in struct bkey, we need
- * to store the size of the originally allocated space - this is the
- * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
- * when the extent is trimmed, instead of modifying the offset field of the
- * pointer, we keep a second smaller offset field - "offset into the original
- * extent of the currently live region".
- *
- * The other major determining factor is replication and data migration:
- *
- * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
- * write, we will initially write all the replicas in the same format, with the
- * same checksum type and compression format - however, when copygc runs later (or
- * tiering/cache promotion, anything that moves data), it is not in general
- * going to rewrite all the pointers at once - one of the replicas may be in a
- * bucket on one device that has very little fragmentation while another lives
- * in a bucket that has become heavily fragmented, and thus is being rewritten
- * sooner than the rest.
- *
- * Thus it will only move a subset of the pointers (or in the case of
- * tiering/cache promotion perhaps add a single pointer without dropping any
- * current pointers), and if the extent has been partially overwritten it must
- * write only the currently live portion (or copygc would not be able to reduce
- * fragmentation!) - which necessitates a different bch_extent_crc format for
- * the new pointer.
- *
- * But in the interests of space efficiency, we don't want to store one
- * bch_extent_crc for each pointer if we don't have to.
- *
- * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
- * bch_extent_ptrs appended arbitrarily one after the other. We determine the
- * type of a given entry with a scheme similar to utf8 (except we're encoding a
- * type, not a size), encoding the type in the position of the first set bit:
- *
- * bch_extent_crc32 - 0b1
- * bch_extent_ptr - 0b10
- * bch_extent_crc64 - 0b100
- *
- * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
- * bch_extent_crc64 is the least constrained).
- *
- * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
- * until the next bch_extent_crc32/64.
- *
- * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
- * is neither checksummed nor compressed.
- */
-
/* 128 bits, sufficient for cryptographic MACs: */
struct bch_csum {
__le64 lo;
__le64 hi;
} __packed __aligned(8);
-#define BCH_EXTENT_ENTRY_TYPES() \
- x(ptr, 0) \
- x(crc32, 1) \
- x(crc64, 2) \
- x(crc128, 3) \
- x(stripe_ptr, 4) \
- x(rebalance, 5)
-#define BCH_EXTENT_ENTRY_MAX 6
-
-enum bch_extent_entry_type {
-#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-/* Compressed/uncompressed size are stored biased by 1: */
-struct bch_extent_crc32 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u32 type:2,
- _compressed_size:7,
- _uncompressed_size:7,
- offset:7,
- _unused:1,
- csum_type:4,
- compression_type:4;
- __u32 csum;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u32 csum;
- __u32 compression_type:4,
- csum_type:4,
- _unused:1,
- offset:7,
- _uncompressed_size:7,
- _compressed_size:7,
- type:2;
-#endif
-} __packed __aligned(8);
-
-#define CRC32_SIZE_MAX (1U << 7)
-#define CRC32_NONCE_MAX 0
-
-struct bch_extent_crc64 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:3,
- _compressed_size:9,
- _uncompressed_size:9,
- offset:9,
- nonce:10,
- csum_type:4,
- compression_type:4,
- csum_hi:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 csum_hi:16,
- compression_type:4,
- csum_type:4,
- nonce:10,
- offset:9,
- _uncompressed_size:9,
- _compressed_size:9,
- type:3;
-#endif
- __u64 csum_lo;
-} __packed __aligned(8);
-
-#define CRC64_SIZE_MAX (1U << 9)
-#define CRC64_NONCE_MAX ((1U << 10) - 1)
-
-struct bch_extent_crc128 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:4,
- _compressed_size:13,
- _uncompressed_size:13,
- offset:13,
- nonce:13,
- csum_type:4,
- compression_type:4;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 compression_type:4,
- csum_type:4,
- nonce:13,
- offset:13,
- _uncompressed_size:13,
- _compressed_size:13,
- type:4;
-#endif
- struct bch_csum csum;
-} __packed __aligned(8);
-
-#define CRC128_SIZE_MAX (1U << 13)
-#define CRC128_NONCE_MAX ((1U << 13) - 1)
-
-/*
- * @reservation - pointer hasn't been written to, just reserved
- */
-struct bch_extent_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:1,
- cached:1,
- unused:1,
- unwritten:1,
- offset:44, /* 8 petabytes */
- dev:8,
- gen:8;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 gen:8,
- dev:8,
- offset:44,
- unwritten:1,
- unused:1,
- cached:1,
- type:1;
-#endif
-} __packed __aligned(8);
-
-struct bch_extent_stripe_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:5,
- block:8,
- redundancy:4,
- idx:47;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 idx:47,
- redundancy:4,
- block:8,
- type:5;
-#endif
-};
-
-struct bch_extent_rebalance {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:6,
- unused:34,
- compression:8, /* enum bch_compression_opt */
- target:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 target:16,
- compression:8,
- unused:34,
- type:6;
-#endif
-};
-
-union bch_extent_entry {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
- unsigned long type;
-#elif __BITS_PER_LONG == 32
- struct {
- unsigned long pad;
- unsigned long type;
- };
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define x(f, n) struct bch_extent_##f f;
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-struct bch_btree_ptr {
- struct bch_val v;
-
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-struct bch_btree_ptr_v2 {
- struct bch_val v;
-
- __u64 mem_ptr;
- __le64 seq;
- __le16 sectors_written;
- __le16 flags;
- struct bpos min_key;
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
-
-struct bch_extent {
- struct bch_val v;
-
- __u64 _data[0];
- union bch_extent_entry start[];
-} __packed __aligned(8);
-
-struct bch_reservation {
- struct bch_val v;
-
- __le32 generation;
- __u8 nr_replicas;
- __u8 pad[3];
-} __packed __aligned(8);
-
-/* Maximum size (in u64s) a single pointer could be: */
-#define BKEY_EXTENT_PTR_U64s_MAX\
- ((sizeof(struct bch_extent_crc128) + \
- sizeof(struct bch_extent_ptr)) / sizeof(__u64))
-
-/* Maximum possible size of an entire extent value: */
-#define BKEY_EXTENT_VAL_U64s_MAX \
- (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
-
-/* * Maximum possible size of an entire extent, key + value: */
-#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
-
-/* Btree pointers don't carry around checksums: */
-#define BKEY_BTREE_PTR_VAL_U64s_MAX \
- ((sizeof(struct bch_btree_ptr_v2) + \
- sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
-#define BKEY_BTREE_PTR_U64s_MAX \
- (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
-
-/* Inodes */
-
-#define BLOCKDEV_INODE_MAX 4096
-
-#define BCACHEFS_ROOT_INO 4096
-
-struct bch_inode {
- struct bch_val v;
-
- __le64 bi_hash_seed;
- __le32 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v2 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v3 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le64 bi_sectors;
- __le64 bi_size;
- __le64 bi_version;
- __u8 fields[];
-} __packed __aligned(8);
-
-#define INODEv3_FIELDS_START_INITIAL 6
-#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
-
-struct bch_inode_generation {
- struct bch_val v;
-
- __le32 bi_generation;
- __le32 pad;
-} __packed __aligned(8);
-
-/*
- * bi_subvol and bi_parent_subvol are only set for subvolume roots:
- */
-
-#define BCH_INODE_FIELDS_v2() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_size, 64) \
- x(bi_sectors, 64) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32)
-
-#define BCH_INODE_FIELDS_v3() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32) \
- x(bi_nocow, 8)
-
-/* subset of BCH_INODE_FIELDS */
-#define BCH_INODE_OPTS() \
- x(data_checksum, 8) \
- x(compression, 8) \
- x(project, 32) \
- x(background_compression, 8) \
- x(data_replicas, 8) \
- x(promote_target, 16) \
- x(foreground_target, 16) \
- x(background_target, 16) \
- x(erasure_code, 16) \
- x(nocow, 8)
-
-enum inode_opt_id {
-#define x(name, ...) \
- Inode_opt_##name,
- BCH_INODE_OPTS()
-#undef x
- Inode_opt_nr,
-};
-
-#define BCH_INODE_FLAGS() \
- x(sync, 0) \
- x(immutable, 1) \
- x(append, 2) \
- x(nodump, 3) \
- x(noatime, 4) \
- x(i_size_dirty, 5) \
- x(i_sectors_dirty, 6) \
- x(unlinked, 7) \
- x(backptr_untrusted, 8)
-
-/* bits 20+ reserved for packed fields below: */
-
-enum bch_inode_flags {
-#define x(t, n) BCH_INODE_##t = 1U << n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-enum __bch_inode_flags {
-#define x(t, n) __BCH_INODE_##t = n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
-
-LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
-LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
-LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_FIELDS_START,
- struct bch_inode_v3, bi_flags, 31, 36);
-LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
-
-/* Dirents */
-
-/*
- * Dirents (and xattrs) have to implement string lookups; since our b-tree
- * doesn't support arbitrary length strings for the key, we instead index by a
- * 64 bit hash (currently truncated sha1) of the string, stored in the offset
- * field of the key - using linear probing to resolve hash collisions. This also
- * provides us with the readdir cookie posix requires.
- *
- * Linear probing requires us to use whiteouts for deletions, in the event of a
- * collision:
- */
-
-struct bch_dirent {
- struct bch_val v;
-
- /* Target inode number: */
- union {
- __le64 d_inum;
- struct { /* DT_SUBVOL */
- __le32 d_child_subvol;
- __le32 d_parent_subvol;
- };
- };
-
- /*
- * Copy of mode bits 12-15 from the target inode - so userspace can get
- * the filetype without having to do a stat()
- */
- __u8 d_type;
-
- __u8 d_name[];
-} __packed __aligned(8);
-
-#define DT_SUBVOL 16
-#define BCH_DT_MAX 17
-
-#define BCH_NAME_MAX 512
-
-/* Xattrs */
-
-#define KEY_TYPE_XATTR_INDEX_USER 0
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
-#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
-#define KEY_TYPE_XATTR_INDEX_SECURITY 4
-
-struct bch_xattr {
- struct bch_val v;
- __u8 x_type;
- __u8 x_name_len;
- __le16 x_val_len;
- __u8 x_name[];
-} __packed __aligned(8);
-
-/* Bucket/allocation information: */
-
-struct bch_alloc {
- struct bch_val v;
- __u8 fields;
- __u8 gen;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V1() \
- x(read_time, 16) \
- x(write_time, 16) \
- x(data_type, 8) \
- x(dirty_sectors, 16) \
- x(cached_sectors, 16) \
- x(oldest_gen, 8) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
- BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bch_alloc_v2 {
- struct bch_val v;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V2() \
- x(read_time, 64) \
- x(write_time, 64) \
- x(dirty_sectors, 32) \
- x(cached_sectors, 32) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-struct bch_alloc_v3 {
- struct bch_val v;
- __le64 journal_seq;
- __le32 flags;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
-LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
-
-struct bch_alloc_v4 {
- struct bch_val v;
- __u64 journal_seq;
- __u32 flags;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 stripe_redundancy;
- __u32 dirty_sectors;
- __u32 cached_sectors;
- __u64 io_time[2];
- __u32 stripe;
- __u32 nr_external_backpointers;
- __u64 fragmentation_lru;
-} __packed __aligned(8);
-
-#define BCH_ALLOC_V4_U64s_V0 6
-#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
-
-BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
-BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
-BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
-BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
-
-#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40
-
struct bch_backpointer {
struct bch_val v;
__u8 btree_id;
@@ -1021,154 +433,6 @@ struct bch_backpointer {
struct bpos pos;
} __packed __aligned(8);
-#define KEY_TYPE_BUCKET_GENS_BITS 8
-#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
-#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
-
-struct bch_bucket_gens {
- struct bch_val v;
- u8 gens[KEY_TYPE_BUCKET_GENS_NR];
-} __packed __aligned(8);
-
-/* Quotas: */
-
-enum quota_types {
- QTYP_USR = 0,
- QTYP_GRP = 1,
- QTYP_PRJ = 2,
- QTYP_NR = 3,
-};
-
-enum quota_counters {
- Q_SPC = 0,
- Q_INO = 1,
- Q_COUNTERS = 2,
-};
-
-struct bch_quota_counter {
- __le64 hardlimit;
- __le64 softlimit;
-};
-
-struct bch_quota {
- struct bch_val v;
- struct bch_quota_counter c[Q_COUNTERS];
-} __packed __aligned(8);
-
-/* Erasure coding */
-
-struct bch_stripe {
- struct bch_val v;
- __le16 sectors;
- __u8 algorithm;
- __u8 nr_blocks;
- __u8 nr_redundant;
-
- __u8 csum_granularity_bits;
- __u8 csum_type;
- __u8 pad;
-
- struct bch_extent_ptr ptrs[];
-} __packed __aligned(8);
-
-/* Reflink: */
-
-struct bch_reflink_p {
- struct bch_val v;
- __le64 idx;
- /*
- * A reflink pointer might point to an indirect extent which is then
- * later split (by copygc or rebalance). If we only pointed to part of
- * the original indirect extent, and then one of the fragments is
- * outside the range we point to, we'd leak a refcount: so when creating
- * reflink pointers, we need to store pad values to remember the full
- * range we were taking a reference on.
- */
- __le32 front_pad;
- __le32 back_pad;
-} __packed __aligned(8);
-
-struct bch_reflink_v {
- struct bch_val v;
- __le64 refcount;
- union bch_extent_entry start[0];
- __u64 _data[];
-} __packed __aligned(8);
-
-struct bch_indirect_inline_data {
- struct bch_val v;
- __le64 refcount;
- u8 data[];
-};
-
-/* Inline data */
-
-struct bch_inline_data {
- struct bch_val v;
- u8 data[];
-};
-
-/* Subvolumes: */
-
-#define SUBVOL_POS_MIN POS(0, 1)
-#define SUBVOL_POS_MAX POS(0, S32_MAX)
-#define BCACHEFS_ROOT_SUBVOL 1
-
-struct bch_subvolume {
- struct bch_val v;
- __le32 flags;
- __le32 snapshot;
- __le64 inode;
- /*
- * Snapshot subvolumes form a tree, separate from the snapshot nodes
- * tree - if this subvolume is a snapshot, this is the ID of the
- * subvolume it was created from:
- */
- __le32 parent;
- __le32 pad;
- bch_le128 otime;
-};
-
-LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
-/*
- * We need to know whether a subvolume is a snapshot so we can know whether we
- * can delete it (or whether it should just be rm -rf'd)
- */
-LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
-LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
-
-/* Snapshots */
-
-struct bch_snapshot {
- struct bch_val v;
- __le32 flags;
- __le32 parent;
- __le32 children[2];
- __le32 subvol;
- /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
- __le32 tree;
- __le32 depth;
- __le32 skip[3];
-};
-
-LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
-
-/* True if a subvolume points to this snapshot node: */
-LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
-
-/*
- * Snapshot trees:
- *
- * The snapshot_trees btree gives us persistent indentifier for each tree of
- * bch_snapshot nodes, and allow us to record and easily find the root/master
- * subvolume that other snapshots were created from:
- */
-struct bch_snapshot_tree {
- struct bch_val v;
- __le32 master_subvol;
- __le32 root_snapshot;
-};
-
/* LRU btree: */
struct bch_lru {
@@ -1178,33 +442,6 @@ struct bch_lru {
#define LRU_ID_STRIPES (1U << 16)
-/* Logged operations btree: */
-
-struct bch_logged_op_truncate {
- struct bch_val v;
- __le32 subvol;
- __le32 pad;
- __le64 inum;
- __le64 new_i_size;
-};
-
-enum logged_op_finsert_state {
- LOGGED_OP_FINSERT_start,
- LOGGED_OP_FINSERT_shift_extents,
- LOGGED_OP_FINSERT_finish,
-};
-
-struct bch_logged_op_finsert {
- struct bch_val v;
- __u8 state;
- __u8 pad[3];
- __le32 subvol;
- __le64 inum;
- __le64 dst_offset;
- __le64 src_offset;
- __le64 pos;
-};
-
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -1230,6 +467,19 @@ struct bch_sb_field {
x(ext, 13) \
x(downgrade, 14)
+#include "alloc_background_format.h"
+#include "extents_format.h"
+#include "reflink_format.h"
+#include "ec_format.h"
+#include "inode_format.h"
+#include "dirent_format.h"
+#include "xattr_format.h"
+#include "quota_format.h"
+#include "logged_ops_format.h"
+#include "snapshot_format.h"
+#include "subvolume_format.h"
+#include "sb-counters_format.h"
+
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
BCH_SB_FIELDS()
@@ -1465,23 +715,6 @@ struct bch_sb_field_replicas {
struct bch_replicas_entry_v1 entries[];
} __packed __aligned(8);
-/* BCH_SB_FIELD_quota: */
-
-struct bch_sb_quota_counter {
- __le32 timelimit;
- __le32 warnlimit;
-};
-
-struct bch_sb_quota_type {
- __le64 flags;
- struct bch_sb_quota_counter c[Q_COUNTERS];
-};
-
-struct bch_sb_field_quota {
- struct bch_sb_field field;
- struct bch_sb_quota_type q[QTYP_NR];
-} __packed __aligned(8);
-
/* BCH_SB_FIELD_disk_groups: */
#define BCH_SB_LABEL_SIZE 32
@@ -1500,101 +733,6 @@ struct bch_sb_field_disk_groups {
struct bch_disk_group entries[];
} __packed __aligned(8);
-/* BCH_SB_FIELD_counters */
-
-#define BCH_PERSISTENT_COUNTERS() \
- x(io_read, 0) \
- x(io_write, 1) \
- x(io_move, 2) \
- x(bucket_invalidate, 3) \
- x(bucket_discard, 4) \
- x(bucket_alloc, 5) \
- x(bucket_alloc_fail, 6) \
- x(btree_cache_scan, 7) \
- x(btree_cache_reap, 8) \
- x(btree_cache_cannibalize, 9) \
- x(btree_cache_cannibalize_lock, 10) \
- x(btree_cache_cannibalize_lock_fail, 11) \
- x(btree_cache_cannibalize_unlock, 12) \
- x(btree_node_write, 13) \
- x(btree_node_read, 14) \
- x(btree_node_compact, 15) \
- x(btree_node_merge, 16) \
- x(btree_node_split, 17) \
- x(btree_node_rewrite, 18) \
- x(btree_node_alloc, 19) \
- x(btree_node_free, 20) \
- x(btree_node_set_root, 21) \
- x(btree_path_relock_fail, 22) \
- x(btree_path_upgrade_fail, 23) \
- x(btree_reserve_get_fail, 24) \
- x(journal_entry_full, 25) \
- x(journal_full, 26) \
- x(journal_reclaim_finish, 27) \
- x(journal_reclaim_start, 28) \
- x(journal_write, 29) \
- x(read_promote, 30) \
- x(read_bounce, 31) \
- x(read_split, 33) \
- x(read_retry, 32) \
- x(read_reuse_race, 34) \
- x(move_extent_read, 35) \
- x(move_extent_write, 36) \
- x(move_extent_finish, 37) \
- x(move_extent_fail, 38) \
- x(move_extent_start_fail, 39) \
- x(copygc, 40) \
- x(copygc_wait, 41) \
- x(gc_gens_end, 42) \
- x(gc_gens_start, 43) \
- x(trans_blocked_journal_reclaim, 44) \
- x(trans_restart_btree_node_reused, 45) \
- x(trans_restart_btree_node_split, 46) \
- x(trans_restart_fault_inject, 47) \
- x(trans_restart_iter_upgrade, 48) \
- x(trans_restart_journal_preres_get, 49) \
- x(trans_restart_journal_reclaim, 50) \
- x(trans_restart_journal_res_get, 51) \
- x(trans_restart_key_cache_key_realloced, 52) \
- x(trans_restart_key_cache_raced, 53) \
- x(trans_restart_mark_replicas, 54) \
- x(trans_restart_mem_realloced, 55) \
- x(trans_restart_memory_allocation_failure, 56) \
- x(trans_restart_relock, 57) \
- x(trans_restart_relock_after_fill, 58) \
- x(trans_restart_relock_key_cache_fill, 59) \
- x(trans_restart_relock_next_node, 60) \
- x(trans_restart_relock_parent_for_fill, 61) \
- x(trans_restart_relock_path, 62) \
- x(trans_restart_relock_path_intent, 63) \
- x(trans_restart_too_many_iters, 64) \
- x(trans_restart_traverse, 65) \
- x(trans_restart_upgrade, 66) \
- x(trans_restart_would_deadlock, 67) \
- x(trans_restart_would_deadlock_write, 68) \
- x(trans_restart_injected, 69) \
- x(trans_restart_key_cache_upgrade, 70) \
- x(trans_traverse_all, 71) \
- x(transaction_commit, 72) \
- x(write_super, 73) \
- x(trans_restart_would_deadlock_recursion_limit, 74) \
- x(trans_restart_write_buffer_flush, 75) \
- x(trans_restart_split_race, 76) \
- x(write_buffer_flush_slowpath, 77) \
- x(write_buffer_flush_sync, 78)
-
-enum bch_persistent_counters {
-#define x(t, n, ...) BCH_COUNTER_##t,
- BCH_PERSISTENT_COUNTERS()
-#undef x
- BCH_COUNTER_NR
-};
-
-struct bch_sb_field_counters {
- struct bch_sb_field field;
- __le64 d[];
-};
-
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index abdb05507d16..76e79a15ba08 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -33,7 +33,7 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out,
next_key_bits -= 64;
}
- bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits));
+ bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits));
if (!next_key_bits)
break;
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 761f5e33b1e6..5e52684764eb 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -63,8 +63,17 @@ static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
+static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k);
+
+ prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie));
+}
+
#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
.key_invalid = key_type_cookie_invalid, \
+ .val_to_text = key_type_cookie_to_text, \
.min_val_size = 8, \
})
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index ee82283722b7..03efe8ee565a 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -83,9 +83,10 @@ enum btree_update_flags {
__BTREE_TRIGGER_NORUN,
__BTREE_TRIGGER_TRANSACTIONAL,
+ __BTREE_TRIGGER_ATOMIC,
+ __BTREE_TRIGGER_GC,
__BTREE_TRIGGER_INSERT,
__BTREE_TRIGGER_OVERWRITE,
- __BTREE_TRIGGER_GC,
__BTREE_TRIGGER_BUCKET_INVALIDATE,
};
@@ -107,6 +108,10 @@ enum btree_update_flags {
* causing us to go emergency read-only)
*/
#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL)
+#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC)
+
+/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */
+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
/* @new is entering the btree */
#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
@@ -114,9 +119,6 @@ enum btree_update_flags {
/* @old is leaving the btree */
#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
-/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */
-#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
-
/* signal from bucket invalidate path to alloc trigger */
#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 74bf8eb90a4c..3fd1085b6c61 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -720,7 +720,7 @@ static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
struct bkey_i min_key, max_key;
- unsigned j, cacheline = 1;
+ unsigned cacheline = 1;
t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
bset_ro_tree_capacity(b, t));
@@ -823,13 +823,12 @@ void bch2_bset_init_first(struct btree *b, struct bset *i)
set_btree_bset(b, t, i);
}
-void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
- struct btree_node_entry *bne)
+void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne)
{
struct bset *i = &bne->keys;
struct bset_tree *t;
- BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
+ BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b));
BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(b->nsets >= MAX_BSETS);
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 632c2b8c5460..79c77baaa383 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -264,8 +264,7 @@ static inline struct bset *bset_next_set(struct btree *b,
void bch2_btree_keys_init(struct btree *);
void bch2_bset_init_first(struct btree *, struct bset *);
-void bch2_bset_init_next(struct bch_fs *, struct btree *,
- struct btree_node_entry *);
+void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
void bch2_bset_insert(struct btree *, struct btree_node_iter *,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 8e2488a4b58d..d7c81beac14a 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
clear_btree_node_just_written(b);
- kvpfree(b->data, btree_bytes(c));
+ kvpfree(b->data, btree_buf_bytes(b));
b->data = NULL;
#ifdef __KERNEL__
kvfree(b->aux_data);
@@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
- b->data = kvpmalloc(btree_bytes(c), gfp);
+ b->data = kvpmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
@@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->aux_data = NULL;
#endif
if (!b->aux_data) {
- kvpfree(b->data, btree_bytes(c));
+ kvpfree(b->data, btree_buf_bytes(b));
b->data = NULL;
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
}
@@ -126,7 +126,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
bkey_btree_ptr_init(&b->key);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
- b->byte_order = ilog2(btree_bytes(c));
+ b->byte_order = ilog2(c->opts.btree_node_size);
return b;
}
@@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (c->verify_data)
list_move(&c->verify_data->list, &bc->live);
- kvpfree(c->verify_ondisk, btree_bytes(c));
+ kvpfree(c->verify_ondisk, c->opts.btree_node_size);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
@@ -1192,7 +1192,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
" failed unpacked %zu\n",
b->unpack_fn_len,
b->nr.live_u64s * sizeof(u64),
- btree_bytes(c) - sizeof(struct btree_node),
+ btree_buf_bytes(b) - sizeof(struct btree_node),
b->nr.live_u64s * 100 / btree_max_u64s(c),
b->sib_u64s[0],
b->sib_u64s[1],
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 4e1af5882052..6d33885fdbde 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -74,22 +74,27 @@ static inline bool btree_node_hashed(struct btree *b)
_iter = 0; _iter < (_tbl)->size; _iter++) \
rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
-static inline size_t btree_bytes(struct bch_fs *c)
+static inline size_t btree_buf_bytes(const struct btree *b)
{
- return c->opts.btree_node_size;
+ return 1UL << b->byte_order;
}
-static inline size_t btree_max_u64s(struct bch_fs *c)
+static inline size_t btree_buf_max_u64s(const struct btree *b)
{
- return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
+ return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64);
}
-static inline size_t btree_pages(struct bch_fs *c)
+static inline size_t btree_max_u64s(const struct bch_fs *c)
{
- return btree_bytes(c) / PAGE_SIZE;
+ return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64);
}
-static inline unsigned btree_blocks(struct bch_fs *c)
+static inline size_t btree_sectors(const struct bch_fs *c)
+{
+ return c->opts.btree_node_size >> SECTOR_SHIFT;
+}
+
+static inline unsigned btree_blocks(const struct bch_fs *c)
{
return btree_sectors(c) >> c->block_bits;
}
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 49b4ade758c3..1102995643b1 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -597,7 +597,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
@@ -615,7 +615,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
@@ -637,7 +637,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
@@ -649,7 +649,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
@@ -664,8 +664,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[g->data_type],
- bch2_data_types[data_type],
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (data_type == BCH_DATA_btree) {
@@ -1238,11 +1238,11 @@ static int bch2_gc_done(struct bch_fs *c,
for (i = 0; i < BCH_DATA_NR; i++) {
copy_dev_field(dev_usage_buckets_wrong,
- d[i].buckets, "%s buckets", bch2_data_types[i]);
+ d[i].buckets, "%s buckets", bch2_data_type_str(i));
copy_dev_field(dev_usage_sectors_wrong,
- d[i].sectors, "%s sectors", bch2_data_types[i]);
+ d[i].sectors, "%s sectors", bch2_data_type_str(i));
copy_dev_field(dev_usage_fragmented_wrong,
- d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+ d[i].fragmented, "%s fragmented", bch2_data_type_str(i));
}
}
@@ -1253,19 +1253,19 @@ static int bch2_gc_done(struct bch_fs *c,
bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
copy_fs_field(fs_usage_hidden_wrong,
- hidden, "hidden");
+ b.hidden, "hidden");
copy_fs_field(fs_usage_btree_wrong,
- btree, "btree");
+ b.btree, "btree");
if (!metadata_only) {
copy_fs_field(fs_usage_data_wrong,
- data, "data");
+ b.data, "data");
copy_fs_field(fs_usage_cached_wrong,
- cached, "cached");
+ b.cached, "cached");
copy_fs_field(fs_usage_reserved_wrong,
- reserved, "reserved");
+ b.reserved, "reserved");
copy_fs_field(fs_usage_nr_inodes_wrong,
- nr_inodes,"nr_inodes");
+ b.nr_inodes,"nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(fs_usage_persistent_reserved_wrong,
@@ -1417,8 +1417,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
": got %s, should be %s",
iter->pos.inode, iter->pos.offset,
gc.gen,
- bch2_data_types[new.data_type],
- bch2_data_types[gc.data_type]))
+ bch2_data_type_str(new.data_type),
+ bch2_data_type_str(gc.data_type)))
new.data_type = gc.data_type;
#define copy_bucket_field(_errtype, _f) \
@@ -1428,7 +1428,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
gc.gen, \
- bch2_data_types[gc.data_type], \
+ bch2_data_type_str(gc.data_type), \
new._f, gc._f)) \
new._f = gc._f; \
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 33db48e2153f..aa9b6cbe3226 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -112,7 +112,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
unsigned flags = memalloc_nofs_save();
void *p;
- BUG_ON(size > btree_bytes(c));
+ BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
@@ -174,8 +174,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
- for (k = unwritten_whiteouts_start(c, b);
- k != unwritten_whiteouts_end(c, b);
+ for (k = unwritten_whiteouts_start(b);
+ k != unwritten_whiteouts_end(b);
k = bkey_p_next(k))
*--ptrs = k;
@@ -192,7 +192,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
verify_no_dups(b, new_whiteouts,
(void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
- memcpy_u64s(unwritten_whiteouts_start(c, b),
+ memcpy_u64s(unwritten_whiteouts_start(b),
new_whiteouts, b->whiteout_u64s);
btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
@@ -313,7 +313,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
}
bytes = sorting_entire_node
- ? btree_bytes(c)
+ ? btree_buf_bytes(b)
: __vstruct_bytes(struct btree_node, u64s);
out = btree_bounce_alloc(c, bytes, &used_mempool);
@@ -338,7 +338,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
if (sorting_entire_node) {
u64s = le16_to_cpu(out->keys.u64s);
- BUG_ON(bytes != btree_bytes(c));
+ BUG_ON(bytes != btree_buf_bytes(b));
/*
* Our temporary buffer is the same size as the btree node's
@@ -502,7 +502,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(c, b, bne);
+ bch2_bset_init_next(b, bne);
bch2_btree_build_aux_trees(b);
@@ -1160,7 +1160,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ptr_written, b->written);
} else {
for (bne = write_block(b);
- bset_byte_offset(b, bne) < btree_bytes(c);
+ bset_byte_offset(b, bne) < btree_buf_bytes(b);
bne = (void *) bne + block_bytes(c))
btree_err_on(bne->keys.seq == b->data->keys.seq &&
!bch2_journal_seq_is_blacklisted(c,
@@ -1172,7 +1172,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"found bset signature after last bset");
}
- sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
+ sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
sorted->keys.u64s = 0;
set_btree_bset(b, b->set, &b->data->keys);
@@ -1188,7 +1188,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(b->nr.live_u64s != u64s);
- btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
+ btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
if (updated_range)
bch2_btree_node_drop_keys_outside_node(b);
@@ -1284,7 +1284,7 @@ static void btree_node_read_work(struct work_struct *work)
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
- bio->bi_iter.bi_size = btree_bytes(c);
+ bio->bi_iter.bi_size = btree_buf_bytes(b);
if (rb->have_ioref) {
bio_set_dev(bio, ca->disk_sb.bdev);
@@ -1512,7 +1512,7 @@ fsck_err:
}
if (best >= 0) {
- memcpy(b->data, ra->buf[best], btree_bytes(c));
+ memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
} else {
ret = -1;
@@ -1578,7 +1578,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
for (i = 0; i < ra->nr; i++) {
ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
ra->bio[i] = bio_alloc_bioset(NULL,
- buf_pages(ra->buf[i], btree_bytes(c)),
+ buf_pages(ra->buf[i], btree_buf_bytes(b)),
REQ_OP_READ|REQ_SYNC|REQ_META,
GFP_NOFS,
&c->btree_bio);
@@ -1598,7 +1598,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
rb->pick = pick;
rb->bio.bi_iter.bi_sector = pick.ptr.offset;
rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
- bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
+ bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
@@ -1665,7 +1665,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
bio = bio_alloc_bioset(NULL,
- buf_pages(b->data, btree_bytes(c)),
+ buf_pages(b->data, btree_buf_bytes(b)),
REQ_OP_READ|REQ_SYNC|REQ_META,
GFP_NOFS,
&c->btree_bio);
@@ -1679,7 +1679,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_end_io = btree_node_read_endio;
- bch2_bio_map(bio, b->data, btree_bytes(c));
+ bch2_bio_map(bio, b->data, btree_buf_bytes(b));
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
@@ -2074,8 +2074,8 @@ do_write:
i->u64s = 0;
sort_iter_add(&sort_iter.iter,
- unwritten_whiteouts_start(c, b),
- unwritten_whiteouts_end(c, b));
+ unwritten_whiteouts_start(b),
+ unwritten_whiteouts_end(b));
SET_BSET_SEPARATE_WHITEOUTS(i, false);
b->whiteout_u64s = 0;
@@ -2251,7 +2251,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(c, b, bne);
+ bch2_bset_init_next(b, bne);
bch2_btree_build_aux_trees(b);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index fa298289e016..5467a8635be1 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -1337,7 +1337,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
if (path->should_be_locked &&
!trans->restarted &&
- (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
+ (!dup || !bch2_btree_path_relock_norestart(trans, dup)))
return;
if (dup) {
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index da2b74fa63fc..24772538e4cc 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -819,6 +819,11 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
+/*
+ * This should not be used in a fastpath, without first trying _do in
+ * nonblocking mode - it will cause excessive transaction restarts and
+ * potentially livelocking:
+ */
#define drop_locks_do(_trans, _do) \
({ \
bch2_trans_unlock(_trans); \
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 2d1c95c42f24..bed75c93c069 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -631,8 +631,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans,
}
__flatten
-bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
+bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
{
struct get_locks_fail f;
@@ -642,7 +641,7 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
int __bch2_btree_path_relock(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
- if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
+ if (!bch2_btree_path_relock_norestart(trans, path)) {
trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
}
@@ -759,12 +758,39 @@ int bch2_trans_relock(struct btree_trans *trans)
if (unlikely(trans->restarted))
return -((int) trans->restarted);
- trans_for_each_path(trans, path, i)
+ trans_for_each_path(trans, path, i) {
+ struct get_locks_fail f;
+
if (path->should_be_locked &&
- !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
- trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
+ !btree_path_get_locks(trans, path, false, &f)) {
+ if (trace_trans_restart_relock_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_printf(&buf, " l=%u seq=%u node seq=",
+ f.l, path->l[f.l].lock_seq);
+ if (IS_ERR_OR_NULL(f.b)) {
+ prt_str(&buf, bch2_err_str(PTR_ERR(f.b)));
+ } else {
+ prt_printf(&buf, "%u", f.b->c.lock.seq);
+
+ struct six_lock_count c =
+ bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l);
+ prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+
+ c = six_lock_counts(&f.b->c.lock);
+ prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+ }
+
+ trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ count_event(trans->c, trans_restart_relock);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
+ }
+
return 0;
}
@@ -778,7 +804,7 @@ int bch2_trans_relock_notrace(struct btree_trans *trans)
trans_for_each_path(trans, path, i)
if (path->should_be_locked &&
- !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+ !bch2_btree_path_relock_norestart(trans, path)) {
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
return 0;
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index cc5500a957a1..4bd72c855da1 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -312,8 +312,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *,
/* relock: */
-bool bch2_btree_path_relock_norestart(struct btree_trans *,
- struct btree_path *, unsigned long);
+bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *);
int __bch2_btree_path_relock(struct btree_trans *,
struct btree_path *, unsigned long);
@@ -353,12 +352,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
/* upgrade */
-
-struct get_locks_fail {
- unsigned l;
- struct btree *b;
-};
-
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
struct btree_path *, unsigned,
struct get_locks_fail *);
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 90eb8065ff2d..30d69a6d133e 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -139,8 +139,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
- EBUG_ON(insert->k.u64s >
- bch_btree_keys_u64s_remaining(trans->c, b));
+ EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
k = bch2_btree_node_iter_peek_all(node_iter, b);
@@ -160,7 +159,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
k->type = KEY_TYPE_deleted;
if (k->needs_whiteout)
- push_whiteout(trans->c, b, insert->k.p);
+ push_whiteout(b, insert->k.p);
k->needs_whiteout = false;
if (k >= btree_bset_last(b)->start) {
@@ -348,9 +347,7 @@ static noinline void journal_transaction_name(struct btree_trans *trans)
static inline int btree_key_can_insert(struct btree_trans *trans,
struct btree *b, unsigned u64s)
{
- struct bch_fs *c = trans->c;
-
- if (!bch2_btree_node_insert_fits(c, b, u64s))
+ if (!bch2_btree_node_insert_fits(b, u64s))
return -BCH_ERR_btree_insert_btree_node_full;
return 0;
@@ -418,7 +415,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
return 0;
new_u64s = roundup_pow_of_two(u64s);
- new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_k))
return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
@@ -448,9 +445,6 @@ static int run_one_mem_trigger(struct btree_trans *trans,
if (unlikely(flags & BTREE_TRIGGER_NORUN))
return 0;
- if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
- return 0;
-
if (old_ops->trigger == new_ops->trigger) {
ret = bch2_key_trigger(trans, i->btree_id, i->level,
old, bkey_i_to_s(new),
@@ -586,9 +580,6 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
trans_for_each_update(trans, i) {
/*
* XXX: synchronization of cached update triggers with gc
@@ -596,14 +587,15 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
*/
BUG_ON(i->cached || i->level);
- if (gc_visited(c, gc_pos_btree_node(insert_l(trans, i)->b))) {
- ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+ if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) &&
+ gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) {
+ int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
if (ret)
- break;
+ return ret;
}
}
- return ret;
+ return 0;
}
static inline int
@@ -680,6 +672,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
return -BCH_ERR_btree_insert_need_mark_replicas;
+ /* XXX: we only want to run this if deltas are nonzero */
+ bch2_trans_account_disk_usage_change(trans);
+
h = trans->hooks;
while (h) {
ret = h->fn(trans, h);
@@ -689,8 +684,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
trans_for_each_update(trans, i)
- if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
- ret = run_one_mem_trigger(trans, i, i->flags);
+ if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
+ ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags);
if (ret)
goto fatal_err;
}
@@ -994,6 +989,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
!trans->journal_entries_u64s)
goto out_reset;
+ memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
+
ret = bch2_trans_commit_run_triggers(trans);
if (ret)
goto out_reset;
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index d530307046f4..4a5a64499eb7 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -430,6 +430,9 @@ struct btree_trans {
struct journal_res journal_res;
u64 *journal_seq;
struct disk_reservation *disk_res;
+
+ struct bch_fs_usage_base fs_usage_delta;
+
unsigned journal_u64s;
unsigned extra_disk_res; /* XXX kill */
struct replicas_delta_list *fs_usage_deltas;
@@ -653,7 +656,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
BIT_ULL(BKEY_TYPE_reflink)| \
BIT_ULL(BKEY_TYPE_btree))
-#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
+#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
(BIT_ULL(BKEY_TYPE_alloc)| \
BIT_ULL(BKEY_TYPE_inodes)| \
BIT_ULL(BKEY_TYPE_stripes)| \
@@ -661,7 +664,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
#define BTREE_NODE_TYPE_HAS_TRIGGERS \
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
- BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+ BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
@@ -738,4 +741,9 @@ enum btree_node_sibling {
btree_next_sib,
};
+struct get_locks_fail {
+ unsigned l;
+ struct btree *b;
+};
+
#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 44f9dfa28a09..17a5938aa71a 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -159,7 +159,7 @@ static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
{
size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
- return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
+ return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
}
/* Btree node freeing/allocation: */
@@ -1097,7 +1097,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* Always check for space for two keys, even if we won't have to
* split at prior level - it might have been a merge instead:
*/
- if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
+ if (bch2_btree_node_insert_fits(path->l[update_level].b,
BKEY_BTREE_PTR_U64s_MAX * 2))
break;
@@ -1401,7 +1401,7 @@ static void __btree_split_node(struct btree_update *as,
unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
nr_keys[i].val_u64s;
- if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c))
+ if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
n[i]->data->format = b->format;
btree_node_set_format(n[i], n[i]->data->format);
@@ -1703,7 +1703,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_btree_node_prep_for_write(trans, path, b);
- if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
+ if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
bch2_btree_node_unlock_write(trans, path, b);
goto split;
}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index adfc62083844..c593c925d1e3 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -184,21 +184,19 @@ static inline void btree_node_reset_sib_u64s(struct btree *b)
b->sib_u64s[1] = b->nr.live_u64s;
}
-static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
+static inline void *btree_data_end(struct btree *b)
{
- return (void *) b->data + btree_bytes(c);
+ return (void *) b->data + btree_buf_bytes(b);
}
-static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
- struct btree *b)
+static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
{
- return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+ return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
}
-static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
- struct btree *b)
+static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
{
- return btree_data_end(c, b);
+ return btree_data_end(b);
}
static inline void *write_block(struct btree *b)
@@ -221,13 +219,11 @@ static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
return __btree_addr_written(b, k);
}
-static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
- struct btree *b,
- void *end)
+static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
{
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
b->whiteout_u64s;
- ssize_t total = c->opts.btree_node_size >> 3;
+ ssize_t total = btree_buf_bytes(b) >> 3;
/* Always leave one extra u64 for bch2_varint_decode: */
used++;
@@ -235,10 +231,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
return total - used;
}
-static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
- struct btree *b)
+static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
{
- ssize_t remaining = __bch_btree_u64s_remaining(c, b,
+ ssize_t remaining = __bch2_btree_u64s_remaining(b,
btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(remaining < 0);
@@ -260,14 +255,13 @@ static inline unsigned btree_write_set_buffer(struct btree *b)
return 8 << BTREE_WRITE_SET_U64s_BITS;
}
-static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
- struct btree *b)
+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
{
struct bset_tree *t = bset_tree_last(b);
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
- __bch_btree_u64s_remaining(c, b, bne->keys.start);
+ __bch2_btree_u64s_remaining(b, bne->keys.start);
if (unlikely(bset_written(b, bset(b, t)))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
@@ -281,12 +275,11 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
return NULL;
}
-static inline void push_whiteout(struct bch_fs *c, struct btree *b,
- struct bpos pos)
+static inline void push_whiteout(struct btree *b, struct bpos pos)
{
struct bkey_packed k;
- BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
+ BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
EBUG_ON(btree_node_just_written(b));
if (!bkey_pack_pos(&k, pos, b)) {
@@ -299,20 +292,19 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
k.needs_whiteout = true;
b->whiteout_u64s += k.u64s;
- bkey_p_copy(unwritten_whiteouts_start(c, b), &k);
+ bkey_p_copy(unwritten_whiteouts_start(b), &k);
}
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)
*/
-static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
- struct btree *b, unsigned u64s)
+static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
{
if (unlikely(btree_node_need_rewrite(b)))
return false;
- return u64s <= bch_btree_keys_u64s_remaining(c, b);
+ return u64s <= bch2_btree_keys_u64s_remaining(b);
}
void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 5c1169c78daf..ac7844861966 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -125,13 +125,12 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
struct btree_write_buffered_key *wb,
bool *write_locked, size_t *fast)
{
- struct bch_fs *c = trans->c;
struct btree_path *path;
int ret;
EBUG_ON(!wb->journal_seq);
- EBUG_ON(!c->btree_write_buffer.flushing.pin.seq);
- EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
+ EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
+ EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
ret = bch2_btree_iter_traverse(iter);
if (ret)
@@ -155,7 +154,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
*write_locked = true;
}
- if (unlikely(!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s))) {
+ if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) {
*write_locked = false;
return wb_flush_one_slowpath(trans, iter, wb);
}
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index d83ea0e53df3..54f7826ac498 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -25,7 +25,7 @@
#include <linux/preempt.h>
-static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
enum bch_data_type data_type,
s64 sectors)
{
@@ -54,20 +54,20 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
bch2_fs_usage_acc_to_base(c, i);
for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++)
- usage->reserved += usage->persistent_reserved[i];
+ usage->b.reserved += usage->persistent_reserved[i];
for (unsigned i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
- fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
+ fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]);
}
for_each_member_device(c, ca) {
struct bch_dev_usage dev = bch2_dev_usage_read(ca);
- usage->hidden += (dev.d[BCH_DATA_sb].buckets +
- dev.d[BCH_DATA_journal].buckets) *
+ usage->b.hidden += (dev.d[BCH_DATA_sb].buckets +
+ dev.d[BCH_DATA_journal].buckets) *
ca->mi.bucket_size;
}
@@ -188,15 +188,15 @@ void bch2_fs_usage_to_text(struct printbuf *out,
prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
prt_printf(out, "hidden:\t\t\t\t%llu\n",
- fs_usage->u.hidden);
+ fs_usage->u.b.hidden);
prt_printf(out, "data:\t\t\t\t%llu\n",
- fs_usage->u.data);
+ fs_usage->u.b.data);
prt_printf(out, "cached:\t\t\t\t%llu\n",
- fs_usage->u.cached);
+ fs_usage->u.b.cached);
prt_printf(out, "reserved:\t\t\t%llu\n",
- fs_usage->u.reserved);
+ fs_usage->u.b.reserved);
prt_printf(out, "nr_inodes:\t\t\t%llu\n",
- fs_usage->u.nr_inodes);
+ fs_usage->u.b.nr_inodes);
prt_printf(out, "online reserved:\t\t%llu\n",
fs_usage->online_reserved);
@@ -225,10 +225,10 @@ static u64 reserve_factor(u64 r)
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
{
- return min(fs_usage->u.hidden +
- fs_usage->u.btree +
- fs_usage->u.data +
- reserve_factor(fs_usage->u.reserved +
+ return min(fs_usage->u.b.hidden +
+ fs_usage->u.b.btree +
+ fs_usage->u.b.data +
+ reserve_factor(fs_usage->u.b.reserved +
fs_usage->online_reserved),
c->capacity);
}
@@ -240,17 +240,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c)
u64 data, reserved;
ret.capacity = c->capacity -
- bch2_fs_usage_read_one(c, &c->usage_base->hidden);
+ bch2_fs_usage_read_one(c, &c->usage_base->b.hidden);
- data = bch2_fs_usage_read_one(c, &c->usage_base->data) +
- bch2_fs_usage_read_one(c, &c->usage_base->btree);
- reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
+ data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) +
+ bch2_fs_usage_read_one(c, &c->usage_base->b.btree);
+ reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) +
percpu_u64_get(c->online_reserved);
ret.used = min(ret.capacity, data + reserve_factor(reserved));
ret.free = ret.capacity - ret.used;
- ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
+ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes);
return ret;
}
@@ -284,7 +284,7 @@ void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
prt_newline(out);
for (unsigned i = 0; i < BCH_DATA_NR; i++) {
- prt_str(out, bch2_data_types[i]);
+ bch2_prt_data_type(out, i);
prt_tab(out);
prt_u64(out, usage->d[i].buckets);
prt_tab_rjust(out);
@@ -308,9 +308,9 @@ void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
fs_usage = fs_usage_ptr(c, journal_seq, gc);
if (data_type_is_hidden(old->data_type))
- fs_usage->hidden -= ca->mi.bucket_size;
+ fs_usage->b.hidden -= ca->mi.bucket_size;
if (data_type_is_hidden(new->data_type))
- fs_usage->hidden += ca->mi.bucket_size;
+ fs_usage->b.hidden += ca->mi.bucket_size;
u = dev_usage_ptr(ca, journal_seq, gc);
@@ -359,7 +359,7 @@ static inline int __update_replicas(struct bch_fs *c,
if (idx < 0)
return -1;
- fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
return 0;
}
@@ -394,7 +394,7 @@ int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k,
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
- fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
preempt_enable();
err:
@@ -523,8 +523,8 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (bch2_fs_inconsistent_on(g->data_type &&
g->data_type != data_type, c,
"different types of data in same bucket: %s, %s",
- bch2_data_types[g->data_type],
- bch2_data_types[data_type])) {
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type))) {
ret = -EIO;
goto err;
}
@@ -532,7 +532,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
ca->dev_idx, b, g->gen,
- bch2_data_types[g->data_type ?: data_type],
+ bch2_data_type_str(g->data_type ?: data_type),
g->dirty_sectors, sectors)) {
ret = -EIO;
goto err;
@@ -575,7 +575,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EIO;
@@ -588,7 +588,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -603,7 +603,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"while marking %s",
ptr->dev, bucket_nr, b_gen,
*bucket_gen(ca, bucket_nr),
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -624,8 +624,8 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type],
- bch2_data_types[ptr_data_type],
+ bch2_data_type_str(bucket_data_type),
+ bch2_data_type_str(ptr_data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EIO;
@@ -638,7 +638,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
bucket_sectors, sectors,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -677,11 +677,11 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans,
BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
}
- dst->nr_inodes -= deltas->nr_inodes;
+ dst->b.nr_inodes -= deltas->nr_inodes;
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
added -= deltas->persistent_reserved[i];
- dst->reserved -= deltas->persistent_reserved[i];
+ dst->b.reserved -= deltas->persistent_reserved[i];
dst->persistent_reserved[i] -= deltas->persistent_reserved[i];
}
@@ -694,48 +694,25 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans,
percpu_up_read(&c->mark_lock);
}
-int bch2_trans_fs_usage_apply(struct btree_trans *trans,
- struct replicas_delta_list *deltas)
+void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
+ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
static int warned_disk_usage = 0;
bool warn = false;
- u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
- struct replicas_delta *d, *d2;
- struct replicas_delta *top = (void *) deltas->d + deltas->used;
- struct bch_fs_usage *dst;
- s64 added = 0, should_not_have_added;
- unsigned i;
percpu_down_read(&c->mark_lock);
preempt_disable();
- dst = fs_usage_ptr(c, trans->journal_res.seq, false);
-
- for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
- switch (d->r.data_type) {
- case BCH_DATA_btree:
- case BCH_DATA_user:
- case BCH_DATA_parity:
- added += d->delta;
- }
+ struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b;
+ struct bch_fs_usage_base *src = &trans->fs_usage_delta;
- if (__update_replicas(c, dst, &d->r, d->delta))
- goto need_mark;
- }
-
- dst->nr_inodes += deltas->nr_inodes;
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
- added += deltas->persistent_reserved[i];
- dst->reserved += deltas->persistent_reserved[i];
- dst->persistent_reserved[i] += deltas->persistent_reserved[i];
- }
+ s64 added = src->btree + src->data + src->reserved;
/*
* Not allowed to reduce sectors_available except by getting a
* reservation:
*/
- should_not_have_added = added - (s64) disk_res_sectors;
+ s64 should_not_have_added = added - (s64) disk_res_sectors;
if (unlikely(should_not_have_added > 0)) {
u64 old, new, v = atomic64_read(&c->sectors_available);
@@ -754,6 +731,13 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
this_cpu_sub(*c->online_reserved, added);
}
+ dst->hidden += src->hidden;
+ dst->btree += src->btree;
+ dst->data += src->data;
+ dst->cached += src->cached;
+ dst->reserved += src->reserved;
+ dst->nr_inodes += src->nr_inodes;
+
preempt_enable();
percpu_up_read(&c->mark_lock);
@@ -761,6 +745,34 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
bch2_trans_inconsistent(trans,
"disk usage increased %lli more than %llu sectors reserved)",
should_not_have_added, disk_res_sectors);
+}
+
+int bch2_trans_fs_usage_apply(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
+{
+ struct bch_fs *c = trans->c;
+ struct replicas_delta *d, *d2;
+ struct replicas_delta *top = (void *) deltas->d + deltas->used;
+ struct bch_fs_usage *dst;
+ unsigned i;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+ for (d = deltas->d; d != top; d = replicas_delta_next(d))
+ if (__update_replicas(c, dst, &d->r, d->delta))
+ goto need_mark;
+
+ dst->b.nr_inodes += deltas->nr_inodes;
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ dst->b.reserved += deltas->persistent_reserved[i];
+ dst->persistent_reserved[i] += deltas->persistent_reserved[i];
+ }
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
return 0;
need_mark:
/* revert changes: */
@@ -1084,7 +1096,7 @@ static int __trigger_reservation(struct btree_trans *trans,
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc);
replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved));
- fs_usage->reserved += sectors;
+ fs_usage->b.reserved += sectors;
fs_usage->persistent_reserved[replicas - 1] += sectors;
preempt_enable();
@@ -1130,9 +1142,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
- bch2_data_types[type],
- bch2_data_types[type]);
+ bch2_data_type_str(a->v.data_type),
+ bch2_data_type_str(type),
+ bch2_data_type_str(type));
ret = -EIO;
goto err;
}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 2c95cc5d86be..6387e039f789 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -356,6 +356,8 @@ int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
ret; \
})
+void bch2_trans_account_disk_usage_change(struct btree_trans *);
+
void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
@@ -385,6 +387,21 @@ static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
return false;
}
+static inline const char *bch2_data_type_str(enum bch_data_type type)
+{
+ return type < BCH_DATA_NR
+ ? __bch2_data_types[type]
+ : "(invalid data type)";
+}
+
+static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type)
+{
+ if (type < BCH_DATA_NR)
+ prt_str(out, __bch2_data_types[type]);
+ else
+ prt_printf(out, "(invalid data type %u)", type);
+}
+
/* disk reservations: */
static inline void bch2_disk_reservation_put(struct bch_fs *c,
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 783f71017204..6a31740222a7 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -45,23 +45,18 @@ struct bch_dev_usage {
} d[BCH_DATA_NR];
};
-struct bch_fs_usage {
- /* all fields are in units of 512 byte sectors: */
+struct bch_fs_usage_base {
u64 hidden;
u64 btree;
u64 data;
u64 cached;
u64 reserved;
u64 nr_inodes;
+};
- /* XXX: add stats for compression ratio */
-#if 0
- u64 uncompressed;
- u64 compressed;
-#endif
-
- /* broken out: */
-
+struct bch_fs_usage {
+ /* all fields are in units of 512 byte sectors: */
+ struct bch_fs_usage_base b;
u64 persistent_reserved[BCH_REPLICAS_MAX];
u64 replicas[];
};
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index f41889093a2c..363644451106 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -109,7 +109,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
- while (1) {
+ do {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread && kthread_should_stop())
break;
@@ -119,7 +119,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
schedule();
try_to_freeze();
- }
+ } while (0);
__set_current_state(TASK_RUNNING);
del_timer_sync(&wait.cpu_timer);
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index 607fd5e232c9..58c2eb45570f 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -47,6 +47,14 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
}
+static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type)
+{
+ if (type < BCH_COMPRESSION_TYPE_NR)
+ prt_str(out, __bch2_compression_types[type]);
+ else
+ prt_printf(out, "(invalid compression type %u)", type);
+}
+
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 6f13477ff652..4150feca42a2 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -285,9 +285,7 @@ restart_drop_extra_replicas:
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p) ?:
- bch2_bkey_set_needs_rebalance(c, insert,
- op->opts.background_target,
- op->opts.background_compression) ?:
+ bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?:
bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
@@ -529,7 +527,7 @@ int bch2_data_update_init(struct btree_trans *trans,
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_MOVE|
m->data_opts.write_flags;
- m->op.compression_opt = io_opts.background_compression ?: io_opts.compression;
+ m->op.compression_opt = background_compression(io_opts);
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
bkey_for_each_ptr(ptrs, ptr)
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index d6418948495f..cadda9bbe4a4 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -44,19 +44,19 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
return false;
bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_sorted, btree_bytes(c)),
+ buf_pages(n_sorted, btree_buf_bytes(b)),
REQ_OP_READ|REQ_META,
GFP_NOFS,
&c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_sorted, btree_bytes(c));
+ bch2_bio_map(bio, n_sorted, btree_buf_bytes(b));
submit_bio_wait(bio);
bio_put(bio);
percpu_ref_put(&ca->io_ref);
- memcpy(n_ondisk, n_sorted, btree_bytes(c));
+ memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
v->written = 0;
if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
@@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) {
- c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!c->verify_ondisk)
goto out;
}
@@ -199,19 +199,19 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
- n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!n_ondisk) {
prt_printf(out, "memory allocation failure\n");
goto out;
}
bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_ondisk, btree_bytes(c)),
+ buf_pages(n_ondisk, btree_buf_bytes(b)),
REQ_OP_READ|REQ_META,
GFP_NOFS,
&c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_ondisk, btree_bytes(c));
+ bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b));
ret = submit_bio_wait(bio);
if (ret) {
@@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
out:
if (bio)
bio_put(bio);
- kvpfree(n_ondisk, btree_bytes(c));
+ kvpfree(n_ondisk, btree_buf_bytes(b));
percpu_ref_put(&ca->io_ref);
}
diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h
new file mode 100644
index 000000000000..5e116b88e814
--- /dev/null
+++ b/fs/bcachefs/dirent_format.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DIRENT_FORMAT_H
+#define _BCACHEFS_DIRENT_FORMAT_H
+
+/*
+ * Dirents (and xattrs) have to implement string lookups; since our b-tree
+ * doesn't support arbitrary length strings for the key, we instead index by a
+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset
+ * field of the key - using linear probing to resolve hash collisions. This also
+ * provides us with the readdir cookie posix requires.
+ *
+ * Linear probing requires us to use whiteouts for deletions, in the event of a
+ * collision:
+ */
+
+struct bch_dirent {
+ struct bch_val v;
+
+ /* Target inode number: */
+ union {
+ __le64 d_inum;
+ struct { /* DT_SUBVOL */
+ __le32 d_child_subvol;
+ __le32 d_parent_subvol;
+ };
+ };
+
+ /*
+ * Copy of mode bits 12-15 from the target inode - so userspace can get
+ * the filetype without having to do a stat()
+ */
+ __u8 d_type;
+
+ __u8 d_name[];
+} __packed __aligned(8);
+
+#define DT_SUBVOL 16
+#define BCH_DT_MAX 17
+
+#define BCH_NAME_MAX 512
+
+#endif /* _BCACHEFS_DIRENT_FORMAT_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index d802bc63c8d0..d503af270024 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -190,7 +190,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
a->v.stripe_redundancy, trans,
"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
+ bch2_data_type_str(a->v.data_type),
a->v.dirty_sectors,
a->v.stripe, s.k->p.offset)) {
ret = -EIO;
@@ -200,7 +200,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
"bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
+ bch2_data_type_str(a->v.data_type),
a->v.dirty_sectors,
s.k->p.offset)) {
ret = -EIO;
@@ -367,7 +367,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
}
}
- if (!(flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))) {
+ if (flags & BTREE_TRIGGER_ATOMIC) {
struct stripe *m = genradix_ptr(&c->stripes, idx);
if (!m) {
diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h
new file mode 100644
index 000000000000..44ce88ba08d7
--- /dev/null
+++ b/fs/bcachefs/ec_format.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_FORMAT_H
+#define _BCACHEFS_EC_FORMAT_H
+
+struct bch_stripe {
+ struct bch_val v;
+ __le16 sectors;
+ __u8 algorithm;
+ __u8 nr_blocks;
+ __u8 nr_redundant;
+
+ __u8 csum_granularity_bits;
+ __u8 csum_type;
+ __u8 pad;
+
+ struct bch_extent_ptr ptrs[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_EC_FORMAT_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 82ec056f4cdb..61395b113df9 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -8,6 +8,7 @@
#include "bcachefs.h"
#include "bkey_methods.h"
+#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
@@ -1018,12 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ",
crc.compressed_size,
crc.uncompressed_size,
crc.offset, crc.nonce,
- bch2_csum_types[crc.csum_type],
- bch2_compression_types[crc.compression_type]);
+ bch2_csum_types[crc.csum_type]);
+ bch2_prt_compression_type(out, crc.compression_type);
break;
}
case BCH_EXTENT_ENTRY_stripe_ptr: {
@@ -1334,10 +1335,12 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
}
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
- unsigned target, unsigned compression)
+ struct bch_io_opts *opts)
{
struct bkey_s k = bkey_i_to_s(_k);
struct bch_extent_rebalance *r;
+ unsigned target = opts->background_target;
+ unsigned compression = background_compression(*opts);
bool needs_rebalance;
if (!bkey_extent_is_direct_data(k.k))
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index a855c94d43dd..6bf839d69e84 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -708,7 +708,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
- unsigned, unsigned);
+ struct bch_io_opts *);
/* Generic extent code: */
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
new file mode 100644
index 000000000000..3bd2fdbb0817
--- /dev/null
+++ b/fs/bcachefs/extents_format.h
@@ -0,0 +1,295 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_FORMAT_H
+#define _BCACHEFS_EXTENTS_FORMAT_H
+
+/*
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the data that is currently
+ * live. The size field in struct bkey records the current (live) size of the
+ * extent, and is also used to mean "size of region on disk that we point to" in
+ * this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
+ * type, not a size), encoding the type in the position of the first set bit:
+ *
+ * bch_extent_crc32 - 0b1
+ * bch_extent_ptr - 0b10
+ * bch_extent_crc64 - 0b100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
+ */
+
+#define BCH_EXTENT_ENTRY_TYPES() \
+ x(ptr, 0) \
+ x(crc32, 1) \
+ x(crc64, 2) \
+ x(crc128, 3) \
+ x(stripe_ptr, 4) \
+ x(rebalance, 5)
+#define BCH_EXTENT_ENTRY_MAX 6
+
+enum bch_extent_entry_type {
+#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+/* Compressed/uncompressed size are stored biased by 1: */
+struct bch_extent_crc32 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u32 type:2,
+ _compressed_size:7,
+ _uncompressed_size:7,
+ offset:7,
+ _unused:1,
+ csum_type:4,
+ compression_type:4;
+ __u32 csum;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u32 csum;
+ __u32 compression_type:4,
+ csum_type:4,
+ _unused:1,
+ offset:7,
+ _uncompressed_size:7,
+ _compressed_size:7,
+ type:2;
+#endif
+} __packed __aligned(8);
+
+#define CRC32_SIZE_MAX (1U << 7)
+#define CRC32_NONCE_MAX 0
+
+struct bch_extent_crc64 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:3,
+ _compressed_size:9,
+ _uncompressed_size:9,
+ offset:9,
+ nonce:10,
+ csum_type:4,
+ compression_type:4,
+ csum_hi:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 csum_hi:16,
+ compression_type:4,
+ csum_type:4,
+ nonce:10,
+ offset:9,
+ _uncompressed_size:9,
+ _compressed_size:9,
+ type:3;
+#endif
+ __u64 csum_lo;
+} __packed __aligned(8);
+
+#define CRC64_SIZE_MAX (1U << 9)
+#define CRC64_NONCE_MAX ((1U << 10) - 1)
+
+struct bch_extent_crc128 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:4,
+ _compressed_size:13,
+ _uncompressed_size:13,
+ offset:13,
+ nonce:13,
+ csum_type:4,
+ compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 compression_type:4,
+ csum_type:4,
+ nonce:13,
+ offset:13,
+ _uncompressed_size:13,
+ _compressed_size:13,
+ type:4;
+#endif
+ struct bch_csum csum;
+} __packed __aligned(8);
+
+#define CRC128_SIZE_MAX (1U << 13)
+#define CRC128_NONCE_MAX ((1U << 13) - 1)
+
+/*
+ * @reservation - pointer hasn't been written to, just reserved
+ */
+struct bch_extent_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:1,
+ cached:1,
+ unused:1,
+ unwritten:1,
+ offset:44, /* 8 petabytes */
+ dev:8,
+ gen:8;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 gen:8,
+ dev:8,
+ offset:44,
+ unwritten:1,
+ unused:1,
+ cached:1,
+ type:1;
+#endif
+} __packed __aligned(8);
+
+struct bch_extent_stripe_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:5,
+ block:8,
+ redundancy:4,
+ idx:47;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 idx:47,
+ redundancy:4,
+ block:8,
+ type:5;
+#endif
+};
+
+struct bch_extent_rebalance {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:6,
+ unused:34,
+ compression:8, /* enum bch_compression_opt */
+ target:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 target:16,
+ compression:8,
+ unused:34,
+ type:6;
+#endif
+};
+
+union bch_extent_entry {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
+ unsigned long type;
+#elif __BITS_PER_LONG == 32
+ struct {
+ unsigned long pad;
+ unsigned long type;
+ };
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define x(f, n) struct bch_extent_##f f;
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+struct bch_btree_ptr {
+ struct bch_val v;
+
+ __u64 _data[0];
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
+
+struct bch_btree_ptr_v2 {
+ struct bch_val v;
+
+ __u64 mem_ptr;
+ __le64 seq;
+ __le16 sectors_written;
+ __le16 flags;
+ struct bpos min_key;
+ __u64 _data[0];
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
+
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
+
+struct bch_extent {
+ struct bch_val v;
+
+ __u64 _data[0];
+ union bch_extent_entry start[];
+} __packed __aligned(8);
+
+/* Maximum size (in u64s) a single pointer could be: */
+#define BKEY_EXTENT_PTR_U64s_MAX\
+ ((sizeof(struct bch_extent_crc128) + \
+ sizeof(struct bch_extent_ptr)) / sizeof(__u64))
+
+/* Maximum possible size of an entire extent value: */
+#define BKEY_EXTENT_VAL_U64s_MAX \
+ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
+
+/* * Maximum possible size of an entire extent, key + value: */
+#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+
+/* Btree pointers don't carry around checksums: */
+#define BKEY_BTREE_PTR_VAL_U64s_MAX \
+ ((sizeof(struct bch_btree_ptr_v2) + \
+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
+#define BKEY_BTREE_PTR_U64s_MAX \
+ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
+
+struct bch_reservation {
+ struct bch_val v;
+
+ __le32 generation;
+ __u8 nr_replicas;
+ __u8 pad[3];
+} __packed __aligned(8);
+
+struct bch_inline_data {
+ struct bch_val v;
+ u8 data[];
+};
+
+#endif /* _BCACHEFS_EXTENTS_FORMAT_H */
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index 9637f636e32d..b04750dbf870 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -156,7 +156,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
}
#define eytzinger1_for_each(_i, _size) \
- for ((_i) = eytzinger1_first((_size)); \
+ for (unsigned (_i) = eytzinger1_first((_size)); \
(_i) != 0; \
(_i) = eytzinger1_next((_i), (_size)))
@@ -227,7 +227,7 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
}
#define eytzinger0_for_each(_i, _size) \
- for ((_i) = eytzinger0_first((_size)); \
+ for (unsigned (_i) = eytzinger0_first((_size)); \
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index fdd57c5785c9..e3b219e19e10 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -77,6 +77,10 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+ /* bios must be 512 byte aligned: */
+ if ((offset|iter->count) & (SECTOR_SIZE - 1))
+ return -EINVAL;
+
ret = min_t(loff_t, iter->count,
max_t(loff_t, 0, i_size_read(&inode->v) - offset));
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index ff664fd0d8ef..d359aa9b33b8 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -309,39 +309,49 @@ void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
}
}
-void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
- u64 start, u64 end)
+int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
+ u64 *start, u64 end,
+ bool nonblocking)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
struct folio_batch fbatch;
s64 i_sectors_delta = 0;
- unsigned i, j;
+ int ret = 0;
- if (end <= start)
- return;
+ if (end <= *start)
+ return 0;
folio_batch_init(&fbatch);
while (filemap_get_folios(inode->v.i_mapping,
&index, end_index, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
+
+ if (!nonblocking)
+ folio_lock(folio);
+ else if (!folio_trylock(folio)) {
+ folio_batch_release(&fbatch);
+ ret = -EAGAIN;
+ break;
+ }
+
u64 folio_start = folio_sector(folio);
u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(start, folio_start) - folio_start;
- unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
- struct bch_folio *s;
BUG_ON(end <= folio_start);
- folio_lock(folio);
- s = bch2_folio(folio);
+ *start = min(end, folio_end);
+ struct bch_folio *s = bch2_folio(folio);
if (s) {
+ unsigned folio_offset = max(*start, folio_start) - folio_start;
+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+
spin_lock(&s->lock);
- for (j = folio_offset; j < folio_offset + folio_len; j++) {
+ for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
i_sectors_delta -= s->s[j].state == SECTOR_dirty;
bch2_folio_sector_set(folio, s, j,
folio_sector_reserve(s->s[j].state));
@@ -356,6 +366,7 @@ void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
}
bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ return ret;
}
static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index 27f712ae37a6..8cbaba6565b4 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -143,7 +143,7 @@ int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
-void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64);
+int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool);
int bch2_get_folio_disk_reservation(struct bch_fs *,
struct bch_inode_info *,
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 98bd5babab19..dc52918d06ef 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -675,8 +675,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
- drop_locks_do(trans,
- (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
+ if (bch2_mark_pagecache_reserved(inode, &hole_start,
+ iter.pos.offset, true))
+ drop_locks_do(trans,
+ bch2_mark_pagecache_reserved(inode, &hole_start,
+ iter.pos.offset, false));
bkey_err:
bch2_quota_reservation_put(c, inode, &quota_res);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 1cbc5807bc80..3a4c24c28e7f 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -337,11 +337,12 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
create_flags |= BCH_CREATE_SNAPSHOT_RO;
- /* why do we need this lock? */
- down_read(&c->vfs_sb->s_umount);
-
- if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) {
+ /* sync_inodes_sb enforce s_umount is locked */
+ down_read(&c->vfs_sb->s_umount);
sync_inodes_sb(c->vfs_sb);
+ up_read(&c->vfs_sb->s_umount);
+ }
retry:
if (arg.src_ptr) {
error = user_path_at(arg.dirfd,
@@ -425,8 +426,6 @@ err2:
goto retry;
}
err1:
- up_read(&c->vfs_sb->s_umount);
-
return error;
}
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 37dce96f48ac..086f0090b03a 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -506,22 +506,33 @@ fsck_err:
static void __bch2_inode_unpacked_to_text(struct printbuf *out,
struct bch_inode_unpacked *inode)
{
- prt_printf(out, "mode=%o ", inode->bi_mode);
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "mode=%o", inode->bi_mode);
+ prt_newline(out);
prt_str(out, "flags=");
prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
prt_printf(out, " (%x)", inode->bi_flags);
+ prt_newline(out);
- prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
- inode->bi_journal_seq,
- inode->bi_size,
- inode->bi_sectors,
- inode->bi_version);
+ prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq);
+ prt_newline(out);
+
+ prt_printf(out, "bi_size=%llu", inode->bi_size);
+ prt_newline(out);
+
+ prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
+ prt_newline(out);
+
+ prt_newline(out);
+ prt_printf(out, "bi_version=%llu", inode->bi_version);
#define x(_name, _bits) \
- prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
+ prt_printf(out, #_name "=%llu", (u64) inode->_name); \
+ prt_newline(out);
BCH_INODE_FIELDS_v3()
#undef x
+ printbuf_indent_sub(out, 2);
}
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
@@ -587,7 +598,7 @@ int bch2_trigger_inode(struct btree_trans *trans,
}
}
- if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) {
+ if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
BUG_ON(!trans->journal_res.seq);
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
@@ -597,7 +608,7 @@ int bch2_trigger_inode(struct btree_trans *trans,
struct bch_fs *c = trans->c;
percpu_down_read(&c->mark_lock);
- this_cpu_add(c->usage_gc->nr_inodes, nr);
+ this_cpu_add(c->usage_gc->b.nr_inodes, nr);
percpu_up_read(&c->mark_lock);
}
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
new file mode 100644
index 000000000000..83d107331edf
--- /dev/null
+++ b/fs/bcachefs/inode_format.h
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_INODE_FORMAT_H
+#define _BCACHEFS_INODE_FORMAT_H
+
+#define BLOCKDEV_INODE_MAX 4096
+#define BCACHEFS_ROOT_INO 4096
+
+struct bch_inode {
+ struct bch_val v;
+
+ __le64 bi_hash_seed;
+ __le32 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v2 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v3 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le64 bi_sectors;
+ __le64 bi_size;
+ __le64 bi_version;
+ __u8 fields[];
+} __packed __aligned(8);
+
+#define INODEv3_FIELDS_START_INITIAL 6
+#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
+
+struct bch_inode_generation {
+ struct bch_val v;
+
+ __le32 bi_generation;
+ __le32 pad;
+} __packed __aligned(8);
+
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
+#define BCH_INODE_FIELDS_v2() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_size, 64) \
+ x(bi_sectors, 64) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32)
+
+#define BCH_INODE_FIELDS_v3() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32) \
+ x(bi_nocow, 8)
+
+/* subset of BCH_INODE_FIELDS */
+#define BCH_INODE_OPTS() \
+ x(data_checksum, 8) \
+ x(compression, 8) \
+ x(project, 32) \
+ x(background_compression, 8) \
+ x(data_replicas, 8) \
+ x(promote_target, 16) \
+ x(foreground_target, 16) \
+ x(background_target, 16) \
+ x(erasure_code, 16) \
+ x(nocow, 8)
+
+enum inode_opt_id {
+#define x(name, ...) \
+ Inode_opt_##name,
+ BCH_INODE_OPTS()
+#undef x
+ Inode_opt_nr,
+};
+
+#define BCH_INODE_FLAGS() \
+ x(sync, 0) \
+ x(immutable, 1) \
+ x(append, 2) \
+ x(nodump, 3) \
+ x(noatime, 4) \
+ x(i_size_dirty, 5) \
+ x(i_sectors_dirty, 6) \
+ x(unlinked, 7) \
+ x(backptr_untrusted, 8)
+
+/* bits 20+ reserved for packed fields below: */
+
+enum bch_inode_flags {
+#define x(t, n) BCH_INODE_##t = 1U << n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+enum __bch_inode_flags {
+#define x(t, n) __BCH_INODE_##t = n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
+
+LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
+LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
+LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_FIELDS_START,
+ struct bch_inode_v3, bi_flags, 31, 36);
+LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
+
+#endif /* _BCACHEFS_INODE_FORMAT_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index ca6d5f516aa2..1baf78594cca 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -442,9 +442,7 @@ case LOGGED_OP_FINSERT_shift_extents:
op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
- ret = bch2_bkey_set_needs_rebalance(c, copy,
- opts.background_target,
- opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 33c0e783d546..ef3a53f9045a 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -362,9 +362,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
bkey_start_pos(&sk.k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- ret = bch2_bkey_set_needs_rebalance(c, sk.k,
- op->opts.background_target,
- op->opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
@@ -1447,10 +1445,11 @@ err:
op->flags |= BCH_WRITE_DONE;
if (ret < 0) {
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode,
- op->pos.offset << 9,
- "%s(): error: %s", __func__, bch2_err_str(ret));
+ if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ op->pos.offset << 9,
+ "%s(): error: %s", __func__, bch2_err_str(ret));
op->error = ret;
break;
}
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 8538ef34f62b..d71d26e39521 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -27,6 +27,47 @@ static const char * const bch2_journal_errors[] = {
NULL
};
+static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
+{
+ union journal_res_state s = READ_ONCE(j->reservations);
+ unsigned i = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *buf = j->buf + i;
+
+ prt_printf(out, "seq:");
+ prt_tab(out);
+ prt_printf(out, "%llu", seq);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "refcount:");
+ prt_tab(out);
+ prt_printf(out, "%u", journal_state_count(s, i));
+ prt_newline(out);
+
+ prt_printf(out, "size:");
+ prt_tab(out);
+ prt_human_readable_u64(out, vstruct_bytes(buf->data));
+ prt_newline(out);
+
+ prt_printf(out, "expires");
+ prt_tab(out);
+ prt_printf(out, "%li jiffies", buf->expires - jiffies);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
+{
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 24);
+
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++)
+ bch2_journal_buf_to_text(out, j, seq);
+}
+
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
return seq > j->seq_ondisk;
@@ -156,7 +197,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
* We don't close a journal_buf until the next journal_buf is finished writing,
* and can be opened again - this also initializes the next journal_buf:
*/
-static void __journal_entry_close(struct journal *j, unsigned closed_val)
+static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
@@ -185,7 +226,17 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
- trace_journal_entry_close(c, vstruct_bytes(buf->data));
+ if (trace_journal_entry_close_enabled() && trace) {
+ struct printbuf pbuf = PRINTBUF;
+ pbuf.atomic++;
+
+ prt_str(&pbuf, "entry size: ");
+ prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
+ prt_newline(&pbuf);
+ bch2_prt_task_backtrace(&pbuf, current, 1);
+ trace_journal_entry_close(c, pbuf.buf);
+ printbuf_exit(&pbuf);
+ }
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) << c->block_bits;
@@ -225,7 +276,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
void bch2_journal_halt(struct journal *j)
{
spin_lock(&j->lock);
- __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
if (!j->err_seq)
j->err_seq = journal_cur_seq(j);
journal_wake(j);
@@ -239,7 +290,7 @@ static bool journal_entry_want_write(struct journal *j)
/* Don't close it yet if we already have a write in flight: */
if (ret)
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
else if (nr_unwritten_journal_entries(j)) {
struct journal_buf *buf = journal_cur_buf(j);
@@ -406,7 +457,7 @@ static void journal_write_work(struct work_struct *work)
if (delta > 0)
mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
else
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
unlock:
spin_unlock(&j->lock);
}
@@ -463,13 +514,21 @@ retry:
buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
ret = journal_entry_open(j);
if (ret == JOURNAL_ERR_max_in_flight) {
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
&j->max_in_flight_start, true);
- trace_and_count(c, journal_entry_full, c);
+ if (trace_journal_entry_full_enabled()) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
+ bch2_journal_bufs_to_text(&buf, j);
+ trace_journal_entry_full(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+ count_event(c, journal_entry_full);
}
unlock:
can_discard = j->can_discard;
@@ -549,7 +608,7 @@ void bch2_journal_entry_res_resize(struct journal *j,
/*
* Not enough room in current journal entry, have to flush it:
*/
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
} else {
journal_cur_buf(j)->u64s_reserved += d;
}
@@ -606,7 +665,7 @@ recheck_need_open:
struct journal_res res = { 0 };
if (journal_entry_is_open(j))
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
spin_unlock(&j->lock);
@@ -786,7 +845,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
if (buf->need_flush_to_write_buffer) {
if (seq == journal_cur_seq(j))
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
union journal_res_state s;
s.v = atomic64_read_acquire(&j->reservations.counter);
@@ -1339,35 +1398,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
}
prt_newline(out);
-
- for (u64 seq = journal_cur_seq(j);
- seq >= journal_last_unwritten_seq(j);
- --seq) {
- unsigned i = seq & JOURNAL_BUF_MASK;
-
- prt_printf(out, "unwritten entry:");
- prt_tab(out);
- prt_printf(out, "%llu", seq);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "refcount:");
- prt_tab(out);
- prt_printf(out, "%u", journal_state_count(s, i));
- prt_newline(out);
-
- prt_printf(out, "sectors:");
- prt_tab(out);
- prt_printf(out, "%u", j->buf[i].sectors);
- prt_newline(out);
-
- prt_printf(out, "expires");
- prt_tab(out);
- prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies);
- prt_newline(out);
-
- printbuf_indent_sub(out, 2);
- }
+ prt_printf(out, "unwritten entries:");
+ prt_newline(out);
+ bch2_journal_bufs_to_text(out, j);
prt_printf(out,
"replay done:\t\t%i\n",
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index b0f4dd491e12..04a1e79a5ed3 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -683,10 +683,7 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
for (i = 0; i < nr_types; i++) {
- if (i < BCH_DATA_NR)
- prt_printf(out, " %s", bch2_data_types[i]);
- else
- prt_printf(out, " (unknown data type %u)", i);
+ bch2_prt_data_type(out, i);
prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
le64_to_cpu(u->d[i].buckets),
le64_to_cpu(u->d[i].sectors),
diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h
new file mode 100644
index 000000000000..6a4bf7129dba
--- /dev/null
+++ b/fs/bcachefs/logged_ops_format.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
+#define _BCACHEFS_LOGGED_OPS_FORMAT_H
+
+struct bch_logged_op_truncate {
+ struct bch_val v;
+ __le32 subvol;
+ __le32 pad;
+ __le64 inum;
+ __le64 new_i_size;
+};
+
+enum logged_op_finsert_state {
+ LOGGED_OP_FINSERT_start,
+ LOGGED_OP_FINSERT_shift_extents,
+ LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+ struct bch_val v;
+ __u8 state;
+ __u8 pad[3];
+ __le32 subvol;
+ __le64 inum;
+ __le64 dst_offset;
+ __le64 src_offset;
+ __le64 pos;
+};
+
+#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 7a33319dcd16..bf68ea49447b 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -6,9 +6,11 @@
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
+#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
+#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
@@ -34,12 +36,46 @@ const char * const bch2_data_ops_strs[] = {
NULL
};
-static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
+static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ printbuf_tabstop_push(out, 20);
+ prt_str(out, "rewrite ptrs:");
+ prt_tab(out);
+ bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "kill ptrs: ");
+ prt_tab(out);
+ bch2_prt_u64_base2(out, data_opts->kill_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "target: ");
+ prt_tab(out);
+ bch2_target_to_text(out, c, data_opts->target);
+ prt_newline(out);
+
+ prt_str(out, "compression: ");
+ prt_tab(out);
+ bch2_compression_opt_to_text(out, background_compression(*io_opts));
+ prt_newline(out);
+
+ prt_str(out, "extra replicas: ");
+ prt_tab(out);
+ prt_u64(out, data_opts->extra_replicas);
+}
+
+static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
if (trace_move_extent_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
trace_move_extent(c, buf.buf);
printbuf_exit(&buf);
}
@@ -111,6 +147,15 @@ static void move_write(struct moving_io *io)
return;
}
+ if (trace_move_extent_write_enabled()) {
+ struct bch_fs *c = io->write.op.c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
+ trace_move_extent_write(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+
closure_get(&io->write.ctxt->cl);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
atomic_inc(&io->write.ctxt->write_ios);
@@ -241,9 +286,10 @@ int bch2_move_extent(struct moving_context *ctxt,
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
+ trace_move_extent2(c, k, &io_opts, &data_opts);
+
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
- trace_move_extent2(c, k);
bch2_data_update_opts_normalize(k, &data_opts);
@@ -759,6 +805,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
if (!b)
goto next;
+ unsigned sectors = btree_ptr_sectors_written(&b->key);
+
ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
bch2_trans_iter_exit(trans, &iter);
@@ -768,11 +816,10 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
goto err;
if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate,
- c->opts.btree_node_size >> 9);
+ bch2_ratelimit_increment(ctxt->rate, sectors);
if (ctxt->stats) {
- atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
- atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
+ atomic64_add(sectors, &ctxt->stats->sectors_seen);
+ atomic64_add(sectors, &ctxt->stats->sectors_moved);
}
}
next:
@@ -1083,9 +1130,9 @@ int bch2_data_job(struct bch_fs *c,
void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
- prt_printf(out, "%s: data type=%s pos=",
- stats->name,
- bch2_data_types[stats->data_type]);
+ prt_printf(out, "%s: data type==", stats->name);
+ bch2_prt_data_type(out, stats->data_type);
+ prt_str(out, " pos=");
bch2_bbpos_to_text(out, stats->pos);
prt_newline(out);
printbuf_indent_add(out, 2);
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 8e6f230eac38..b1ed0b9a20d3 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -52,7 +52,7 @@ const char * const bch2_csum_opts[] = {
NULL
};
-const char * const bch2_compression_types[] = {
+const char * const __bch2_compression_types[] = {
BCH_COMPRESSION_TYPES()
NULL
};
@@ -72,7 +72,7 @@ const char * const bch2_str_hash_opts[] = {
NULL
};
-const char * const bch2_data_types[] = {
+const char * const __bch2_data_types[] = {
BCH_DATA_TYPES()
NULL
};
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 93a24fef4214..9a4b7faa3765 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -18,11 +18,11 @@ extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
-extern const char * const bch2_compression_types[];
+extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
-extern const char * const bch2_data_types[];
+extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
extern const char * const bch2_jset_entry_types[];
extern const char * const bch2_fs_usage_types[];
@@ -564,6 +564,11 @@ struct bch_io_opts {
#undef x
};
+static inline unsigned background_compression(struct bch_io_opts opts)
+{
+ return opts.background_compression ?: opts.compression;
+}
+
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h
new file mode 100644
index 000000000000..dc34347ef6c7
--- /dev/null
+++ b/fs/bcachefs/quota_format.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_FORMAT_H
+#define _BCACHEFS_QUOTA_FORMAT_H
+
+/* KEY_TYPE_quota: */
+
+enum quota_types {
+ QTYP_USR = 0,
+ QTYP_GRP = 1,
+ QTYP_PRJ = 2,
+ QTYP_NR = 3,
+};
+
+enum quota_counters {
+ Q_SPC = 0,
+ Q_INO = 1,
+ Q_COUNTERS = 2,
+};
+
+struct bch_quota_counter {
+ __le64 hardlimit;
+ __le64 softlimit;
+};
+
+struct bch_quota {
+ struct bch_val v;
+ struct bch_quota_counter c[Q_COUNTERS];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_quota: */
+
+struct bch_sb_quota_counter {
+ __le32 timelimit;
+ __le32 warnlimit;
+};
+
+struct bch_sb_quota_type {
+ __le64 flags;
+ struct bch_sb_quota_counter c[Q_COUNTERS];
+};
+
+struct bch_sb_field_quota {
+ struct bch_sb_field field;
+ struct bch_sb_quota_type q[QTYP_NR];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_QUOTA_FORMAT_H */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 95f46cb3b5bd..22d1017aa49b 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -177,8 +177,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
prt_str(&buf, "target=");
bch2_target_to_text(&buf, c, r->target);
prt_str(&buf, " compression=");
- struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
- prt_str(&buf, bch2_compression_opts[opt.type]);
+ bch2_compression_opt_to_text(&buf, r->compression);
prt_str(&buf, " ");
bch2_bkey_val_to_text(&buf, c, k);
@@ -254,13 +253,12 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
if (k.k->p.inode) {
target = io_opts->background_target;
- compression = io_opts->background_compression ?: io_opts->compression;
+ compression = background_compression(*io_opts);
} else {
const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
target = r ? r->target : io_opts->background_target;
- compression = r ? r->compression :
- (io_opts->background_compression ?: io_opts->compression);
+ compression = r ? r->compression : background_compression(*io_opts);
}
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
@@ -371,6 +369,7 @@ static int do_rebalance(struct moving_context *ctxt)
!kthread_should_stop() &&
!atomic64_read(&r->work_stats.sectors_seen) &&
!atomic64_read(&r->scan_stats.sectors_seen)) {
+ bch2_moving_ctxt_flush_all(ctxt);
bch2_trans_unlock_long(trans);
rebalance_wait(c);
}
@@ -385,7 +384,6 @@ static int bch2_rebalance_thread(void *arg)
struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
struct moving_context ctxt;
- int ret;
set_freezable();
@@ -393,8 +391,7 @@ static int bch2_rebalance_thread(void *arg)
writepoint_ptr(&c->rebalance_write_point),
true);
- while (!kthread_should_stop() &&
- !(ret = do_rebalance(&ctxt)))
+ while (!kthread_should_stop() && !do_rebalance(&ctxt))
;
bch2_moving_ctxt_exit(&ctxt);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 725214605a05..9127d0e3ca2f 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -280,7 +280,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(u->v);
break;
case BCH_FS_USAGE_inodes:
- c->usage_base->nr_inodes = le64_to_cpu(u->v);
+ c->usage_base->b.nr_inodes = le64_to_cpu(u->v);
break;
case BCH_FS_USAGE_key_version:
atomic64_set(&c->key_version,
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index faa5d3670058..c47c66c2b394 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -292,10 +292,10 @@ static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *f
}
}
-int bch2_trans_mark_reflink_v(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+int bch2_trigger_reflink_v(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
(flags & BTREE_TRIGGER_INSERT))
@@ -324,7 +324,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out,
min(datalen, 32U), d.v->data);
}
-int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
+int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
unsigned flags)
@@ -486,6 +486,13 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+ if (dst_inum.inum < src_inum.inum) {
+ /* Avoid some lock cycle transaction restarts */
+ ret = bch2_btree_iter_traverse(&dst_iter);
+ if (ret)
+ continue;
+ }
+
dst_done = dst_iter.pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
bch2_btree_iter_set_pos(&src_iter, src_want);
@@ -538,9 +545,7 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
- ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
- opts.background_target,
- opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?:
bch2_extent_update(trans, dst_inum, &dst_iter,
new_dst.k, &disk_res,
new_i_size, i_sectors_delta,
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index 8ee778ec0022..4d8867289717 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -24,14 +24,14 @@ int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
+int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
.val_to_text = bch2_reflink_v_to_text, \
.swab = bch2_ptr_swab, \
- .trigger = bch2_trans_mark_reflink_v, \
+ .trigger = bch2_trigger_reflink_v, \
.min_val_size = 8, \
})
@@ -39,7 +39,7 @@ int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
-int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
+int bch2_trigger_indirect_inline_data(struct btree_trans *,
enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
unsigned);
@@ -47,7 +47,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
.key_invalid = bch2_indirect_inline_data_invalid, \
.val_to_text = bch2_indirect_inline_data_to_text, \
- .trigger = bch2_trans_mark_indirect_inline_data, \
+ .trigger = bch2_trigger_indirect_inline_data, \
.min_val_size = 8, \
})
diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h
new file mode 100644
index 000000000000..6772eebb1fc6
--- /dev/null
+++ b/fs/bcachefs/reflink_format.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_FORMAT_H
+#define _BCACHEFS_REFLINK_FORMAT_H
+
+struct bch_reflink_p {
+ struct bch_val v;
+ __le64 idx;
+ /*
+ * A reflink pointer might point to an indirect extent which is then
+ * later split (by copygc or rebalance). If we only pointed to part of
+ * the original indirect extent, and then one of the fragments is
+ * outside the range we point to, we'd leak a refcount: so when creating
+ * reflink pointers, we need to store pad values to remember the full
+ * range we were taking a reference on.
+ */
+ __le32 front_pad;
+ __le32 back_pad;
+} __packed __aligned(8);
+
+struct bch_reflink_v {
+ struct bch_val v;
+ __le64 refcount;
+ union bch_extent_entry start[0];
+ __u64 _data[];
+} __packed __aligned(8);
+
+struct bch_indirect_inline_data {
+ struct bch_val v;
+ __le64 refcount;
+ u8 data[];
+};
+
+#endif /* _BCACHEFS_REFLINK_FORMAT_H */
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 92ba56ef1fc8..cc2672c12031 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -9,6 +9,12 @@
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
+/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
+static int bch2_memcmp(const void *l, const void *r, size_t size)
+{
+ return memcmp(l, r, size);
+}
+
/* Replicas tracking - in memory: */
static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
@@ -33,21 +39,16 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
- eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+ eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
}
static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
struct bch_replicas_entry_v0 *e)
{
- unsigned i;
-
- if (e->data_type < BCH_DATA_NR)
- prt_printf(out, "%s", bch2_data_types[e->data_type]);
- else
- prt_printf(out, "(invalid data type %u)", e->data_type);
+ bch2_prt_data_type(out, e->data_type);
prt_printf(out, ": %u [", e->nr_devs);
- for (i = 0; i < e->nr_devs; i++)
+ for (unsigned i = 0; i < e->nr_devs; i++)
prt_printf(out, i ? " %u" : "%u", e->devs[i]);
prt_printf(out, "]");
}
@@ -55,15 +56,10 @@ static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
void bch2_replicas_entry_to_text(struct printbuf *out,
struct bch_replicas_entry_v1 *e)
{
- unsigned i;
-
- if (e->data_type < BCH_DATA_NR)
- prt_printf(out, "%s", bch2_data_types[e->data_type]);
- else
- prt_printf(out, "(invalid data type %u)", e->data_type);
+ bch2_prt_data_type(out, e->data_type);
prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
- for (i = 0; i < e->nr_devs; i++)
+ for (unsigned i = 0; i < e->nr_devs; i++)
prt_printf(out, i ? " %u" : "%u", e->devs[i]);
prt_printf(out, "]");
}
@@ -831,7 +827,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
- memcmp, NULL);
+ bch2_memcmp, NULL);
for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry_v1 *e =
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index 9632f36f5f31..b6bf0ebe7e84 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -207,7 +207,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = BCH_FS_USAGE_inodes;
- u->v = cpu_to_le64(c->usage_base->nr_inodes);
+ u->v = cpu_to_le64(c->usage_base->b.nr_inodes);
}
{
diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/sb-counters.c
index 02a996e06a64..7dc898761bb3 100644
--- a/fs/bcachefs/counters.c
+++ b/fs/bcachefs/sb-counters.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "super-io.h"
-#include "counters.h"
+#include "sb-counters.h"
/* BCH_SB_FIELD_counters */
diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/sb-counters.h
index 4778aa19bf34..81f8aec9fcb1 100644
--- a/fs/bcachefs/counters.h
+++ b/fs/bcachefs/sb-counters.h
@@ -1,11 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_COUNTERS_H
-#define _BCACHEFS_COUNTERS_H
+#ifndef _BCACHEFS_SB_COUNTERS_H
+#define _BCACHEFS_SB_COUNTERS_H
#include "bcachefs.h"
#include "super-io.h"
-
int bch2_sb_counters_to_cpu(struct bch_fs *);
int bch2_sb_counters_from_cpu(struct bch_fs *);
@@ -14,4 +13,4 @@ int bch2_fs_counters_init(struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
-#endif // _BCACHEFS_COUNTERS_H
+#endif // _BCACHEFS_SB_COUNTERS_H
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
new file mode 100644
index 000000000000..62ea478215d0
--- /dev/null
+++ b/fs/bcachefs/sb-counters_format.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
+#define _BCACHEFS_SB_COUNTERS_FORMAT_H
+
+#define BCH_PERSISTENT_COUNTERS() \
+ x(io_read, 0) \
+ x(io_write, 1) \
+ x(io_move, 2) \
+ x(bucket_invalidate, 3) \
+ x(bucket_discard, 4) \
+ x(bucket_alloc, 5) \
+ x(bucket_alloc_fail, 6) \
+ x(btree_cache_scan, 7) \
+ x(btree_cache_reap, 8) \
+ x(btree_cache_cannibalize, 9) \
+ x(btree_cache_cannibalize_lock, 10) \
+ x(btree_cache_cannibalize_lock_fail, 11) \
+ x(btree_cache_cannibalize_unlock, 12) \
+ x(btree_node_write, 13) \
+ x(btree_node_read, 14) \
+ x(btree_node_compact, 15) \
+ x(btree_node_merge, 16) \
+ x(btree_node_split, 17) \
+ x(btree_node_rewrite, 18) \
+ x(btree_node_alloc, 19) \
+ x(btree_node_free, 20) \
+ x(btree_node_set_root, 21) \
+ x(btree_path_relock_fail, 22) \
+ x(btree_path_upgrade_fail, 23) \
+ x(btree_reserve_get_fail, 24) \
+ x(journal_entry_full, 25) \
+ x(journal_full, 26) \
+ x(journal_reclaim_finish, 27) \
+ x(journal_reclaim_start, 28) \
+ x(journal_write, 29) \
+ x(read_promote, 30) \
+ x(read_bounce, 31) \
+ x(read_split, 33) \
+ x(read_retry, 32) \
+ x(read_reuse_race, 34) \
+ x(move_extent_read, 35) \
+ x(move_extent_write, 36) \
+ x(move_extent_finish, 37) \
+ x(move_extent_fail, 38) \
+ x(move_extent_start_fail, 39) \
+ x(copygc, 40) \
+ x(copygc_wait, 41) \
+ x(gc_gens_end, 42) \
+ x(gc_gens_start, 43) \
+ x(trans_blocked_journal_reclaim, 44) \
+ x(trans_restart_btree_node_reused, 45) \
+ x(trans_restart_btree_node_split, 46) \
+ x(trans_restart_fault_inject, 47) \
+ x(trans_restart_iter_upgrade, 48) \
+ x(trans_restart_journal_preres_get, 49) \
+ x(trans_restart_journal_reclaim, 50) \
+ x(trans_restart_journal_res_get, 51) \
+ x(trans_restart_key_cache_key_realloced, 52) \
+ x(trans_restart_key_cache_raced, 53) \
+ x(trans_restart_mark_replicas, 54) \
+ x(trans_restart_mem_realloced, 55) \
+ x(trans_restart_memory_allocation_failure, 56) \
+ x(trans_restart_relock, 57) \
+ x(trans_restart_relock_after_fill, 58) \
+ x(trans_restart_relock_key_cache_fill, 59) \
+ x(trans_restart_relock_next_node, 60) \
+ x(trans_restart_relock_parent_for_fill, 61) \
+ x(trans_restart_relock_path, 62) \
+ x(trans_restart_relock_path_intent, 63) \
+ x(trans_restart_too_many_iters, 64) \
+ x(trans_restart_traverse, 65) \
+ x(trans_restart_upgrade, 66) \
+ x(trans_restart_would_deadlock, 67) \
+ x(trans_restart_would_deadlock_write, 68) \
+ x(trans_restart_injected, 69) \
+ x(trans_restart_key_cache_upgrade, 70) \
+ x(trans_traverse_all, 71) \
+ x(transaction_commit, 72) \
+ x(write_super, 73) \
+ x(trans_restart_would_deadlock_recursion_limit, 74) \
+ x(trans_restart_write_buffer_flush, 75) \
+ x(trans_restart_split_race, 76) \
+ x(write_buffer_flush_slowpath, 77) \
+ x(write_buffer_flush_sync, 78)
+
+enum bch_persistent_counters {
+#define x(t, n, ...) BCH_COUNTER_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ BCH_COUNTER_NR
+};
+
+struct bch_sb_field_counters {
+ struct bch_sb_field field;
+ __le64 d[];
+};
+
+#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index a44a238bf8b5..a45354d2acde 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -251,7 +251,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Data allowed:");
prt_tab(out);
if (BCH_MEMBER_DATA_ALLOWED(&m))
- prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
+ prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
else
prt_printf(out, "(none)");
prt_newline(out);
@@ -259,7 +259,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Has data:");
prt_tab(out);
if (data_have)
- prt_bitflags(out, bch2_data_types, data_have);
+ prt_bitflags(out, __bch2_data_types, data_have);
else
prt_printf(out, "(none)");
prt_newline(out);
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 56af937523ff..45f67e8b29eb 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -1053,6 +1053,8 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
n->v.tree = cpu_to_le32(tree);
n->v.depth = cpu_to_le32(depth);
+ n->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+ n->v.btime.hi = 0;
for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
@@ -1681,5 +1683,5 @@ int bch2_snapshots_read(struct bch_fs *c)
void bch2_fs_snapshots_exit(struct bch_fs *c)
{
- kfree(rcu_dereference_protected(c->snapshots, true));
+ kvfree(rcu_dereference_protected(c->snapshots, true));
}
diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h
new file mode 100644
index 000000000000..aabcd3a74cd9
--- /dev/null
+++ b/fs/bcachefs/snapshot_format.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H
+#define _BCACHEFS_SNAPSHOT_FORMAT_H
+
+struct bch_snapshot {
+ struct bch_val v;
+ __le32 flags;
+ __le32 parent;
+ __le32 children[2];
+ __le32 subvol;
+ /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
+ __le32 tree;
+ __le32 depth;
+ __le32 skip[3];
+ bch_le128 btime;
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+
+/*
+ * Snapshot trees:
+ *
+ * The snapshot_trees btree gives us persistent indentifier for each tree of
+ * bch_snapshot nodes, and allow us to record and easily find the root/master
+ * subvolume that other snapshots were created from:
+ */
+struct bch_snapshot_tree {
+ struct bch_val v;
+ __le32 master_subvol;
+ __le32 root_snapshot;
+};
+
+#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */
diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h
new file mode 100644
index 000000000000..af79134b07d6
--- /dev/null
+++ b/fs/bcachefs/subvolume_format.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H
+#define _BCACHEFS_SUBVOLUME_FORMAT_H
+
+#define SUBVOL_POS_MIN POS(0, 1)
+#define SUBVOL_POS_MAX POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL 1
+
+struct bch_subvolume {
+ struct bch_val v;
+ __le32 flags;
+ __le32 snapshot;
+ __le64 inode;
+ /*
+ * Snapshot subvolumes form a tree, separate from the snapshot nodes
+ * tree - if this subvolume is a snapshot, this is the ID of the
+ * subvolume it was created from:
+ *
+ * This is _not_ necessarily the subvolume of the directory containing
+ * this subvolume:
+ */
+ __le32 parent;
+ __le32 pad;
+ bch_le128 otime;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
+/*
+ * We need to know whether a subvolume is a snapshot so we can know whether we
+ * can delete it (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
+
+#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 6d3db5cce5f6..d60c7d27a047 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -2,7 +2,6 @@
#include "bcachefs.h"
#include "checksum.h"
-#include "counters.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
@@ -13,6 +12,7 @@
#include "replicas.h"
#include "quota.h"
#include "sb-clean.h"
+#include "sb-counters.h"
#include "sb-downgrade.h"
#include "sb-errors.h"
#include "sb-members.h"
@@ -1321,7 +1321,9 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "Superblock size:");
prt_tab(out);
- prt_printf(out, "%zu", vstruct_bytes(sb));
+ prt_units_u64(out, vstruct_bytes(sb));
+ prt_str(out, "/");
+ prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
prt_newline(out);
prt_printf(out, "Clean:");
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 9dbc35940197..b9911402b175 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -23,7 +23,6 @@
#include "checksum.h"
#include "clock.h"
#include "compress.h"
-#include "counters.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
@@ -49,6 +48,7 @@
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
+#include "sb-counters.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
@@ -883,7 +883,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
- btree_bytes(c)) ||
+ c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
sizeof(u64), GFP_KERNEL))) {
@@ -1386,8 +1386,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
prt_bdevname(&name, ca->disk_sb.bdev);
if (c->sb.nr_devices == 1)
- strlcpy(c->name, name.buf, sizeof(c->name));
- strlcpy(ca->name, name.buf, sizeof(ca->name));
+ strscpy(c->name, name.buf, sizeof(c->name));
+ strscpy(ca->name, name.buf, sizeof(ca->name));
printbuf_exit(&name);
@@ -1625,7 +1625,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
if (data) {
struct printbuf data_has = PRINTBUF;
- prt_bitflags(&data_has, bch2_data_types, data);
+ prt_bitflags(&data_has, __bch2_data_types, data);
bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
printbuf_exit(&data_has);
ret = -EBUSY;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 8ed52319ff68..cee80c47feea 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -21,6 +21,7 @@
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
+#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "inode.h"
@@ -247,7 +248,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
mutex_lock(&c->btree_cache.lock);
list_for_each_entry(b, &c->btree_cache.live, list)
- ret += btree_bytes(c);
+ ret += btree_buf_bytes(b);
mutex_unlock(&c->btree_cache.lock);
return ret;
@@ -330,7 +331,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
prt_newline(out);
for (unsigned i = 0; i < ARRAY_SIZE(s); i++) {
- prt_str(out, bch2_compression_types[i]);
+ bch2_prt_compression_type(out, i);
prt_tab(out);
prt_human_readable_u64(out, s[i].sectors_compressed << 9);
@@ -725,8 +726,10 @@ STORE(bch2_fs_opts_dir)
bch2_opt_set_sb(c, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
- if ((id == Opt_background_target ||
- id == Opt_background_compression) && v)
+ if (v &&
+ (id == Opt_background_target ||
+ id == Opt_background_compression ||
+ (id == Opt_compression && !c->opts.background_compression)))
bch2_set_rebalance_needs_scan(c, 0);
ret = size;
@@ -883,7 +886,7 @@ static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
for (i = 1; i < BCH_DATA_NR; i++)
prt_printf(out, "%-12s:%12llu\n",
- bch2_data_types[i],
+ bch2_data_type_str(i),
percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
}
}
@@ -908,7 +911,7 @@ SHOW(bch2_dev)
}
if (attr == &sysfs_has_data) {
- prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
+ prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca));
prt_char(out, '\n');
}
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index c94876b3bb06..293b90d704fb 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -46,7 +46,7 @@ DECLARE_EVENT_CLASS(fs_str,
__assign_str(str, str);
),
- TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
+ TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
);
DECLARE_EVENT_CLASS(trans_str,
@@ -273,28 +273,14 @@ DEFINE_EVENT(bch_fs, journal_full,
TP_ARGS(c)
);
-DEFINE_EVENT(bch_fs, journal_entry_full,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(fs_str, journal_entry_full,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-TRACE_EVENT(journal_entry_close,
- TP_PROTO(struct bch_fs *c, unsigned bytes),
- TP_ARGS(c, bytes),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u32, bytes )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->bytes = bytes;
- ),
-
- TP_printk("%d,%d entry bytes %u",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->bytes)
+DEFINE_EVENT(fs_str, journal_entry_close,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(bio, journal_write,
@@ -542,7 +528,7 @@ TRACE_EVENT(btree_path_relock_fail,
__entry->level = path->level;
TRACE_BPOS_assign(pos, path->pos);
- c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level),
+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
__entry->self_read_count = c.n[SIX_LOCK_read];
__entry->self_intent_count = c.n[SIX_LOCK_intent];
@@ -827,40 +813,28 @@ TRACE_EVENT(bucket_evacuate,
);
DEFINE_EVENT(fs_str, move_extent,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_read,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_write,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_finish,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-TRACE_EVENT(move_extent_fail,
- TP_PROTO(struct bch_fs *c, const char *msg),
- TP_ARGS(c, msg),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __string(msg, msg )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __assign_str(msg, msg);
- ),
-
- TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg))
+DEFINE_EVENT(fs_str, move_extent_fail,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_start_fail,
@@ -1039,7 +1013,7 @@ TRACE_EVENT(trans_restart_split_race,
__entry->level = b->c.level;
__entry->written = b->written;
__entry->blocks = btree_blocks(trans->c);
- __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b);
+ __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b);
),
TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
@@ -1146,8 +1120,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
TP_ARGS(trans, caller_ip, path)
);
-struct get_locks_fail;
-
TRACE_EVENT(trans_restart_upgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
@@ -1195,11 +1167,9 @@ TRACE_EVENT(trans_restart_upgrade,
__entry->node_seq)
);
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
+DEFINE_EVENT(trans_str, trans_restart_relock,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
+ TP_ARGS(trans, caller_ip, str)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index c2ef7cddaa4f..a135136adeee 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -241,12 +241,17 @@ bool bch2_is_zero(const void *_p, size_t n)
return true;
}
-void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits)
+void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits)
{
while (nr_bits)
prt_char(out, '0' + ((v >> --nr_bits) & 1));
}
+void bch2_prt_u64_base2(struct printbuf *out, u64 v)
+{
+ bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
+}
+
void bch2_print_string_as_lines(const char *prefix, const char *lines)
{
const char *p;
@@ -1186,7 +1191,9 @@ int bch2_split_devs(const char *_dev_name, darray_str *ret)
{
darray_init(ret);
- char *dev_name = kstrdup(_dev_name, GFP_KERNEL), *s = dev_name;
+ char *dev_name, *s, *orig;
+
+ dev_name = orig = kstrdup(_dev_name, GFP_KERNEL);
if (!dev_name)
return -ENOMEM;
@@ -1201,10 +1208,10 @@ int bch2_split_devs(const char *_dev_name, darray_str *ret)
}
}
- kfree(dev_name);
+ kfree(orig);
return 0;
err:
bch2_darray_str_exit(ret);
- kfree(dev_name);
+ kfree(orig);
return -ENOMEM;
}
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index c75fc31915d3..df67bf55fe2b 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -342,7 +342,8 @@ bool bch2_is_zero(const void *, size_t);
u64 bch2_read_flag_list(char *, const char * const[]);
-void bch2_prt_u64_binary(struct printbuf *, u64, unsigned);
+void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
+void bch2_prt_u64_base2(struct printbuf *, u64);
void bch2_print_string_as_lines(const char *prefix, const char *lines);
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 5a1858fb9879..9c0d2316031b 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -590,8 +590,9 @@ err:
mutex_unlock(&inode->ei_update_lock);
if (value &&
- (opt_id == Opt_background_compression ||
- opt_id == Opt_background_target))
+ (opt_id == Opt_background_target ||
+ opt_id == Opt_background_compression ||
+ (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
return bch2_err_class(ret);
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
new file mode 100644
index 000000000000..e9f810539552
--- /dev/null
+++ b/fs/bcachefs/xattr_format.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_XATTR_FORMAT_H
+#define _BCACHEFS_XATTR_FORMAT_H
+
+#define KEY_TYPE_XATTR_INDEX_USER 0
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
+#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
+#define KEY_TYPE_XATTR_INDEX_SECURITY 4
+
+struct bch_xattr {
+ struct bch_val v;
+ __u8 x_type;
+ __u8 x_name_len;
+ __le16 x_val_len;
+ __u8 x_name[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_XATTR_FORMAT_H */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 193168214eeb..68345f73d429 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -141,16 +141,16 @@ static int compression_decompress_bio(struct list_head *ws,
}
static int compression_decompress(int type, struct list_head *ws,
- const u8 *data_in, struct page *dest_page,
- unsigned long start_byte, size_t srclen, size_t destlen)
+ const u8 *data_in, struct page *dest_page,
+ unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_NONE:
default:
/*
@@ -1037,14 +1037,23 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
* start_byte tells us the offset into the compressed data we're interested in
*/
int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
- unsigned long start_byte, size_t srclen, size_t destlen)
+ unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
struct list_head *workspace;
+ const u32 sectorsize = fs_info->sectorsize;
int ret;
+ /*
+ * The full destination page range should not exceed the page size.
+ * And the @destlen should not exceed sectorsize, as this is only called for
+ * inline file extents, which should not exceed sectorsize.
+ */
+ ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
+
workspace = get_workspace(type, 0);
ret = compression_decompress(type, workspace, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
put_workspace(type, workspace);
return ret;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 93cc92974dee..afd7e50d073d 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -148,7 +148,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *zlib_alloc_workspace(unsigned int level);
void zlib_free_workspace(struct list_head *ws);
@@ -159,7 +159,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *lzo_alloc_workspace(unsigned int level);
void lzo_free_workspace(struct list_head *ws);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f396aba92c57..8e8cc1111277 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1260,7 +1260,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
u64 bytes_left, end;
u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
- if (WARN_ON(start != aligned_start)) {
+ /* Adjust the range to be aligned to 512B sectors if necessary. */
+ if (start != aligned_start) {
len -= aligned_start - start;
len = round_down(len, 1 << SECTOR_SHIFT);
start = aligned_start;
@@ -4298,6 +4299,42 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
return 0;
}
+static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ if (ffe_ctl->for_treelog) {
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg)
+ ffe_ctl->hint_byte = fs_info->treelog_bg;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ } else if (ffe_ctl->for_data_reloc) {
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg)
+ ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+ struct btrfs_block_group *block_group;
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
+ /*
+ * No lock is OK here because avail is monotinically
+ * decreasing, and this is just a hint.
+ */
+ u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+
+ if (block_group_bits(block_group, ffe_ctl->flags) &&
+ avail >= ffe_ctl->num_bytes) {
+ ffe_ctl->hint_byte = block_group->start;
+ break;
+ }
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+
+ return 0;
+}
+
static int prepare_allocation(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
@@ -4308,19 +4345,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
case BTRFS_EXTENT_ALLOC_ZONED:
- if (ffe_ctl->for_treelog) {
- spin_lock(&fs_info->treelog_bg_lock);
- if (fs_info->treelog_bg)
- ffe_ctl->hint_byte = fs_info->treelog_bg;
- spin_unlock(&fs_info->treelog_bg_lock);
- }
- if (ffe_ctl->for_data_reloc) {
- spin_lock(&fs_info->relocation_bg_lock);
- if (fs_info->data_reloc_bg)
- ffe_ctl->hint_byte = fs_info->data_reloc_bg;
- spin_unlock(&fs_info->relocation_bg_lock);
- }
- return 0;
+ return prepare_allocation_zoned(fs_info, ffe_ctl);
default:
BUG();
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 809b11472a80..1eb93d3962aa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4458,6 +4458,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
u64 root_flags;
int ret;
+ down_write(&fs_info->subvol_sem);
+
/*
* Don't allow to delete a subvolume with send in progress. This is
* inside the inode lock so the error handling that has to drop the bit
@@ -4469,25 +4471,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
dest->root_key.objectid);
- return -EPERM;
+ ret = -EPERM;
+ goto out_up_write;
}
if (atomic_read(&dest->nr_swapfiles)) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu with active swapfile",
root->root_key.objectid);
- return -EPERM;
+ ret = -EPERM;
+ goto out_up_write;
}
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
- down_write(&fs_info->subvol_sem);
-
ret = may_destroy_subvol(dest);
if (ret)
- goto out_up_write;
+ goto out_undead;
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
@@ -4497,7 +4499,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
*/
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
if (ret)
- goto out_up_write;
+ goto out_undead;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
@@ -4563,15 +4565,17 @@ out_end_trans:
inode->i_flags |= S_DEAD;
out_release:
btrfs_subvolume_release_metadata(root, &block_rsv);
-out_up_write:
- up_write(&fs_info->subvol_sem);
+out_undead:
if (ret) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
- } else {
+ }
+out_up_write:
+ up_write(&fs_info->subvol_sem);
+ if (!ret) {
d_invalidate(dentry);
btrfs_prune_dentries(dest);
ASSERT(dest->send_in_progress == 0);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 41b479861b3c..dfed9dd9c2d7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -790,6 +790,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
return -EOPNOTSUPP;
}
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return -ENOENT;
+
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return -EINVAL;
@@ -2608,6 +2611,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EFAULT;
goto out;
}
+ if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
/* compression requires us to start the IO */
if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 1131d5a29d61..e43bc0fdc74e 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -425,16 +425,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
}
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
size_t in_len;
size_t out_len;
size_t max_segment_len = WORKSPACE_BUF_LENGTH;
int ret = 0;
- char *kaddr;
- unsigned long bytes;
if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
return -EUCLEAN;
@@ -451,7 +451,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
}
data_in += LZO_LEN;
- out_len = PAGE_SIZE;
+ out_len = sectorsize;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (ret != LZO_E_OK) {
pr_warn("BTRFS: decompress failed!\n");
@@ -459,29 +459,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
goto out;
}
- if (out_len < start_byte) {
+ ASSERT(out_len <= sectorsize);
+ memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len);
+ /* Early end, considered as an error. */
+ if (unlikely(out_len < destlen)) {
ret = -EIO;
- goto out;
+ memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len);
}
-
- /*
- * the caller is already checking against PAGE_SIZE, but lets
- * move this check closer to the memcpy/memset
- */
- destlen = min_t(unsigned long, destlen, PAGE_SIZE);
- bytes = min_t(unsigned long, destlen, out_len - start_byte);
-
- kaddr = kmap_local_page(dest_page);
- memcpy(kaddr, workspace->buf + start_byte, bytes);
-
- /*
- * btrfs_getblock is doing a zero on the tail of the page too,
- * but this will cover anything missing from the decompressed
- * data.
- */
- if (bytes < destlen)
- memset(kaddr+bytes, 0, destlen-bytes);
- kunmap_local(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 6486f0d7e993..8c4fc98ca9ce 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
out_unlock:
spin_unlock(&fs_info->ref_verify_lock);
out:
- if (ret)
+ if (ret) {
+ btrfs_free_ref_cache(fs_info);
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ }
return ret;
}
@@ -1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
}
}
if (ret) {
- btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
btrfs_free_ref_cache(fs_info);
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
}
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a01807cbd4d4..0123d2728923 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1098,12 +1098,22 @@ out:
static void scrub_read_endio(struct btrfs_bio *bbio)
{
struct scrub_stripe *stripe = bbio->private;
+ struct bio_vec *bvec;
+ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+ int num_sectors;
+ u32 bio_size = 0;
+ int i;
+
+ ASSERT(sector_nr < stripe->nr_sectors);
+ bio_for_each_bvec_all(bvec, &bbio->bio, i)
+ bio_size += bvec->bv_len;
+ num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
if (bbio->bio.bi_status) {
- bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
- bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
+ bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
+ bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
} else {
- bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+ bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
}
bio_put(&bbio->bio);
if (atomic_dec_and_test(&stripe->pending_io)) {
@@ -1636,6 +1646,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
+ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+ stripe->bg->length - stripe->logical) >>
+ fs_info->sectorsize_bits;
u64 stripe_len = BTRFS_STRIPE_LEN;
int mirror = stripe->mirror_num;
int i;
@@ -1646,6 +1659,10 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
struct page *page = scrub_stripe_get_page(stripe, i);
unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
+ /* We're beyond the chunk boundary, no need to read anymore. */
+ if (i >= nr_sectors)
+ break;
+
/* The current sector cannot be merged, submit the bio. */
if (bbio &&
((i > 0 &&
@@ -1701,6 +1718,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_bio *bbio;
+ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+ stripe->bg->length - stripe->logical) >>
+ fs_info->sectorsize_bits;
int mirror = stripe->mirror_num;
ASSERT(stripe->bg);
@@ -1715,14 +1735,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
scrub_read_endio, stripe);
- /* Read the whole stripe. */
bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
- for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
+ /* Read the whole range inside the chunk boundary. */
+ for (unsigned int cur = 0; cur < nr_sectors; cur++) {
+ struct page *page = scrub_stripe_get_page(stripe, cur);
+ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
int ret;
- ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
+ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
/* We should have allocated enough bio vectors. */
- ASSERT(ret == PAGE_SIZE);
+ ASSERT(ret == fs_info->sectorsize);
}
atomic_inc(&stripe->pending_io);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 4e36550618e5..2d7519a6ce72 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -8205,8 +8205,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
goto out;
}
- sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
- arg->clone_sources_count + 1,
+ sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1,
+ sizeof(*sctx->clone_roots),
GFP_KERNEL);
if (!sctx->clone_roots) {
ret = -ENOMEM;
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 93511d54abf8..0e49dab8dad2 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -475,7 +475,8 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- folio_start_writeback(folio);
+ if (!folio_test_writeback(folio))
+ folio_start_writeback(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 896acfda1789..101f786963d4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1457,6 +1457,14 @@ static int btrfs_reconfigure(struct fs_context *fc)
btrfs_info_to_ctx(fs_info, &old_ctx);
+ /*
+ * This is our "bind mount" trick, we don't want to allow the user to do
+ * anything other than mount a different ro/rw and a different subvol,
+ * all of the mount options should be maintained.
+ */
+ if (mount_reconfigure)
+ ctx->mount_opt = old_ctx.mount_opt;
+
sync_filesystem(sb);
set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 50fdc69fdddf..6eccf8496486 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1436,7 +1436,7 @@ static int check_extent_item(struct extent_buffer *leaf,
if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
- ptr, inline_type, end);
+ ptr, btrfs_extent_inline_ref_size(inline_type), end);
return -EUCLEAN;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4c32497311d2..d67785be2c77 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3087,7 +3087,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
map = btrfs_find_chunk_map(fs_info, logical, length);
if (unlikely(!map)) {
- read_unlock(&fs_info->mapping_tree_lock);
btrfs_crit(fs_info,
"unable to find chunk map for logical %llu length %llu",
logical, length);
@@ -3095,7 +3094,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
}
if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
- read_unlock(&fs_info->mapping_tree_lock);
btrfs_crit(fs_info,
"found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
logical, logical + length, map->start,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 36cf1f0e338e..8da66ea699e8 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -354,18 +354,13 @@ done:
}
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
int wbits = MAX_WBITS;
- unsigned long bytes_left;
- unsigned long total_out = 0;
- unsigned long pg_offset = 0;
-
- destlen = min_t(unsigned long, destlen, PAGE_SIZE);
- bytes_left = destlen;
+ unsigned long to_copy;
workspace->strm.next_in = data_in;
workspace->strm.avail_in = srclen;
@@ -390,60 +385,30 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
return -EIO;
}
- while (bytes_left > 0) {
- unsigned long buf_start;
- unsigned long buf_offset;
- unsigned long bytes;
-
- ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
- if (ret != Z_OK && ret != Z_STREAM_END)
- break;
-
- buf_start = total_out;
- total_out = workspace->strm.total_out;
-
- if (total_out == buf_start) {
- ret = -EIO;
- break;
- }
-
- if (total_out <= start_byte)
- goto next;
-
- if (total_out > start_byte && buf_start < start_byte)
- buf_offset = start_byte - buf_start;
- else
- buf_offset = 0;
-
- bytes = min(PAGE_SIZE - pg_offset,
- PAGE_SIZE - (buf_offset % PAGE_SIZE));
- bytes = min(bytes, bytes_left);
+ /*
+ * Everything (in/out buf) should be at most one sector, there should
+ * be no need to switch any input/output buffer.
+ */
+ ret = zlib_inflate(&workspace->strm, Z_FINISH);
+ to_copy = min(workspace->strm.total_out, destlen);
+ if (ret != Z_STREAM_END)
+ goto out;
- memcpy_to_page(dest_page, pg_offset,
- workspace->buf + buf_offset, bytes);
+ memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy);
- pg_offset += bytes;
- bytes_left -= bytes;
-next:
- workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = workspace->buf_size;
- }
-
- if (ret != Z_STREAM_END && bytes_left != 0)
+out:
+ if (unlikely(to_copy != destlen)) {
+ pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n",
+ to_copy, destlen);
ret = -EIO;
- else
+ } else {
ret = 0;
+ }
zlib_inflateEnd(&workspace->strm);
- /*
- * this should only happen if zlib returned fewer bytes than we
- * expected. btrfs_get_block is responsible for zeroing from the
- * end of the inline extent (destlen) to the end of the page
- */
- if (pg_offset < destlen) {
- memzero_page(dest_page, pg_offset, destlen - pg_offset);
- }
+ if (unlikely(to_copy < destlen))
+ memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy);
return ret;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 5bd76813b23f..168af9d000d1 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2055,6 +2055,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
map = block_group->physical_map;
+ spin_lock(&fs_info->zone_active_bgs_lock);
spin_lock(&block_group->lock);
if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
ret = true;
@@ -2067,7 +2068,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
- spin_lock(&fs_info->zone_active_bgs_lock);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_zoned_device_info *zinfo;
int reserved = 0;
@@ -2087,20 +2087,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
*/
if (atomic_read(&zinfo->active_zones_left) <= reserved) {
ret = false;
- spin_unlock(&fs_info->zone_active_bgs_lock);
goto out_unlock;
}
if (!btrfs_dev_set_active_zone(device, physical)) {
/* Cannot activate the zone */
ret = false;
- spin_unlock(&fs_info->zone_active_bgs_lock);
goto out_unlock;
}
if (!is_data)
zinfo->reserved_active_zones--;
}
- spin_unlock(&fs_info->zone_active_bgs_lock);
/* Successfully activated all the zones */
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
@@ -2108,8 +2105,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
/* For the active block group list */
btrfs_get_block_group(block_group);
-
- spin_lock(&fs_info->zone_active_bgs_lock);
list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
spin_unlock(&fs_info->zone_active_bgs_lock);
@@ -2117,6 +2112,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
out_unlock:
spin_unlock(&block_group->lock);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
return ret;
}
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
index 8df715640a48..c5a070550ee3 100644
--- a/fs/cachefiles/Kconfig
+++ b/fs/cachefiles/Kconfig
@@ -2,7 +2,7 @@
config CACHEFILES
tristate "Filesystem caching on files"
- depends on FSCACHE && BLOCK
+ depends on NETFS_SUPPORT && FSCACHE && BLOCK
help
This permits use of a mounted filesystem as a cache for other
filesystems - primarily networking filesystems - thus allowing fast
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 4a87c9d714a9..d33169f0018b 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -246,7 +246,7 @@ extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres,
enum fscache_want_state want_state);
extern int __cachefiles_prepare_write(struct cachefiles_object *object,
struct file *file,
- loff_t *_start, size_t *_len,
+ loff_t *_start, size_t *_len, size_t upper_len,
bool no_space_allocated_yet);
extern int __cachefiles_write(struct cachefiles_object *object,
struct file *file,
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 5857241c5918..1d685357e67f 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -517,18 +517,26 @@ cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres,
*/
int __cachefiles_prepare_write(struct cachefiles_object *object,
struct file *file,
- loff_t *_start, size_t *_len,
+ loff_t *_start, size_t *_len, size_t upper_len,
bool no_space_allocated_yet)
{
struct cachefiles_cache *cache = object->volume->cache;
loff_t start = *_start, pos;
- size_t len = *_len, down;
+ size_t len = *_len;
int ret;
/* Round to DIO size */
- down = start - round_down(start, PAGE_SIZE);
- *_start = start - down;
- *_len = round_up(down + len, PAGE_SIZE);
+ start = round_down(*_start, PAGE_SIZE);
+ if (start != *_start || *_len > upper_len) {
+ /* Probably asked to cache a streaming write written into the
+ * pagecache when the cookie was temporarily out of service to
+ * culling.
+ */
+ fscache_count_dio_misfit();
+ return -ENOBUFS;
+ }
+
+ *_len = round_up(len, PAGE_SIZE);
/* We need to work out whether there's sufficient disk space to perform
* the write - but we can skip that check if we have space already
@@ -539,7 +547,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
pos = cachefiles_inject_read_error();
if (pos == 0)
- pos = vfs_llseek(file, *_start, SEEK_DATA);
+ pos = vfs_llseek(file, start, SEEK_DATA);
if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
if (pos == -ENXIO)
goto check_space; /* Unallocated tail */
@@ -547,7 +555,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
cachefiles_trace_seek_error);
return pos;
}
- if ((u64)pos >= (u64)*_start + *_len)
+ if ((u64)pos >= (u64)start + *_len)
goto check_space; /* Unallocated region */
/* We have a block that's at least partially filled - if we're low on
@@ -560,13 +568,13 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
pos = cachefiles_inject_read_error();
if (pos == 0)
- pos = vfs_llseek(file, *_start, SEEK_HOLE);
+ pos = vfs_llseek(file, start, SEEK_HOLE);
if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
trace_cachefiles_io_error(object, file_inode(file), pos,
cachefiles_trace_seek_error);
return pos;
}
- if ((u64)pos >= (u64)*_start + *_len)
+ if ((u64)pos >= (u64)start + *_len)
return 0; /* Fully allocated */
/* Partially allocated, but insufficient space: cull. */
@@ -574,7 +582,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
ret = cachefiles_inject_remove_error();
if (ret == 0)
ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
- *_start, *_len);
+ start, *_len);
if (ret < 0) {
trace_cachefiles_io_error(object, file_inode(file), ret,
cachefiles_trace_fallocate_error);
@@ -591,8 +599,8 @@ check_space:
}
static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
- loff_t *_start, size_t *_len, loff_t i_size,
- bool no_space_allocated_yet)
+ loff_t *_start, size_t *_len, size_t upper_len,
+ loff_t i_size, bool no_space_allocated_yet)
{
struct cachefiles_object *object = cachefiles_cres_object(cres);
struct cachefiles_cache *cache = object->volume->cache;
@@ -608,7 +616,7 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
cachefiles_begin_secure(cache, &saved_cred);
ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres),
- _start, _len,
+ _start, _len, upper_len,
no_space_allocated_yet);
cachefiles_end_secure(cache, saved_cred);
return ret;
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index b8fbbb1961bb..4ba42f1fa3b4 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -50,7 +50,7 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
return -ENOBUFS;
cachefiles_begin_secure(cache, &saved_cred);
- ret = __cachefiles_prepare_write(object, file, &pos, &len, true);
+ ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true);
cachefiles_end_secure(cache, saved_cred);
if (ret < 0)
return ret;
@@ -539,6 +539,9 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object)
struct fscache_volume *volume = object->volume->vcookie;
size_t volume_key_size, cookie_key_size, data_len;
+ if (!object->ondemand)
+ return 0;
+
/*
* CacheFiles will firstly check the cache file under the root cache
* directory. If the coherency check failed, it will fallback to
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 94df854147d3..7249d70e1a43 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -7,6 +7,7 @@ config CEPH_FS
select CRYPTO_AES
select CRYPTO
select NETFS_SUPPORT
+ select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
default n
help
Choose Y or M here to include support for mounting the
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 13af429ab030..1340d77124ae 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -159,27 +159,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset,
ceph_put_snap_context(snapc);
}
- folio_wait_fscache(folio);
-}
-
-static bool ceph_release_folio(struct folio *folio, gfp_t gfp)
-{
- struct inode *inode = folio->mapping->host;
- struct ceph_client *cl = ceph_inode_to_client(inode);
-
- doutc(cl, "%llx.%llx idx %lu (%sdirty)\n", ceph_vinop(inode),
- folio->index, folio_test_dirty(folio) ? "" : "not ");
-
- if (folio_test_private(folio))
- return false;
-
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
- ceph_fscache_note_page_release(inode);
- return true;
+ netfs_invalidate_folio(folio, offset, length);
}
static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
@@ -357,6 +337,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
u64 len = subreq->len;
bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
u64 off = subreq->start;
+ int extent_cnt;
if (ceph_inode_is_shutdown(inode)) {
err = -EIO;
@@ -370,8 +351,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
- CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
- NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
+ CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
req = NULL;
@@ -379,7 +360,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
}
if (sparse) {
- err = ceph_alloc_sparse_ext_map(&req->r_ops[0]);
+ extent_cnt = __ceph_sparse_read_ext_count(inode, len);
+ err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt);
if (err)
goto out;
}
@@ -509,7 +491,6 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq)
const struct netfs_request_ops ceph_netfs_ops = {
.init_request = ceph_init_request,
.free_request = ceph_netfs_free_request,
- .begin_cache_operation = ceph_begin_cache_operation,
.issue_read = ceph_netfs_issue_read,
.expand_readahead = ceph_netfs_expand_readahead,
.clamp_length = ceph_netfs_clamp_length,
@@ -1586,7 +1567,7 @@ const struct address_space_operations ceph_aops = {
.write_end = ceph_write_end,
.dirty_folio = ceph_dirty_folio,
.invalidate_folio = ceph_invalidate_folio,
- .release_folio = ceph_release_folio,
+ .release_folio = netfs_release_folio,
.direct_IO = noop_direct_IO,
};
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index dc502daac49a..20efac020394 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -43,38 +43,19 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
}
}
-static inline void ceph_fscache_unpin_writeback(struct inode *inode,
+static inline int ceph_fscache_unpin_writeback(struct inode *inode,
struct writeback_control *wbc)
{
- fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode)));
+ return netfs_unpin_writeback(inode, wbc);
}
-static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
- struct folio *folio)
-{
- struct ceph_inode_info *ci = ceph_inode(mapping->host);
-
- return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci));
-}
-
-static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
-{
- struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode));
-
- return fscache_begin_read_operation(&rreq->cache_resources, cookie);
-}
+#define ceph_fscache_dirty_folio netfs_dirty_folio
static inline bool ceph_is_cache_enabled(struct inode *inode)
{
return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode)));
}
-static inline void ceph_fscache_note_page_release(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- fscache_note_page_release(ceph_fscache_cookie(ci));
-}
#else /* CONFIG_CEPH_FSCACHE */
static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc,
struct fs_context *fc)
@@ -119,30 +100,18 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
{
}
-static inline void ceph_fscache_unpin_writeback(struct inode *inode,
- struct writeback_control *wbc)
+static inline int ceph_fscache_unpin_writeback(struct inode *inode,
+ struct writeback_control *wbc)
{
+ return 0;
}
-static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
- struct folio *folio)
-{
- return filemap_dirty_folio(mapping, folio);
-}
+#define ceph_fscache_dirty_folio filemap_dirty_folio
static inline bool ceph_is_cache_enabled(struct inode *inode)
{
return false;
}
-
-static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
-{
- return -ENOBUFS;
-}
-
-static inline void ceph_fscache_note_page_release(struct inode *inode)
-{
-}
#endif /* CONFIG_CEPH_FSCACHE */
#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2c0b8dc3dd0d..9c02f328c966 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4887,13 +4887,15 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
struct inode *dir,
int mds, int drop, int unless)
{
- struct dentry *parent = NULL;
struct ceph_mds_request_release *rel = *p;
struct ceph_dentry_info *di = ceph_dentry(dentry);
struct ceph_client *cl;
int force = 0;
int ret;
+ /* This shouldn't happen */
+ BUG_ON(!dir);
+
/*
* force an record for the directory caps if we have a dentry lease.
* this is racy (can't take i_ceph_lock and d_lock together), but it
@@ -4903,14 +4905,9 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
spin_lock(&dentry->d_lock);
if (di->lease_session && di->lease_session->s_mds == mds)
force = 1;
- if (!dir) {
- parent = dget(dentry->d_parent);
- dir = d_inode(parent);
- }
spin_unlock(&dentry->d_lock);
ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
- dput(parent);
cl = ceph_inode_to_client(dir);
spin_lock(&dentry->d_lock);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 678596684596..0e9f56eaba1e 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1593,10 +1593,12 @@ struct ceph_lease_walk_control {
unsigned long dir_lease_ttl;
};
+static int __dir_lease_check(const struct dentry *, struct ceph_lease_walk_control *);
+static int __dentry_lease_check(const struct dentry *);
+
static unsigned long
__dentry_leases_walk(struct ceph_mds_client *mdsc,
- struct ceph_lease_walk_control *lwc,
- int (*check)(struct dentry*, void*))
+ struct ceph_lease_walk_control *lwc)
{
struct ceph_dentry_info *di, *tmp;
struct dentry *dentry, *last = NULL;
@@ -1624,7 +1626,10 @@ __dentry_leases_walk(struct ceph_mds_client *mdsc,
goto next;
}
- ret = check(dentry, lwc);
+ if (lwc->dir_lease)
+ ret = __dir_lease_check(dentry, lwc);
+ else
+ ret = __dentry_lease_check(dentry);
if (ret & TOUCH) {
/* move it into tail of dir lease list */
__dentry_dir_lease_touch(mdsc, di);
@@ -1681,7 +1686,7 @@ next:
return freed;
}
-static int __dentry_lease_check(struct dentry *dentry, void *arg)
+static int __dentry_lease_check(const struct dentry *dentry)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
int ret;
@@ -1696,9 +1701,9 @@ static int __dentry_lease_check(struct dentry *dentry, void *arg)
return DELETE;
}
-static int __dir_lease_check(struct dentry *dentry, void *arg)
+static int __dir_lease_check(const struct dentry *dentry,
+ struct ceph_lease_walk_control *lwc)
{
- struct ceph_lease_walk_control *lwc = arg;
struct ceph_dentry_info *di = ceph_dentry(dentry);
int ret = __dir_lease_try_check(dentry);
@@ -1737,7 +1742,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc)
lwc.dir_lease = false;
lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2;
- freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check);
+ freed = __dentry_leases_walk(mdsc, &lwc);
if (!lwc.nr_to_scan) /* more invalid leases */
return -EAGAIN;
@@ -1747,7 +1752,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc)
lwc.dir_lease = true;
lwc.expire_dir_lease = freed < count;
lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
- freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check);
+ freed +=__dentry_leases_walk(mdsc, &lwc);
if (!lwc.nr_to_scan) /* more to check */
return -EAGAIN;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 726af69d4d62..a79f163ae4ed 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -286,8 +286,6 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
doutc(cl, "%llx.%llx parent %llx hash %x err=%d", vino.ino,
vino.snap, sfh->parent_ino, sfh->hash, err);
}
- if (IS_ERR(inode))
- return ERR_CAST(inode);
/* see comments in ceph_get_parent() */
return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode);
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d380d9dad0e0..abe8028d95bf 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1029,6 +1029,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
struct ceph_osd_req_op *op;
u64 read_off = off;
u64 read_len = len;
+ int extent_cnt;
/* determine new offset/length if encrypted */
ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len);
@@ -1068,7 +1069,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
op = &req->r_ops[0];
if (sparse) {
- ret = ceph_alloc_sparse_ext_map(op);
+ extent_cnt = __ceph_sparse_read_ext_count(inode, read_len);
+ ret = ceph_alloc_sparse_ext_map(op, extent_cnt);
if (ret) {
ceph_osdc_put_request(req);
break;
@@ -1465,6 +1467,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
ssize_t len;
struct ceph_osd_req_op *op;
int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ;
+ int extent_cnt;
if (write)
size = min_t(u64, size, fsc->mount_options->wsize);
@@ -1528,7 +1531,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
op = &req->r_ops[0];
if (sparse) {
- ret = ceph_alloc_sparse_ext_map(op);
+ extent_cnt = __ceph_sparse_read_ext_count(inode, size);
+ ret = ceph_alloc_sparse_ext_map(op, extent_cnt);
if (ret) {
ceph_osdc_put_request(req);
break;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 0679240f06db..0c25d326afc4 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -574,7 +574,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
doutc(fsc->client, "%p\n", &ci->netfs.inode);
/* Set parameters for the netfs library */
- netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
+ netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false);
spin_lock_init(&ci->i_ceph_lock);
@@ -694,7 +694,7 @@ void ceph_evict_inode(struct inode *inode)
percpu_counter_dec(&mdsc->metric.total_inodes);
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_state & I_PINNING_FSCACHE_WB)
+ if (inode->i_state & I_PINNING_NETFS_WB)
ceph_fscache_unuse_cookie(inode, true);
clear_inode(inode);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 02ebfabfc8ee..548d1de379f3 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1534,7 +1534,8 @@ static int encode_metric_spec(void **p, void *end)
* session message, specialization for CEPH_SESSION_REQUEST_OPEN
* to include additional client metadata fields.
*/
-static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
+static struct ceph_msg *
+create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq)
{
struct ceph_msg *msg;
struct ceph_mds_session_head *h;
@@ -1578,6 +1579,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
size = METRIC_BYTES(count);
extra_bytes += 2 + 4 + 4 + size;
+ /* flags, mds auth caps and oldest_client_tid */
+ extra_bytes += 4 + 4 + 8;
+
/* Allocate the message */
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
GFP_NOFS, false);
@@ -1589,16 +1593,16 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
end = p + msg->front.iov_len;
h = p;
- h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
+ h->op = cpu_to_le32(op);
h->seq = cpu_to_le64(seq);
/*
* Serialize client metadata into waiting buffer space, using
* the format that userspace expects for map<string, string>
*
- * ClientSession messages with metadata are v4
+ * ClientSession messages with metadata are v7
*/
- msg->hdr.version = cpu_to_le16(4);
+ msg->hdr.version = cpu_to_le16(7);
msg->hdr.compat_version = cpu_to_le16(1);
/* The write pointer, following the session_head structure */
@@ -1634,6 +1638,15 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
return ERR_PTR(ret);
}
+ /* version == 5, flags */
+ ceph_encode_32(&p, 0);
+
+ /* version == 6, mds auth caps */
+ ceph_encode_32(&p, 0);
+
+ /* version == 7, oldest_client_tid */
+ ceph_encode_64(&p, mdsc->oldest_tid);
+
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -1663,7 +1676,8 @@ static int __open_session(struct ceph_mds_client *mdsc,
session->s_renew_requested = jiffies;
/* send connect message */
- msg = create_session_open_msg(mdsc, session->s_seq);
+ msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_OPEN,
+ session->s_seq);
if (IS_ERR(msg))
return PTR_ERR(msg);
ceph_con_send(&session->s_con, msg);
@@ -2028,10 +2042,10 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
doutc(cl, "to mds%d (%s)\n", session->s_mds,
ceph_mds_state_name(state));
- msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+ msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_RENEWCAPS,
++session->s_renew_seq);
- if (!msg)
- return -ENOMEM;
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
ceph_con_send(&session->s_con, msg);
return 0;
}
@@ -4128,12 +4142,12 @@ static void handle_session(struct ceph_mds_session *session,
pr_info_client(cl, "mds%d reconnect success\n",
session->s_mds);
+ session->s_features = features;
if (session->s_state == CEPH_MDS_SESSION_OPEN) {
pr_notice_client(cl, "mds%d is already opened\n",
session->s_mds);
} else {
session->s_state = CEPH_MDS_SESSION_OPEN;
- session->s_features = features;
renewed_caps(mdsc, session, 0);
if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
&session->s_features))
@@ -5870,7 +5884,8 @@ static void mds_peer_reset(struct ceph_connection *con)
pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n",
s->s_mds);
- if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO)
+ if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO &&
+ ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT)
send_mds_reconnect(mdsc, s);
}
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 9d36c3532de1..06ee397e0c3a 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -197,10 +197,10 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
}
/*
- * This function walks through the snaprealm for an inode and returns the
- * ceph_snap_realm for the first snaprealm that has quotas set (max_files,
+ * This function walks through the snaprealm for an inode and set the
+ * realmp with the first snaprealm that has quotas set (max_files,
* max_bytes, or any, depending on the 'which_quota' argument). If the root is
- * reached, return the root ceph_snap_realm instead.
+ * reached, set the realmp with the root ceph_snap_realm instead.
*
* Note that the caller is responsible for calling ceph_put_snap_realm() on the
* returned realm.
@@ -211,10 +211,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
* this function will return -EAGAIN; otherwise, the snaprealms walk-through
* will be restarted.
*/
-static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
- struct inode *inode,
- enum quota_get_realm which_quota,
- bool retry)
+static int get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode,
+ enum quota_get_realm which_quota,
+ struct ceph_snap_realm **realmp, bool retry)
{
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci = NULL;
@@ -222,8 +221,10 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
struct inode *in;
bool has_quota;
+ if (realmp)
+ *realmp = NULL;
if (ceph_snap(inode) != CEPH_NOSNAP)
- return NULL;
+ return 0;
restart:
realm = ceph_inode(inode)->i_snap_realm;
@@ -250,7 +251,7 @@ restart:
break;
ceph_put_snap_realm(mdsc, realm);
if (!retry)
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
goto restart;
}
@@ -259,8 +260,11 @@ restart:
iput(in);
next = realm->parent;
- if (has_quota || !next)
- return realm;
+ if (has_quota || !next) {
+ if (realmp)
+ *realmp = realm;
+ return 0;
+ }
ceph_get_snap_realm(mdsc, next);
ceph_put_snap_realm(mdsc, realm);
@@ -269,7 +273,7 @@ restart:
if (realm)
ceph_put_snap_realm(mdsc, realm);
- return NULL;
+ return 0;
}
bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
@@ -277,6 +281,7 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb);
struct ceph_snap_realm *old_realm, *new_realm;
bool is_same;
+ int ret;
restart:
/*
@@ -286,9 +291,9 @@ restart:
* dropped and we can then restart the whole operation.
*/
down_read(&mdsc->snap_rwsem);
- old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true);
- new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false);
- if (PTR_ERR(new_realm) == -EAGAIN) {
+ get_quota_realm(mdsc, old, QUOTA_GET_ANY, &old_realm, true);
+ ret = get_quota_realm(mdsc, new, QUOTA_GET_ANY, &new_realm, false);
+ if (ret == -EAGAIN) {
up_read(&mdsc->snap_rwsem);
if (old_realm)
ceph_put_snap_realm(mdsc, old_realm);
@@ -492,8 +497,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
bool is_updated = false;
down_read(&mdsc->snap_rwsem);
- realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root),
- QUOTA_GET_MAX_BYTES, true);
+ get_quota_realm(mdsc, d_inode(fsc->sb->s_root), QUOTA_GET_MAX_BYTES,
+ &realm, true);
up_read(&mdsc->snap_rwsem);
if (!realm)
return false;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index fe0f64a0acb2..b06e2bc86221 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -3,6 +3,7 @@
#define _FS_CEPH_SUPER_H
#include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/osd_client.h>
#include <asm/unaligned.h>
#include <linux/backing-dev.h>
@@ -1407,6 +1408,19 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci,
ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota);
}
+static inline int __ceph_sparse_read_ext_count(struct inode *inode, u64 len)
+{
+ int cnt = 0;
+
+ if (IS_ENCRYPTED(inode)) {
+ cnt = len >> CEPH_FSCRYPT_BLOCK_SHIFT;
+ if (cnt > CEPH_SPARSE_EXT_ARRAY_INITIAL)
+ cnt = 0;
+ }
+
+ return cnt;
+}
+
extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg);
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 1d318f85232d..fffd3919343e 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -114,8 +114,11 @@ config EROFS_FS_ZIP_DEFLATE
config EROFS_FS_ONDEMAND
bool "EROFS fscache-based on-demand read support"
- depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y)
- default n
+ depends on EROFS_FS
+ select NETFS_SUPPORT
+ select FSCACHE
+ select CACHEFILES
+ select CACHEFILES_ONDEMAND
help
This permits EROFS to use fscache-backed data blobs with on-demand
read support.
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 1d65b9f60a39..072ef6a66823 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -408,7 +408,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
int size, ret = 0;
if (!erofs_sb_has_compr_cfgs(sbi)) {
- sbi->available_compr_algs = Z_EROFS_COMPRESSION_LZ4;
+ sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4;
return z_erofs_load_lz4_config(sb, dsb, NULL, 0);
}
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 87ff35bff8d5..bc12030393b2 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -165,10 +165,10 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
{
int ret;
- struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private;
+ struct erofs_fscache *ctx = folio->mapping->host->i_private;
struct erofs_fscache_request *req;
- req = erofs_fscache_req_alloc(folio_mapping(folio),
+ req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
if (IS_ERR(req)) {
folio_unlock(folio);
@@ -276,7 +276,7 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
struct erofs_fscache_request *req;
int ret;
- req = erofs_fscache_req_alloc(folio_mapping(folio),
+ req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
if (IS_ERR(req)) {
folio_unlock(folio);
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 9753875e41cb..e313c936351d 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -454,7 +454,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
.map = map,
};
int err = 0;
- unsigned int lclusterbits, endoff;
+ unsigned int lclusterbits, endoff, afmt;
unsigned long initial_lcn;
unsigned long long ofs, end;
@@ -543,17 +543,20 @@ static int z_erofs_do_map_blocks(struct inode *inode,
err = -EFSCORRUPTED;
goto unmap_out;
}
- if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
- map->m_algorithmformat =
- Z_EROFS_COMPRESSION_INTERLACED;
- else
- map->m_algorithmformat =
- Z_EROFS_COMPRESSION_SHIFTED;
- } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
- map->m_algorithmformat = vi->z_algorithmtype[1];
+ afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ?
+ Z_EROFS_COMPRESSION_INTERLACED :
+ Z_EROFS_COMPRESSION_SHIFTED;
} else {
- map->m_algorithmformat = vi->z_algorithmtype[0];
+ afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
+ vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
+ if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
+ erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ afmt, vi->nid);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
}
+ map->m_algorithmformat = afmt;
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
diff --git a/fs/exec.c b/fs/exec.c
index 73e4045df271..af4fbb61cd53 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -128,7 +128,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
struct filename *tmp = getname(library);
int error = PTR_ERR(tmp);
static const struct open_flags uselib_flags = {
- .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+ .open_flag = O_LARGEFILE | O_RDONLY,
.acc_mode = MAY_READ | MAY_EXEC,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
@@ -904,6 +904,10 @@ EXPORT_SYMBOL(transfer_args_to_stack);
#endif /* CONFIG_MMU */
+/*
+ * On success, caller must call do_close_execat() on the returned
+ * struct file to close it.
+ */
static struct file *do_open_execat(int fd, struct filename *name, int flags)
{
struct file *file;
@@ -948,6 +952,17 @@ exit:
return ERR_PTR(err);
}
+/**
+ * open_exec - Open a path name for execution
+ *
+ * @name: path name to open with the intent of executing it.
+ *
+ * Returns ERR_PTR on failure or allocated struct file on success.
+ *
+ * As this is a wrapper for the internal do_open_execat(), callers
+ * must call allow_write_access() before fput() on release. Also see
+ * do_close_execat().
+ */
struct file *open_exec(const char *name)
{
struct filename *filename = getname_kernel(name);
@@ -1409,6 +1424,9 @@ int begin_new_exec(struct linux_binprm * bprm)
out_unlock:
up_write(&me->signal->exec_update_lock);
+ if (!bprm->cred)
+ mutex_unlock(&me->signal->cred_guard_mutex);
+
out:
return retval;
}
@@ -1484,6 +1502,15 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
return -ENOMEM;
}
+/* Matches do_open_execat() */
+static void do_close_execat(struct file *file)
+{
+ if (!file)
+ return;
+ allow_write_access(file);
+ fput(file);
+}
+
static void free_bprm(struct linux_binprm *bprm)
{
if (bprm->mm) {
@@ -1495,10 +1522,7 @@ static void free_bprm(struct linux_binprm *bprm)
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
- if (bprm->file) {
- allow_write_access(bprm->file);
- fput(bprm->file);
- }
+ do_close_execat(bprm->file);
if (bprm->executable)
fput(bprm->executable);
/* If a binfmt changed the interp, free it. */
@@ -1508,12 +1532,23 @@ static void free_bprm(struct linux_binprm *bprm)
kfree(bprm);
}
-static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
+static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags)
{
- struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ struct linux_binprm *bprm;
+ struct file *file;
int retval = -ENOMEM;
- if (!bprm)
- goto out;
+
+ file = do_open_execat(fd, filename, flags);
+ if (IS_ERR(file))
+ return ERR_CAST(file);
+
+ bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ if (!bprm) {
+ do_close_execat(file);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ bprm->file = file;
if (fd == AT_FDCWD || filename->name[0] == '/') {
bprm->filename = filename->name;
@@ -1526,18 +1561,28 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
if (!bprm->fdpath)
goto out_free;
+ /*
+ * Record that a name derived from an O_CLOEXEC fd will be
+ * inaccessible after exec. This allows the code in exec to
+ * choose to fail when the executable is not mmaped into the
+ * interpreter and an open file descriptor is not passed to
+ * the interpreter. This makes for a better user experience
+ * than having the interpreter start and then immediately fail
+ * when it finds the executable is inaccessible.
+ */
+ if (get_close_on_exec(fd))
+ bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
+
bprm->filename = bprm->fdpath;
}
bprm->interp = bprm->filename;
retval = bprm_mm_init(bprm);
- if (retval)
- goto out_free;
- return bprm;
+ if (!retval)
+ return bprm;
out_free:
free_bprm(bprm);
-out:
return ERR_PTR(retval);
}
@@ -1588,6 +1633,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
}
rcu_read_unlock();
+ /* "users" and "in_exec" locked for copy_fs() */
if (p->fs->users > n_fs)
bprm->unsafe |= LSM_UNSAFE_SHARE;
else
@@ -1804,13 +1850,8 @@ static int exec_binprm(struct linux_binprm *bprm)
return 0;
}
-/*
- * sys_execve() executes a new program.
- */
-static int bprm_execve(struct linux_binprm *bprm,
- int fd, struct filename *filename, int flags)
+static int bprm_execve(struct linux_binprm *bprm)
{
- struct file *file;
int retval;
retval = prepare_bprm_creds(bprm);
@@ -1826,26 +1867,8 @@ static int bprm_execve(struct linux_binprm *bprm,
current->in_execve = 1;
sched_mm_cid_before_execve(current);
- file = do_open_execat(fd, filename, flags);
- retval = PTR_ERR(file);
- if (IS_ERR(file))
- goto out_unmark;
-
sched_exec();
- bprm->file = file;
- /*
- * Record that a name derived from an O_CLOEXEC fd will be
- * inaccessible after exec. This allows the code in exec to
- * choose to fail when the executable is not mmaped into the
- * interpreter and an open file descriptor is not passed to
- * the interpreter. This makes for a better user experience
- * than having the interpreter start and then immediately fail
- * when it finds the executable is inaccessible.
- */
- if (bprm->fdpath && get_close_on_exec(fd))
- bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
-
/* Set the unchanging part of bprm->cred */
retval = security_bprm_creds_for_exec(bprm);
if (retval)
@@ -1875,7 +1898,6 @@ out:
if (bprm->point_of_no_return && !fatal_signal_pending(current))
force_fatal_sig(SIGSEGV);
-out_unmark:
sched_mm_cid_after_execve(current);
current->fs->in_exec = 0;
current->in_execve = 0;
@@ -1910,7 +1932,7 @@ static int do_execveat_common(int fd, struct filename *filename,
* further execve() calls fail. */
current->flags &= ~PF_NPROC_EXCEEDED;
- bprm = alloc_bprm(fd, filename);
+ bprm = alloc_bprm(fd, filename, flags);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
@@ -1959,7 +1981,7 @@ static int do_execveat_common(int fd, struct filename *filename,
bprm->argc = 1;
}
- retval = bprm_execve(bprm, fd, filename, flags);
+ retval = bprm_execve(bprm);
out_free:
free_bprm(bprm);
@@ -1984,7 +2006,7 @@ int kernel_execve(const char *kernel_filename,
if (IS_ERR(filename))
return PTR_ERR(filename);
- bprm = alloc_bprm(fd, filename);
+ bprm = alloc_bprm(fd, filename, 0);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
@@ -2019,7 +2041,7 @@ int kernel_execve(const char *kernel_filename,
if (retval < 0)
goto out_free;
- retval = bprm_execve(bprm, fd, filename, 0);
+ retval = bprm_execve(bprm);
out_free:
free_bprm(bprm);
out_ret:
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1767493dffda..3d84fcc471c6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1675,11 +1675,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
inode->i_state |= I_DIRTY_PAGES;
- else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) {
+ else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) {
if (!(inode->i_state & I_DIRTY_PAGES)) {
- inode->i_state &= ~I_PINNING_FSCACHE_WB;
- wbc->unpinned_fscache_wb = true;
- dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */
+ inode->i_state &= ~I_PINNING_NETFS_WB;
+ wbc->unpinned_netfs_wb = true;
+ dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */
}
}
@@ -1691,7 +1691,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (ret == 0)
ret = err;
}
- wbc->unpinned_fscache_wb = false;
+ wbc->unpinned_netfs_wb = false;
trace_writeback_single_inode(inode, wbc, nr_to_write);
return ret;
}
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
deleted file mode 100644
index b313a978ae0a..000000000000
--- a/fs/fscache/Kconfig
+++ /dev/null
@@ -1,40 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config FSCACHE
- tristate "General filesystem local caching manager"
- select NETFS_SUPPORT
- help
- This option enables a generic filesystem caching manager that can be
- used by various network and other filesystems to cache data locally.
- Different sorts of caches can be plugged in, depending on the
- resources available.
-
- See Documentation/filesystems/caching/fscache.rst for more information.
-
-config FSCACHE_STATS
- bool "Gather statistical information on local caching"
- depends on FSCACHE && PROC_FS
- select NETFS_STATS
- help
- This option causes statistical information to be gathered on local
- caching and exported through file:
-
- /proc/fs/fscache/stats
-
- The gathering of statistics adds a certain amount of overhead to
- execution as there are a quite a few stats gathered, and on a
- multi-CPU system these may be on cachelines that keep bouncing
- between CPUs. On the other hand, the stats are very useful for
- debugging purposes. Saying 'Y' here is recommended.
-
- See Documentation/filesystems/caching/fscache.rst for more information.
-
-config FSCACHE_DEBUG
- bool "Debug FS-Cache"
- depends on FSCACHE
- help
- This permits debugging to be dynamically enabled in the local caching
- management module. If this is set, the debugging output may be
- enabled by setting bits in /sys/modules/fscache/parameter/debug.
-
- See Documentation/filesystems/caching/fscache.rst for more information.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
deleted file mode 100644
index afb090ea16c4..000000000000
--- a/fs/fscache/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for general filesystem caching code
-#
-
-fscache-y := \
- cache.o \
- cookie.o \
- io.o \
- main.o \
- volume.o
-
-fscache-$(CONFIG_PROC_FS) += proc.o
-fscache-$(CONFIG_FSCACHE_STATS) += stats.o
-
-obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
deleted file mode 100644
index 1336f517e9b1..000000000000
--- a/fs/fscache/internal.h
+++ /dev/null
@@ -1,277 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* Internal definitions for FS-Cache
- *
- * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#ifdef pr_fmt
-#undef pr_fmt
-#endif
-
-#define pr_fmt(fmt) "FS-Cache: " fmt
-
-#include <linux/slab.h>
-#include <linux/fscache-cache.h>
-#include <trace/events/fscache.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-
-/*
- * cache.c
- */
-#ifdef CONFIG_PROC_FS
-extern const struct seq_operations fscache_caches_seq_ops;
-#endif
-bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
-void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
-struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache);
-void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where);
-
-static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache)
-{
- return smp_load_acquire(&cache->state);
-}
-
-static inline bool fscache_cache_is_live(const struct fscache_cache *cache)
-{
- return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE;
-}
-
-static inline void fscache_set_cache_state(struct fscache_cache *cache,
- enum fscache_cache_state new_state)
-{
- smp_store_release(&cache->state, new_state);
-
-}
-
-static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache,
- enum fscache_cache_state old_state,
- enum fscache_cache_state new_state)
-{
- return try_cmpxchg_release(&cache->state, &old_state, new_state);
-}
-
-/*
- * cookie.c
- */
-extern struct kmem_cache *fscache_cookie_jar;
-#ifdef CONFIG_PROC_FS
-extern const struct seq_operations fscache_cookies_seq_ops;
-#endif
-extern struct timer_list fscache_cookie_lru_timer;
-
-extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
-extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
- enum fscache_access_trace why);
-
-static inline void fscache_see_cookie(struct fscache_cookie *cookie,
- enum fscache_cookie_trace where)
-{
- trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref),
- where);
-}
-
-/*
- * main.c
- */
-extern unsigned fscache_debug;
-
-extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len);
-
-/*
- * proc.c
- */
-#ifdef CONFIG_PROC_FS
-extern int __init fscache_proc_init(void);
-extern void fscache_proc_cleanup(void);
-#else
-#define fscache_proc_init() (0)
-#define fscache_proc_cleanup() do {} while (0)
-#endif
-
-/*
- * stats.c
- */
-#ifdef CONFIG_FSCACHE_STATS
-extern atomic_t fscache_n_volumes;
-extern atomic_t fscache_n_volumes_collision;
-extern atomic_t fscache_n_volumes_nomem;
-extern atomic_t fscache_n_cookies;
-extern atomic_t fscache_n_cookies_lru;
-extern atomic_t fscache_n_cookies_lru_expired;
-extern atomic_t fscache_n_cookies_lru_removed;
-extern atomic_t fscache_n_cookies_lru_dropped;
-
-extern atomic_t fscache_n_acquires;
-extern atomic_t fscache_n_acquires_ok;
-extern atomic_t fscache_n_acquires_oom;
-
-extern atomic_t fscache_n_invalidates;
-
-extern atomic_t fscache_n_relinquishes;
-extern atomic_t fscache_n_relinquishes_retire;
-extern atomic_t fscache_n_relinquishes_dropped;
-
-extern atomic_t fscache_n_resizes;
-extern atomic_t fscache_n_resizes_null;
-
-static inline void fscache_stat(atomic_t *stat)
-{
- atomic_inc(stat);
-}
-
-static inline void fscache_stat_d(atomic_t *stat)
-{
- atomic_dec(stat);
-}
-
-#define __fscache_stat(stat) (stat)
-
-int fscache_stats_show(struct seq_file *m, void *v);
-#else
-
-#define __fscache_stat(stat) (NULL)
-#define fscache_stat(stat) do {} while (0)
-#define fscache_stat_d(stat) do {} while (0)
-#endif
-
-/*
- * volume.c
- */
-#ifdef CONFIG_PROC_FS
-extern const struct seq_operations fscache_volumes_seq_ops;
-#endif
-
-struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
- enum fscache_volume_trace where);
-void fscache_put_volume(struct fscache_volume *volume,
- enum fscache_volume_trace where);
-bool fscache_begin_volume_access(struct fscache_volume *volume,
- struct fscache_cookie *cookie,
- enum fscache_access_trace why);
-void fscache_create_volume(struct fscache_volume *volume, bool wait);
-
-
-/*****************************************************************************/
-/*
- * debug tracing
- */
-#define dbgprintk(FMT, ...) \
- printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
-
-#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
-
-#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
-
-#ifdef __KDEBUG
-#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
-#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
-#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
-
-#elif defined(CONFIG_FSCACHE_DEBUG)
-#define _enter(FMT, ...) \
-do { \
- if (__do_kdebug(ENTER)) \
- kenter(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#define _leave(FMT, ...) \
-do { \
- if (__do_kdebug(LEAVE)) \
- kleave(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#define _debug(FMT, ...) \
-do { \
- if (__do_kdebug(DEBUG)) \
- kdebug(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#else
-#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
-#endif
-
-/*
- * determine whether a particular optional debugging point should be logged
- * - we need to go through three steps to persuade cpp to correctly join the
- * shorthand in FSCACHE_DEBUG_LEVEL with its prefix
- */
-#define ____do_kdebug(LEVEL, POINT) \
- unlikely((fscache_debug & \
- (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3))))
-#define ___do_kdebug(LEVEL, POINT) \
- ____do_kdebug(LEVEL, POINT)
-#define __do_kdebug(POINT) \
- ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)
-
-#define FSCACHE_DEBUG_CACHE 0
-#define FSCACHE_DEBUG_COOKIE 1
-#define FSCACHE_DEBUG_OBJECT 2
-#define FSCACHE_DEBUG_OPERATION 3
-
-#define FSCACHE_POINT_ENTER 1
-#define FSCACHE_POINT_LEAVE 2
-#define FSCACHE_POINT_DEBUG 4
-
-#ifndef FSCACHE_DEBUG_LEVEL
-#define FSCACHE_DEBUG_LEVEL CACHE
-#endif
-
-/*
- * assertions
- */
-#if 1 /* defined(__KDEBUGALL) */
-
-#define ASSERT(X) \
-do { \
- if (unlikely(!(X))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- BUG(); \
- } \
-} while (0)
-
-#define ASSERTCMP(X, OP, Y) \
-do { \
- if (unlikely(!((X) OP (Y)))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- pr_err("%lx " #OP " %lx is false\n", \
- (unsigned long)(X), (unsigned long)(Y)); \
- BUG(); \
- } \
-} while (0)
-
-#define ASSERTIF(C, X) \
-do { \
- if (unlikely((C) && !(X))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- BUG(); \
- } \
-} while (0)
-
-#define ASSERTIFCMP(C, X, OP, Y) \
-do { \
- if (unlikely((C) && !((X) OP (Y)))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- pr_err("%lx " #OP " %lx is false\n", \
- (unsigned long)(X), (unsigned long)(Y)); \
- BUG(); \
- } \
-} while (0)
-
-#else
-
-#define ASSERT(X) do {} while (0)
-#define ASSERTCMP(X, OP, Y) do {} while (0)
-#define ASSERTIF(C, X) do {} while (0)
-#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
-
-#endif /* assert or not */
diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
index b4db21022cb4..bec805e0c44c 100644
--- a/fs/netfs/Kconfig
+++ b/fs/netfs/Kconfig
@@ -21,3 +21,42 @@ config NETFS_STATS
multi-CPU system these may be on cachelines that keep bouncing
between CPUs. On the other hand, the stats are very useful for
debugging purposes. Saying 'Y' here is recommended.
+
+config FSCACHE
+ bool "General filesystem local caching manager"
+ depends on NETFS_SUPPORT
+ help
+ This option enables a generic filesystem caching manager that can be
+ used by various network and other filesystems to cache data locally.
+ Different sorts of caches can be plugged in, depending on the
+ resources available.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
+
+config FSCACHE_STATS
+ bool "Gather statistical information on local caching"
+ depends on FSCACHE && PROC_FS
+ select NETFS_STATS
+ help
+ This option causes statistical information to be gathered on local
+ caching and exported through file:
+
+ /proc/fs/fscache/stats
+
+ The gathering of statistics adds a certain amount of overhead to
+ execution as there are a quite a few stats gathered, and on a
+ multi-CPU system these may be on cachelines that keep bouncing
+ between CPUs. On the other hand, the stats are very useful for
+ debugging purposes. Saying 'Y' here is recommended.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
+
+config FSCACHE_DEBUG
+ bool "Debug FS-Cache"
+ depends on FSCACHE
+ help
+ This permits debugging to be dynamically enabled in the local caching
+ management module. If this is set, the debugging output may be
+ enabled by setting bits in /sys/modules/fscache/parameter/debug.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index 386d6fb92793..d4d1d799819e 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -2,11 +2,29 @@
netfs-y := \
buffered_read.o \
+ buffered_write.o \
+ direct_read.o \
+ direct_write.o \
io.o \
iterator.o \
+ locking.o \
main.o \
- objects.o
+ misc.o \
+ objects.o \
+ output.o
netfs-$(CONFIG_NETFS_STATS) += stats.o
-obj-$(CONFIG_NETFS_SUPPORT) := netfs.o
+netfs-$(CONFIG_FSCACHE) += \
+ fscache_cache.o \
+ fscache_cookie.o \
+ fscache_io.o \
+ fscache_main.o \
+ fscache_volume.o
+
+ifeq ($(CONFIG_PROC_FS),y)
+netfs-$(CONFIG_FSCACHE) += fscache_proc.o
+endif
+netfs-$(CONFIG_FSCACHE_STATS) += fscache_stats.o
+
+obj-$(CONFIG_NETFS_SUPPORT) += netfs.o
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 2cd3ccf4c439..3298c29b5548 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -16,6 +16,7 @@
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
+ struct netfs_folio *finfo;
struct folio *folio;
pgoff_t start_page = rreq->start / PAGE_SIZE;
pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
@@ -63,6 +64,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
break;
}
if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
folio_start_fscache(folio);
folio_started = true;
}
@@ -86,11 +88,20 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
if (!pg_failed) {
flush_dcache_folio(folio);
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
+ if (finfo->netfs_group)
+ folio_change_private(folio, finfo->netfs_group);
+ else
+ folio_detach_private(folio);
+ kfree(finfo);
+ }
folio_mark_uptodate(folio);
}
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
- if (folio_index(folio) == rreq->no_unlock_folio &&
+ if (folio->index == rreq->no_unlock_folio &&
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
_debug("no unlock");
else
@@ -147,6 +158,15 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq,
}
}
+/*
+ * Begin an operation, and fetch the stored zero point value from the cookie if
+ * available.
+ */
+static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
+{
+ return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
+}
+
/**
* netfs_readahead - Helper to manage a read request
* @ractl: The description of the readahead request
@@ -180,11 +200,9 @@ void netfs_readahead(struct readahead_control *ractl)
if (IS_ERR(rreq))
return;
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto cleanup_free;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto cleanup_free;
netfs_stat(&netfs_n_rh_readahead);
trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
@@ -192,6 +210,10 @@ void netfs_readahead(struct readahead_control *ractl)
netfs_rreq_expand(rreq, ractl);
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
+ rreq->start, rreq->len);
+
/* Drop the refs on the folios here rather than in the cache or
* filesystem. The locks will be dropped in netfs_rreq_unlock().
*/
@@ -199,6 +221,7 @@ void netfs_readahead(struct readahead_control *ractl)
;
netfs_begin_read(rreq, false);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return;
cleanup_free:
@@ -223,12 +246,13 @@ EXPORT_SYMBOL(netfs_readahead);
*/
int netfs_read_folio(struct file *file, struct folio *folio)
{
- struct address_space *mapping = folio_file_mapping(folio);
+ struct address_space *mapping = folio->mapping;
struct netfs_io_request *rreq;
struct netfs_inode *ctx = netfs_inode(mapping->host);
+ struct folio *sink = NULL;
int ret;
- _enter("%lx", folio_index(folio));
+ _enter("%lx", folio->index);
rreq = netfs_alloc_request(mapping, file,
folio_file_pos(folio), folio_size(folio),
@@ -238,15 +262,64 @@ int netfs_read_folio(struct file *file, struct folio *folio)
goto alloc_error;
}
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto discard;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto discard;
netfs_stat(&netfs_n_rh_readpage);
trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
- return netfs_begin_read(rreq, true);
+
+ /* Set up the output buffer */
+ if (folio_test_dirty(folio)) {
+ /* Handle someone trying to read from an unflushed streaming
+ * write. We fiddle the buffer so that a gap at the beginning
+ * and/or a gap at the end get copied to, but the middle is
+ * discarded.
+ */
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct bio_vec *bvec;
+ unsigned int from = finfo->dirty_offset;
+ unsigned int to = from + finfo->dirty_len;
+ unsigned int off = 0, i = 0;
+ size_t flen = folio_size(folio);
+ size_t nr_bvec = flen / PAGE_SIZE + 2;
+ size_t part;
+
+ ret = -ENOMEM;
+ bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
+ if (!bvec)
+ goto discard;
+
+ sink = folio_alloc(GFP_KERNEL, 0);
+ if (!sink)
+ goto discard;
+
+ trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
+
+ rreq->direct_bv = bvec;
+ rreq->direct_bv_count = nr_bvec;
+ if (from > 0) {
+ bvec_set_folio(&bvec[i++], folio, from, 0);
+ off = from;
+ }
+ while (off < to) {
+ part = min_t(size_t, to - off, PAGE_SIZE);
+ bvec_set_folio(&bvec[i++], sink, part, 0);
+ off += part;
+ }
+ if (to < flen)
+ bvec_set_folio(&bvec[i++], folio, flen - to, to);
+ iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
+ } else {
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+ }
+
+ ret = netfs_begin_read(rreq, true);
+ if (sink)
+ folio_put(sink);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ return ret < 0 ? ret : 0;
discard:
netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
@@ -387,14 +460,12 @@ retry:
ret = PTR_ERR(rreq);
goto error;
}
- rreq->no_unlock_folio = folio_index(folio);
+ rreq->no_unlock_folio = folio->index;
__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto error_put;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto error_put;
netfs_stat(&netfs_n_rh_write_begin);
trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
@@ -405,6 +476,10 @@ retry:
ractl._nr_pages = folio_nr_pages(folio);
netfs_rreq_expand(rreq, &ractl);
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+
/* We hold the folio locks, so we can drop the references */
folio_get(folio);
while (readahead_folio(&ractl))
@@ -413,6 +488,7 @@ retry:
ret = netfs_begin_read(rreq, true);
if (ret < 0)
goto error;
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio:
ret = folio_wait_fscache_killable(folio);
@@ -434,3 +510,124 @@ error:
return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
+
+/*
+ * Preload the data into a page we're proposing to write into.
+ */
+int netfs_prefetch_for_write(struct file *file, struct folio *folio,
+ size_t offset, size_t len)
+{
+ struct netfs_io_request *rreq;
+ struct address_space *mapping = folio->mapping;
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
+ unsigned long long start = folio_pos(folio);
+ size_t flen = folio_size(folio);
+ int ret;
+
+ _enter("%zx @%llx", flen, start);
+
+ ret = -ENOMEM;
+
+ rreq = netfs_alloc_request(mapping, file, start, flen,
+ NETFS_READ_FOR_WRITE);
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
+ goto error;
+ }
+
+ rreq->no_unlock_folio = folio->index;
+ __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto error_put;
+
+ netfs_stat(&netfs_n_rh_write_begin);
+ trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);
+
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+
+ ret = netfs_begin_read(rreq, true);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ return ret;
+
+error_put:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/**
+ * netfs_buffered_read_iter - Filesystem buffered I/O read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the ->read_iter() routine for all filesystems that can use the page
+ * cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
+ * returned when no data can be read without waiting for I/O requests to
+ * complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
+ * shall be made for the read or for readahead. When no data can be read,
+ * -EAGAIN shall be returned. When readahead would be triggered, a partial,
+ * possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct netfs_inode *ictx = netfs_inode(inode);
+ ssize_t ret;
+
+ if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
+ return -EINVAL;
+
+ ret = netfs_start_io_read(inode);
+ if (ret == 0) {
+ ret = filemap_read(iocb, iter, 0);
+ netfs_end_io_read(inode);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(netfs_buffered_read_iter);
+
+/**
+ * netfs_file_read_iter - Generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the ->read_iter() routine for all filesystems that can use the page
+ * cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
+ * returned when no data can be read without waiting for I/O requests to
+ * complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
+ * shall be made for the read or for readahead. When no data can be read,
+ * -EAGAIN shall be returned. When readahead would be triggered, a partial,
+ * possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);
+
+ if ((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
+ return netfs_unbuffered_read_iter(iocb, iter);
+
+ return netfs_buffered_read_iter(iocb, iter);
+}
+EXPORT_SYMBOL(netfs_file_read_iter);
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
new file mode 100644
index 000000000000..a3059b3168fd
--- /dev/null
+++ b/fs/netfs/buffered_write.c
@@ -0,0 +1,1254 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/*
+ * Determined write method. Adjust netfs_folio_traces if this is changed.
+ */
+enum netfs_how_to_modify {
+ NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */
+ NETFS_JUST_PREFETCH, /* We have to read the folio anyway */
+ NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */
+ NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */
+ NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */
+ NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */
+ NETFS_FLUSH_CONTENT, /* Flush incompatible content. */
+};
+
+static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);
+
+static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
+{
+ if (netfs_group && !folio_get_private(folio))
+ folio_attach_private(folio, netfs_get_group(netfs_group));
+}
+
+#if IS_ENABLED(CONFIG_FSCACHE)
+static void netfs_folio_start_fscache(bool caching, struct folio *folio)
+{
+ if (caching)
+ folio_start_fscache(folio);
+}
+#else
+static void netfs_folio_start_fscache(bool caching, struct folio *folio)
+{
+}
+#endif
+
+/*
+ * Decide how we should modify a folio. We might be attempting to do
+ * write-streaming, in which case we don't want to a local RMW cycle if we can
+ * avoid it. If we're doing local caching or content crypto, we award that
+ * priority over avoiding RMW. If the file is open readably, then we also
+ * assume that we may want to read what we wrote.
+ */
+static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
+ struct file *file,
+ struct folio *folio,
+ void *netfs_group,
+ size_t flen,
+ size_t offset,
+ size_t len,
+ bool maybe_trouble)
+{
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ loff_t pos = folio_file_pos(folio);
+
+ _enter("");
+
+ if (netfs_folio_group(folio) != netfs_group)
+ return NETFS_FLUSH_CONTENT;
+
+ if (folio_test_uptodate(folio))
+ return NETFS_FOLIO_IS_UPTODATE;
+
+ if (pos >= ctx->zero_point)
+ return NETFS_MODIFY_AND_CLEAR;
+
+ if (!maybe_trouble && offset == 0 && len >= flen)
+ return NETFS_WHOLE_FOLIO_MODIFY;
+
+ if (file->f_mode & FMODE_READ)
+ goto no_write_streaming;
+ if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
+ goto no_write_streaming;
+
+ if (netfs_is_cache_enabled(ctx)) {
+ /* We don't want to get a streaming write on a file that loses
+ * caching service temporarily because the backing store got
+ * culled.
+ */
+ if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
+ set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags);
+ goto no_write_streaming;
+ }
+
+ if (!finfo)
+ return NETFS_STREAMING_WRITE;
+
+ /* We can continue a streaming write only if it continues on from the
+ * previous. If it overlaps, we must flush lest we suffer a partial
+ * copy and disjoint dirty regions.
+ */
+ if (offset == finfo->dirty_offset + finfo->dirty_len)
+ return NETFS_STREAMING_WRITE_CONT;
+ return NETFS_FLUSH_CONTENT;
+
+no_write_streaming:
+ if (finfo) {
+ netfs_stat(&netfs_n_wh_wstream_conflict);
+ return NETFS_FLUSH_CONTENT;
+ }
+ return NETFS_JUST_PREFETCH;
+}
+
+/*
+ * Grab a folio for writing and lock it. Attempt to allocate as large a folio
+ * as possible to hold as much of the remaining length as possible in one go.
+ */
+static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
+ loff_t pos, size_t part)
+{
+ pgoff_t index = pos / PAGE_SIZE;
+ fgf_t fgp_flags = FGP_WRITEBEGIN;
+
+ if (mapping_large_folio_support(mapping))
+ fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part);
+
+ return __filemap_get_folio(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping));
+}
+
+/**
+ * netfs_perform_write - Copy data into the pagecache.
+ * @iocb: The operation parameters
+ * @iter: The source buffer
+ * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ *
+ * Copy data into pagecache pages attached to the inode specified by @iocb.
+ * The caller must hold appropriate inode locks.
+ *
+ * Dirty pages are tagged with a netfs_folio struct if they're not up to date
+ * to indicate the range modified. Dirty pages may also be tagged with a
+ * netfs-specific grouping such that data from an old group gets flushed before
+ * a new one is started.
+ */
+ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
+ struct netfs_group *netfs_group)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct netfs_inode *ctx = netfs_inode(inode);
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .for_sync = true,
+ .nr_to_write = LONG_MAX,
+ .range_start = iocb->ki_pos,
+ .range_end = iocb->ki_pos + iter->count,
+ };
+ struct netfs_io_request *wreq = NULL;
+ struct netfs_folio *finfo;
+ struct folio *folio;
+ enum netfs_how_to_modify howto;
+ enum netfs_folio_trace trace;
+ unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
+ ssize_t written = 0, ret;
+ loff_t i_size, pos = iocb->ki_pos, from, to;
+ size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
+ bool maybe_trouble = false;
+
+ if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
+ iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
+ ) {
+ if (pos < i_size_read(inode)) {
+ ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
+ if (ret < 0) {
+ goto out;
+ }
+ }
+
+ wbc_attach_fdatawrite_inode(&wbc, mapping->host);
+
+ wreq = netfs_begin_writethrough(iocb, iter->count);
+ if (IS_ERR(wreq)) {
+ wbc_detach_inode(&wbc);
+ ret = PTR_ERR(wreq);
+ wreq = NULL;
+ goto out;
+ }
+ if (!is_sync_kiocb(iocb))
+ wreq->iocb = iocb;
+ wreq->cleanup = netfs_cleanup_buffered_write;
+ }
+
+ do {
+ size_t flen;
+ size_t offset; /* Offset into pagecache folio */
+ size_t part; /* Bytes to write to folio */
+ size_t copied; /* Bytes copied from user */
+
+ ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
+ if (unlikely(ret < 0))
+ break;
+
+ offset = pos & (max_chunk - 1);
+ part = min(max_chunk - offset, iov_iter_count(iter));
+
+ /* Bring in the user pages that we will copy from _first_ lest
+ * we hit a nasty deadlock on copying from the same page as
+ * we're writing to, without it being marked uptodate.
+ *
+ * Not only is this an optimisation, but it is also required to
+ * check that the address is actually valid, when atomic
+ * usercopies are used below.
+ *
+ * We rely on the page being held onto long enough by the LRU
+ * that we can grab it below if this causes it to be read.
+ */
+ ret = -EFAULT;
+ if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
+ break;
+
+ folio = netfs_grab_folio_for_write(mapping, pos, part);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ break;
+ }
+
+ flen = folio_size(folio);
+ offset = pos & (flen - 1);
+ part = min_t(size_t, flen - offset, part);
+
+ if (signal_pending(current)) {
+ ret = written ? -EINTR : -ERESTARTSYS;
+ goto error_folio_unlock;
+ }
+
+ /* See if we need to prefetch the area we're going to modify.
+ * We need to do this before we get a lock on the folio in case
+ * there's more than one writer competing for the same cache
+ * block.
+ */
+ howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
+ flen, offset, part, maybe_trouble);
+ _debug("howto %u", howto);
+ switch (howto) {
+ case NETFS_JUST_PREFETCH:
+ ret = netfs_prefetch_for_write(file, folio, offset, part);
+ if (ret < 0) {
+ _debug("prefetch = %zd", ret);
+ goto error_folio_unlock;
+ }
+ break;
+ case NETFS_FOLIO_IS_UPTODATE:
+ case NETFS_WHOLE_FOLIO_MODIFY:
+ case NETFS_STREAMING_WRITE_CONT:
+ break;
+ case NETFS_MODIFY_AND_CLEAR:
+ zero_user_segment(&folio->page, 0, offset);
+ break;
+ case NETFS_STREAMING_WRITE:
+ ret = -EIO;
+ if (WARN_ON(folio_get_private(folio)))
+ goto error_folio_unlock;
+ break;
+ case NETFS_FLUSH_CONTENT:
+ trace_netfs_folio(folio, netfs_flush_content);
+ from = folio_pos(folio);
+ to = from + folio_size(folio) - 1;
+ folio_unlock(folio);
+ folio_put(folio);
+ ret = filemap_write_and_wait_range(mapping, from, to);
+ if (ret < 0)
+ goto error_folio_unlock;
+ continue;
+ }
+
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_folio(folio);
+
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+
+ flush_dcache_folio(folio);
+
+ /* Deal with a (partially) failed copy */
+ if (copied == 0) {
+ ret = -EFAULT;
+ goto error_folio_unlock;
+ }
+
+ trace = (enum netfs_folio_trace)howto;
+ switch (howto) {
+ case NETFS_FOLIO_IS_UPTODATE:
+ case NETFS_JUST_PREFETCH:
+ netfs_set_group(folio, netfs_group);
+ break;
+ case NETFS_MODIFY_AND_CLEAR:
+ zero_user_segment(&folio->page, offset + copied, flen);
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ break;
+ case NETFS_WHOLE_FOLIO_MODIFY:
+ if (unlikely(copied < part)) {
+ maybe_trouble = true;
+ iov_iter_revert(iter, copied);
+ copied = 0;
+ goto retry;
+ }
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ break;
+ case NETFS_STREAMING_WRITE:
+ if (offset == 0 && copied == flen) {
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ trace = netfs_streaming_filled_page;
+ break;
+ }
+ finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
+ if (!finfo) {
+ iov_iter_revert(iter, copied);
+ ret = -ENOMEM;
+ goto error_folio_unlock;
+ }
+ finfo->netfs_group = netfs_get_group(netfs_group);
+ finfo->dirty_offset = offset;
+ finfo->dirty_len = copied;
+ folio_attach_private(folio, (void *)((unsigned long)finfo |
+ NETFS_FOLIO_INFO));
+ break;
+ case NETFS_STREAMING_WRITE_CONT:
+ finfo = netfs_folio_info(folio);
+ finfo->dirty_len += copied;
+ if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
+ if (finfo->netfs_group)
+ folio_change_private(folio, finfo->netfs_group);
+ else
+ folio_detach_private(folio);
+ folio_mark_uptodate(folio);
+ kfree(finfo);
+ trace = netfs_streaming_cont_filled_page;
+ }
+ break;
+ default:
+ WARN(true, "Unexpected modify type %u ix=%lx\n",
+ howto, folio->index);
+ ret = -EIO;
+ goto error_folio_unlock;
+ }
+
+ trace_netfs_folio(folio, trace);
+
+ /* Update the inode size if we moved the EOF marker */
+ i_size = i_size_read(inode);
+ pos += copied;
+ if (pos > i_size) {
+ if (ctx->ops->update_i_size) {
+ ctx->ops->update_i_size(inode, pos);
+ } else {
+ i_size_write(inode, pos);
+#if IS_ENABLED(CONFIG_FSCACHE)
+ fscache_update_cookie(ctx->cache, NULL, &pos);
+#endif
+ }
+ }
+ written += copied;
+
+ if (likely(!wreq)) {
+ folio_mark_dirty(folio);
+ } else {
+ if (folio_test_dirty(folio))
+ /* Sigh. mmap. */
+ folio_clear_dirty_for_io(folio);
+ /* We make multiple writes to the folio... */
+ if (!folio_test_writeback(folio)) {
+ folio_wait_fscache(folio);
+ folio_start_writeback(folio);
+ folio_start_fscache(folio);
+ if (wreq->iter.count == 0)
+ trace_netfs_folio(folio, netfs_folio_trace_wthru);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
+ }
+ netfs_advance_writethrough(wreq, copied,
+ offset + copied == flen);
+ }
+ retry:
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
+
+ cond_resched();
+ } while (iov_iter_count(iter));
+
+out:
+ if (unlikely(wreq)) {
+ ret = netfs_end_writethrough(wreq, iocb);
+ wbc_detach_inode(&wbc);
+ if (ret == -EIOCBQUEUED)
+ return ret;
+ }
+
+ iocb->ki_pos += written;
+ _leave(" = %zd [%zd]", written, ret);
+ return written ? written : ret;
+
+error_folio_unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out;
+}
+EXPORT_SYMBOL(netfs_perform_write);
+
+/**
+ * netfs_buffered_write_iter_locked - write data to a file
+ * @iocb: IO state structure (file, offset, etc.)
+ * @from: iov_iter with data to write
+ * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ *
+ * This function does all the work needed for actually writing data to a
+ * file. It does all basic checks, removes SUID from the file, updates
+ * modification times and calls proper subroutines depending on whether we
+ * do direct IO or a standard buffered write.
+ *
+ * The caller must hold appropriate locks around this function and have called
+ * generic_write_checks() already. The caller is also responsible for doing
+ * any necessary syncing afterwards.
+ *
+ * This function does *not* take care of syncing data in case of O_SYNC write.
+ * A caller has to handle it. This is mainly due to the fact that we want to
+ * avoid syncing under i_rwsem.
+ *
+ * Return:
+ * * number of bytes written, even for truncated writes
+ * * negative error code if no data has been written at all
+ */
+ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
+ struct netfs_group *netfs_group)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret;
+
+ trace_netfs_write_iter(iocb, from);
+
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
+
+ ret = file_update_time(file);
+ if (ret)
+ return ret;
+
+ return netfs_perform_write(iocb, from, netfs_group);
+}
+EXPORT_SYMBOL(netfs_buffered_write_iter_locked);
+
+/**
+ * netfs_file_write_iter - write data to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Perform a write to a file, writing into the pagecache if possible and doing
+ * an unbuffered write instead if not.
+ *
+ * Return:
+ * * Negative error code if no data has been written at all of
+ * vfs_fsync_range() failed for a synchronous write
+ * * Number of bytes written, even for truncated writes
+ */
+ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ ssize_t ret;
+
+ _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+
+ if ((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
+ return netfs_unbuffered_write_iter(iocb, from);
+
+ ret = netfs_start_io_write(inode);
+ if (ret < 0)
+ return ret;
+
+ ret = generic_write_checks(iocb, from);
+ if (ret > 0)
+ ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
+ netfs_end_io_write(inode);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_file_write_iter);
+
+/*
+ * Notification that a previously read-only page is about to become writable.
+ * Note that the caller indicates a single page of a multipage folio.
+ */
+vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
+{
+ struct folio *folio = page_folio(vmf->page);
+ struct file *file = vmf->vma->vm_file;
+ struct inode *inode = file_inode(file);
+ vm_fault_t ret = VM_FAULT_RETRY;
+ int err;
+
+ _enter("%lx", folio->index);
+
+ sb_start_pagefault(inode->i_sb);
+
+ if (folio_wait_writeback_killable(folio))
+ goto out;
+
+ if (folio_lock_killable(folio) < 0)
+ goto out;
+
+ /* Can we see a streaming write here? */
+ if (WARN_ON(!folio_test_uptodate(folio))) {
+ ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
+ goto out;
+ }
+
+ if (netfs_folio_group(folio) != netfs_group) {
+ folio_unlock(folio);
+ err = filemap_fdatawait_range(inode->i_mapping,
+ folio_pos(folio),
+ folio_pos(folio) + folio_size(folio));
+ switch (err) {
+ case 0:
+ ret = VM_FAULT_RETRY;
+ goto out;
+ case -ENOMEM:
+ ret = VM_FAULT_OOM;
+ goto out;
+ default:
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ }
+
+ if (folio_test_dirty(folio))
+ trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
+ netfs_set_group(folio, netfs_group);
+ file_update_time(file);
+ ret = VM_FAULT_LOCKED;
+out:
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_page_mkwrite);
+
+/*
+ * Kill all the pages in the given range
+ */
+static void netfs_kill_pages(struct address_space *mapping,
+ loff_t start, loff_t len)
+{
+ struct folio *folio;
+ pgoff_t index = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
+
+ _enter("%llx-%llx", start, start + len - 1);
+
+ do {
+ _debug("kill %lx (to %lx)", index, last);
+
+ folio = filemap_get_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ next = index + 1;
+ continue;
+ }
+
+ next = folio_next_index(folio);
+
+ trace_netfs_folio(folio, netfs_folio_trace_kill);
+ folio_clear_uptodate(folio);
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ folio_end_writeback(folio);
+ folio_lock(folio);
+ generic_error_remove_folio(mapping, folio);
+ folio_unlock(folio);
+ folio_put(folio);
+
+ } while (index = next, index <= last);
+
+ _leave("");
+}
+
+/*
+ * Redirty all the pages in a given range.
+ */
+static void netfs_redirty_pages(struct address_space *mapping,
+ loff_t start, loff_t len)
+{
+ struct folio *folio;
+ pgoff_t index = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
+
+ _enter("%llx-%llx", start, start + len - 1);
+
+ do {
+ _debug("redirty %llx @%llx", len, start);
+
+ folio = filemap_get_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ next = index + 1;
+ continue;
+ }
+
+ next = folio_next_index(folio);
+ trace_netfs_folio(folio, netfs_folio_trace_redirty);
+ filemap_dirty_folio(mapping, folio);
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ folio_end_writeback(folio);
+ folio_put(folio);
+ } while (index = next, index <= last);
+
+ balance_dirty_pages_ratelimited(mapping);
+
+ _leave("");
+}
+
+/*
+ * Completion of write to server
+ */
+static void netfs_pages_written_back(struct netfs_io_request *wreq)
+{
+ struct address_space *mapping = wreq->mapping;
+ struct netfs_folio *finfo;
+ struct netfs_group *group = NULL;
+ struct folio *folio;
+ pgoff_t last;
+ int gcount = 0;
+
+ XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);
+
+ _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
+
+ rcu_read_lock();
+
+ last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
+ xas_for_each(&xas, folio, last) {
+ WARN(!folio_test_writeback(folio),
+ "bad %zx @%llx page %lx %lx\n",
+ wreq->len, wreq->start, folio->index, last);
+
+ if ((finfo = netfs_folio_info(folio))) {
+ /* Streaming writes cannot be redirtied whilst under
+ * writeback, so discard the streaming record.
+ */
+ folio_detach_private(folio);
+ group = finfo->netfs_group;
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_s);
+ kfree(finfo);
+ } else if ((group = netfs_folio_group(folio))) {
+ /* Need to detach the group pointer if the page didn't
+ * get redirtied. If it has been redirtied, then it
+ * must be within the same group.
+ */
+ if (folio_test_dirty(folio)) {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ goto end_wb;
+ }
+ if (folio_trylock(folio)) {
+ if (!folio_test_dirty(folio)) {
+ folio_detach_private(folio);
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_g);
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ }
+ folio_unlock(folio);
+ goto end_wb;
+ }
+
+ xas_pause(&xas);
+ rcu_read_unlock();
+ folio_lock(folio);
+ if (!folio_test_dirty(folio)) {
+ folio_detach_private(folio);
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_g);
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ }
+ folio_unlock(folio);
+ rcu_read_lock();
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_clear);
+ }
+ end_wb:
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ xas_advance(&xas, folio_next_index(folio) - 1);
+ folio_end_writeback(folio);
+ }
+
+ rcu_read_unlock();
+ netfs_put_group_many(group, gcount);
+ _leave("");
+}
+
+/*
+ * Deal with the disposition of the folios that are under writeback to close
+ * out the operation.
+ */
+static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
+{
+ struct address_space *mapping = wreq->mapping;
+
+ _enter("");
+
+ switch (wreq->error) {
+ case 0:
+ netfs_pages_written_back(wreq);
+ break;
+
+ default:
+ pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
+ fallthrough;
+ case -EACCES:
+ case -EPERM:
+ case -ENOKEY:
+ case -EKEYEXPIRED:
+ case -EKEYREJECTED:
+ case -EKEYREVOKED:
+ case -ENETRESET:
+ case -EDQUOT:
+ case -ENOSPC:
+ netfs_redirty_pages(mapping, wreq->start, wreq->len);
+ break;
+
+ case -EROFS:
+ case -EIO:
+ case -EREMOTEIO:
+ case -EFBIG:
+ case -ENOENT:
+ case -ENOMEDIUM:
+ case -ENXIO:
+ netfs_kill_pages(mapping, wreq->start, wreq->len);
+ break;
+ }
+
+ if (wreq->error)
+ mapping_set_error(mapping, wreq->error);
+ if (wreq->netfs_ops->done)
+ wreq->netfs_ops->done(wreq);
+}
+
+/*
+ * Extend the region to be written back to include subsequent contiguously
+ * dirty pages if possible, but don't sleep while doing so.
+ *
+ * If this page holds new content, then we can include filler zeros in the
+ * writeback.
+ */
+static void netfs_extend_writeback(struct address_space *mapping,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ long *_count,
+ loff_t start,
+ loff_t max_len,
+ bool caching,
+ size_t *_len,
+ size_t *_top)
+{
+ struct netfs_folio *finfo;
+ struct folio_batch fbatch;
+ struct folio *folio;
+ unsigned int i;
+ pgoff_t index = (start + *_len) / PAGE_SIZE;
+ size_t len;
+ void *priv;
+ bool stop = true;
+
+ folio_batch_init(&fbatch);
+
+ do {
+ /* Firstly, we gather up a batch of contiguous dirty pages
+ * under the RCU read lock - but we can't clear the dirty flags
+ * there if any of those pages are mapped.
+ */
+ rcu_read_lock();
+
+ xas_for_each(xas, folio, ULONG_MAX) {
+ stop = true;
+ if (xas_retry(xas, folio))
+ continue;
+ if (xa_is_value(folio))
+ break;
+ if (folio->index != index) {
+ xas_reset(xas);
+ break;
+ }
+
+ if (!folio_try_get_rcu(folio)) {
+ xas_reset(xas);
+ continue;
+ }
+
+ /* Has the folio moved or been split? */
+ if (unlikely(folio != xas_reload(xas))) {
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+ if (!folio_test_dirty(folio) ||
+ folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+
+ stop = false;
+ len = folio_size(folio);
+ priv = folio_get_private(folio);
+ if ((const struct netfs_group *)priv != group) {
+ stop = true;
+ finfo = netfs_folio_info(folio);
+ if (finfo->netfs_group != group ||
+ finfo->dirty_offset > 0) {
+ folio_unlock(folio);
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+ len = finfo->dirty_len;
+ }
+
+ *_top += folio_size(folio);
+ index += folio_nr_pages(folio);
+ *_count -= folio_nr_pages(folio);
+ *_len += len;
+ if (*_len >= max_len || *_count <= 0)
+ stop = true;
+
+ if (!folio_batch_add(&fbatch, folio))
+ break;
+ if (stop)
+ break;
+ }
+
+ xas_pause(xas);
+ rcu_read_unlock();
+
+ /* Now, if we obtained any folios, we can shift them to being
+ * writable and mark them for caching.
+ */
+ if (!folio_batch_count(&fbatch))
+ break;
+
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ folio = fbatch.folios[i];
+ trace_netfs_folio(folio, netfs_folio_trace_store_plus);
+
+ if (!folio_clear_dirty_for_io(folio))
+ BUG();
+ folio_start_writeback(folio);
+ netfs_folio_start_fscache(caching, folio);
+ folio_unlock(folio);
+ }
+
+ folio_batch_release(&fbatch);
+ cond_resched();
+ } while (!stop);
+}
+
+/*
+ * Synchronously write back the locked page and any subsequent non-locked dirty
+ * pages.
+ */
+static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ struct folio *folio,
+ unsigned long long start,
+ unsigned long long end)
+{
+ struct netfs_io_request *wreq;
+ struct netfs_folio *finfo;
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
+ unsigned long long i_size = i_size_read(&ctx->inode);
+ size_t len, max_len;
+ bool caching = netfs_is_cache_enabled(ctx);
+ long count = wbc->nr_to_write;
+ int ret;
+
+ _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching);
+
+ wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
+ NETFS_WRITEBACK);
+ if (IS_ERR(wreq)) {
+ folio_unlock(folio);
+ return PTR_ERR(wreq);
+ }
+
+ if (!folio_clear_dirty_for_io(folio))
+ BUG();
+ folio_start_writeback(folio);
+ netfs_folio_start_fscache(caching, folio);
+
+ count -= folio_nr_pages(folio);
+
+ /* Find all consecutive lockable dirty pages that have contiguous
+ * written regions, stopping when we find a page that is not
+ * immediately lockable, is not dirty or is missing, or we reach the
+ * end of the range.
+ */
+ trace_netfs_folio(folio, netfs_folio_trace_store);
+
+ len = wreq->len;
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ start += finfo->dirty_offset;
+ if (finfo->dirty_offset + finfo->dirty_len != len) {
+ len = finfo->dirty_len;
+ goto cant_expand;
+ }
+ len = finfo->dirty_len;
+ }
+
+ if (start < i_size) {
+ /* Trim the write to the EOF; the extra data is ignored. Also
+ * put an upper limit on the size of a single storedata op.
+ */
+ max_len = 65536 * 4096;
+ max_len = min_t(unsigned long long, max_len, end - start + 1);
+ max_len = min_t(unsigned long long, max_len, i_size - start);
+
+ if (len < max_len)
+ netfs_extend_writeback(mapping, group, xas, &count, start,
+ max_len, caching, &len, &wreq->upper_len);
+ }
+
+cant_expand:
+ len = min_t(unsigned long long, len, i_size - start);
+
+ /* We now have a contiguous set of dirty pages, each with writeback
+ * set; the first page is still locked at this point, but all the rest
+ * have been unlocked.
+ */
+ folio_unlock(folio);
+ wreq->start = start;
+ wreq->len = len;
+
+ if (start < i_size) {
+ _debug("write back %zx @%llx [%llx]", len, start, i_size);
+
+ /* Speculatively write to the cache. We have to fix this up
+ * later if the store fails.
+ */
+ wreq->cleanup = netfs_cleanup_buffered_write;
+
+ iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
+ wreq->upper_len);
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
+ if (ret == 0 || ret == -EIOCBQUEUED)
+ wbc->nr_to_write -= len / PAGE_SIZE;
+ } else {
+ _debug("write discard %zx @%llx [%llx]", len, start, i_size);
+
+ /* The dirty region was entirely beyond the EOF. */
+ fscache_clear_page_bits(mapping, start, len, caching);
+ netfs_pages_written_back(wreq);
+ ret = 0;
+ }
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ _leave(" = 1");
+ return 1;
+}
+
+/*
+ * Write a region of pages back to the server
+ */
+static ssize_t netfs_writepages_begin(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ unsigned long long *_start,
+ unsigned long long end)
+{
+ const struct netfs_folio *finfo;
+ struct folio *folio;
+ unsigned long long start = *_start;
+ ssize_t ret;
+ void *priv;
+ int skips = 0;
+
+ _enter("%llx,%llx,", start, end);
+
+search_again:
+ /* Find the first dirty page in the group. */
+ rcu_read_lock();
+
+ for (;;) {
+ folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
+ if (xas_retry(xas, folio) || xa_is_value(folio))
+ continue;
+ if (!folio)
+ break;
+
+ if (!folio_try_get_rcu(folio)) {
+ xas_reset(xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(xas))) {
+ folio_put(folio);
+ xas_reset(xas);
+ continue;
+ }
+
+ /* Skip any dirty folio that's not in the group of interest. */
+ priv = folio_get_private(folio);
+ if ((const struct netfs_group *)priv != group) {
+ finfo = netfs_folio_info(folio);
+ if (finfo->netfs_group != group) {
+ folio_put(folio);
+ continue;
+ }
+ }
+
+ xas_pause(xas);
+ break;
+ }
+ rcu_read_unlock();
+ if (!folio)
+ return 0;
+
+ start = folio_pos(folio); /* May regress with THPs */
+
+ _debug("wback %lx", folio->index);
+
+ /* At this point we hold neither the i_pages lock nor the page lock:
+ * the page may be truncated or invalidated (changing page->mapping to
+ * NULL), or even swizzled back from swapper_space to tmpfs file
+ * mapping
+ */
+lock_again:
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ ret = folio_lock_killable(folio);
+ if (ret < 0)
+ return ret;
+ } else {
+ if (!folio_trylock(folio))
+ goto search_again;
+ }
+
+ if (folio->mapping != mapping ||
+ !folio_test_dirty(folio)) {
+ start += folio_size(folio);
+ folio_unlock(folio);
+ goto search_again;
+ }
+
+ if (folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
+ folio_unlock(folio);
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ folio_wait_writeback(folio);
+#ifdef CONFIG_FSCACHE
+ folio_wait_fscache(folio);
+#endif
+ goto lock_again;
+ }
+
+ start += folio_size(folio);
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ if (skips >= 5 || need_resched()) {
+ ret = 0;
+ goto out;
+ }
+ skips++;
+ }
+ goto search_again;
+ }
+
+ ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
+ folio, start, end);
+out:
+ if (ret > 0)
+ *_start = start + ret;
+ _leave(" = %zd [%llx]", ret, *_start);
+ return ret;
+}
+
+/*
+ * Write a region of pages back to the server
+ */
+static int netfs_writepages_region(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ unsigned long long *_start,
+ unsigned long long end)
+{
+ ssize_t ret;
+
+ XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
+
+ do {
+ ret = netfs_writepages_begin(mapping, wbc, group, &xas,
+ _start, end);
+ if (ret > 0 && wbc->nr_to_write > 0)
+ cond_resched();
+ } while (ret > 0 && wbc->nr_to_write > 0);
+
+ return ret > 0 ? 0 : ret;
+}
+
+/*
+ * write some of the pending data back to the server
+ */
+int netfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct netfs_group *group = NULL;
+ loff_t start, end;
+ int ret;
+
+ _enter("");
+
+ /* We have to be careful as we can end up racing with setattr()
+ * truncating the pagecache since the caller doesn't take a lock here
+ * to prevent it.
+ */
+
+ if (wbc->range_cyclic && mapping->writeback_index) {
+ start = mapping->writeback_index * PAGE_SIZE;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, LLONG_MAX);
+ if (ret < 0)
+ goto out;
+
+ if (wbc->nr_to_write <= 0) {
+ mapping->writeback_index = start / PAGE_SIZE;
+ goto out;
+ }
+
+ start = 0;
+ end = mapping->writeback_index * PAGE_SIZE;
+ mapping->writeback_index = 0;
+ ret = netfs_writepages_region(mapping, wbc, group, &start, end);
+ if (ret == 0)
+ mapping->writeback_index = start / PAGE_SIZE;
+ } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
+ start = 0;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, LLONG_MAX);
+ if (wbc->nr_to_write > 0 && ret == 0)
+ mapping->writeback_index = start / PAGE_SIZE;
+ } else {
+ start = wbc->range_start;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, wbc->range_end);
+ }
+
+out:
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_writepages);
+
+/*
+ * Deal with the disposition of a laundered folio.
+ */
+static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
+{
+ if (wreq->error) {
+ pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
+ mapping_set_error(wreq->mapping, wreq->error);
+ }
+}
+
+/**
+ * netfs_launder_folio - Clean up a dirty folio that's being invalidated
+ * @folio: The folio to clean
+ *
+ * This is called to write back a folio that's being invalidated when an inode
+ * is getting torn down. Ideally, writepages would be used instead.
+ */
+int netfs_launder_folio(struct folio *folio)
+{
+ struct netfs_io_request *wreq;
+ struct address_space *mapping = folio->mapping;
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct netfs_group *group = netfs_folio_group(folio);
+ struct bio_vec bvec;
+ unsigned long long i_size = i_size_read(mapping->host);
+ unsigned long long start = folio_pos(folio);
+ size_t offset = 0, len;
+ int ret = 0;
+
+ if (finfo) {
+ offset = finfo->dirty_offset;
+ start += offset;
+ len = finfo->dirty_len;
+ } else {
+ len = folio_size(folio);
+ }
+ len = min_t(unsigned long long, len, i_size - start);
+
+ wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
+ if (IS_ERR(wreq)) {
+ ret = PTR_ERR(wreq);
+ goto out;
+ }
+
+ if (!folio_clear_dirty_for_io(folio))
+ goto out_put;
+
+ trace_netfs_folio(folio, netfs_folio_trace_launder);
+
+ _debug("launder %llx-%llx", start, start + len - 1);
+
+ /* Speculatively write to the cache. We have to fix this up later if
+ * the store fails.
+ */
+ wreq->cleanup = netfs_cleanup_launder_folio;
+
+ bvec_set_folio(&bvec, folio, len, offset);
+ iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);
+
+out_put:
+ folio_detach_private(folio);
+ netfs_put_group(group);
+ kfree(finfo);
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+out:
+ folio_wait_fscache(folio);
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_launder_folio);
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
new file mode 100644
index 000000000000..ad4370b3935d
--- /dev/null
+++ b/fs/netfs/direct_read.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Direct I/O support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/**
+ * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read
+ * @iocb: The I/O control descriptor describing the read
+ * @iter: The output buffer (also specifies read length)
+ *
+ * Perform an unbuffered I/O or direct I/O from the file in @iocb to the
+ * output buffer. No use is made of the pagecache.
+ *
+ * The caller must hold any appropriate locks.
+ */
+static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct netfs_io_request *rreq;
+ ssize_t ret;
+ size_t orig_count = iov_iter_count(iter);
+ bool async = !is_sync_kiocb(iocb);
+
+ _enter("");
+
+ if (!orig_count)
+ return 0; /* Don't update atime */
+
+ ret = kiocb_write_and_wait(iocb, orig_count);
+ if (ret < 0)
+ return ret;
+ file_accessed(iocb->ki_filp);
+
+ rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
+ iocb->ki_pos, orig_count,
+ NETFS_DIO_READ);
+ if (IS_ERR(rreq))
+ return PTR_ERR(rreq);
+
+ netfs_stat(&netfs_n_rh_dio_read);
+ trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read);
+
+ /* If this is an async op, we have to keep track of the destination
+ * buffer for ourselves as the caller's iterator will be trashed when
+ * we return.
+ *
+ * In such a case, extract an iterator to represent as much of the the
+ * output buffer as we can manage. Note that the extraction might not
+ * be able to allocate a sufficiently large bvec array and may shorten
+ * the request.
+ */
+ if (user_backed_iter(iter)) {
+ ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0);
+ if (ret < 0)
+ goto out;
+ rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec;
+ rreq->direct_bv_count = ret;
+ rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
+ rreq->len = iov_iter_count(&rreq->iter);
+ } else {
+ rreq->iter = *iter;
+ rreq->len = orig_count;
+ rreq->direct_bv_unpin = false;
+ iov_iter_advance(iter, orig_count);
+ }
+
+ // TODO: Set up bounce buffer if needed
+
+ if (async)
+ rreq->iocb = iocb;
+
+ ret = netfs_begin_read(rreq, is_sync_kiocb(iocb));
+ if (ret < 0)
+ goto out; /* May be -EIOCBQUEUED */
+ if (!async) {
+ // TODO: Copy from bounce buffer
+ iocb->ki_pos += rreq->transferred;
+ ret = rreq->transferred;
+ }
+
+out:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ if (ret > 0)
+ orig_count -= ret;
+ if (ret != -EIOCBQUEUED)
+ iov_iter_revert(iter, orig_count - iov_iter_count(iter));
+ return ret;
+}
+
+/**
+ * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read
+ * @iocb: The I/O control descriptor describing the read
+ * @iter: The output buffer (also specifies read length)
+ *
+ * Perform an unbuffered I/O or direct I/O from the file in @iocb to the
+ * output buffer. No use is made of the pagecache.
+ */
+ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (!iter->count)
+ return 0; /* Don't update atime */
+
+ ret = netfs_start_io_direct(inode);
+ if (ret == 0) {
+ ret = netfs_unbuffered_read_iter_locked(iocb, iter);
+ netfs_end_io_direct(inode);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(netfs_unbuffered_read_iter);
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
new file mode 100644
index 000000000000..60a40d293c87
--- /dev/null
+++ b/fs/netfs/direct_write.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Unbuffered and direct write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/uio.h>
+#include "internal.h"
+
+static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
+{
+ struct inode *inode = wreq->inode;
+ unsigned long long end = wreq->start + wreq->len;
+
+ if (!wreq->error &&
+ i_size_read(inode) < end) {
+ if (wreq->netfs_ops->update_i_size)
+ wreq->netfs_ops->update_i_size(inode, end);
+ else
+ i_size_write(inode, end);
+ }
+}
+
+/*
+ * Perform an unbuffered write where we may have to do an RMW operation on an
+ * encrypted file. This can also be used for direct I/O writes.
+ */
+static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
+ struct netfs_group *netfs_group)
+{
+ struct netfs_io_request *wreq;
+ unsigned long long start = iocb->ki_pos;
+ unsigned long long end = start + iov_iter_count(iter);
+ ssize_t ret, n;
+ bool async = !is_sync_kiocb(iocb);
+
+ _enter("");
+
+ /* We're going to need a bounce buffer if what we transmit is going to
+ * be different in some way to the source buffer, e.g. because it gets
+ * encrypted/compressed or because it needs expanding to a block size.
+ */
+ // TODO
+
+ _debug("uw %llx-%llx", start, end);
+
+ wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
+ start, end - start,
+ iocb->ki_flags & IOCB_DIRECT ?
+ NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
+ if (IS_ERR(wreq))
+ return PTR_ERR(wreq);
+
+ {
+ /* If this is an async op and we're not using a bounce buffer,
+ * we have to save the source buffer as the iterator is only
+ * good until we return. In such a case, extract an iterator
+ * to represent as much of the the output buffer as we can
+ * manage. Note that the extraction might not be able to
+ * allocate a sufficiently large bvec array and may shorten the
+ * request.
+ */
+ if (async || user_backed_iter(iter)) {
+ n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0);
+ if (n < 0) {
+ ret = n;
+ goto out;
+ }
+ wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec;
+ wreq->direct_bv_count = n;
+ wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
+ wreq->len = iov_iter_count(&wreq->iter);
+ } else {
+ wreq->iter = *iter;
+ }
+
+ wreq->io_iter = wreq->iter;
+ }
+
+ /* Copy the data into the bounce buffer and encrypt it. */
+ // TODO
+
+ /* Dispatch the write. */
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ if (async)
+ wreq->iocb = iocb;
+ wreq->cleanup = netfs_cleanup_dio_write;
+ ret = netfs_begin_write(wreq, is_sync_kiocb(iocb),
+ iocb->ki_flags & IOCB_DIRECT ?
+ netfs_write_trace_dio_write :
+ netfs_write_trace_unbuffered_write);
+ if (ret < 0) {
+ _debug("begin = %zd", ret);
+ goto out;
+ }
+
+ if (!async) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+
+ ret = wreq->error;
+ _debug("waited = %zd", ret);
+ if (ret == 0) {
+ ret = wreq->transferred;
+ iocb->ki_pos += ret;
+ }
+ } else {
+ ret = -EIOCBQUEUED;
+ }
+
+out:
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ return ret;
+}
+
+/**
+ * netfs_unbuffered_write_iter - Unbuffered write to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Do an unbuffered write to a file, writing the data directly to the server
+ * and not lodging the data in the pagecache.
+ *
+ * Return:
+ * * Negative error code if no data has been written at all of
+ * vfs_fsync_range() failed for a synchronous write
+ * * Number of bytes written, even for truncated writes
+ */
+ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ unsigned long long end;
+ ssize_t ret;
+
+ _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+
+ trace_netfs_write_iter(iocb, from);
+ netfs_stat(&netfs_n_rh_dio_write);
+
+ ret = netfs_start_io_direct(inode);
+ if (ret < 0)
+ return ret;
+ ret = generic_write_checks(iocb, from);
+ if (ret < 0)
+ goto out;
+ ret = file_remove_privs(file);
+ if (ret < 0)
+ goto out;
+ ret = file_update_time(file);
+ if (ret < 0)
+ goto out;
+ ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));
+ if (ret < 0)
+ goto out;
+ end = iocb->ki_pos + iov_iter_count(from);
+ if (end > ictx->zero_point)
+ ictx->zero_point = end;
+
+ fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),
+ FSCACHE_INVAL_DIO_WRITE);
+ ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL);
+out:
+ netfs_end_io_direct(inode);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_unbuffered_write_iter);
diff --git a/fs/fscache/cache.c b/fs/netfs/fscache_cache.c
index d645f8b302a2..9397ed39b0b4 100644
--- a/fs/fscache/cache.c
+++ b/fs/netfs/fscache_cache.c
@@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache);
void fscache_put_cache(struct fscache_cache *cache,
enum fscache_cache_trace where)
{
- unsigned int debug_id = cache->debug_id;
+ unsigned int debug_id;
bool zero;
int ref;
if (IS_ERR_OR_NULL(cache))
return;
+ debug_id = cache->debug_id;
zero = __refcount_dec_and_test(&cache->ref, &ref);
trace_fscache_cache(debug_id, ref - 1, where);
diff --git a/fs/fscache/cookie.c b/fs/netfs/fscache_cookie.c
index bce2492186d0..bce2492186d0 100644
--- a/fs/fscache/cookie.c
+++ b/fs/netfs/fscache_cookie.c
diff --git a/fs/netfs/fscache_internal.h b/fs/netfs/fscache_internal.h
new file mode 100644
index 000000000000..a09b948fcef2
--- /dev/null
+++ b/fs/netfs/fscache_internal.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Internal definitions for FS-Cache
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include "internal.h"
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) "FS-Cache: " fmt
diff --git a/fs/fscache/io.c b/fs/netfs/fscache_io.c
index 0d2b8dec8f82..ad572f7ee897 100644
--- a/fs/fscache/io.c
+++ b/fs/netfs/fscache_io.c
@@ -158,46 +158,6 @@ int __fscache_begin_write_operation(struct netfs_cache_resources *cres,
}
EXPORT_SYMBOL(__fscache_begin_write_operation);
-/**
- * fscache_dirty_folio - Mark folio dirty and pin a cache object for writeback
- * @mapping: The mapping the folio belongs to.
- * @folio: The folio being dirtied.
- * @cookie: The cookie referring to the cache object
- *
- * Set the dirty flag on a folio and pin an in-use cache object in memory
- * so that writeback can later write to it. This is intended
- * to be called from the filesystem's ->dirty_folio() method.
- *
- * Return: true if the dirty flag was set on the folio, false otherwise.
- */
-bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio,
- struct fscache_cookie *cookie)
-{
- struct inode *inode = mapping->host;
- bool need_use = false;
-
- _enter("");
-
- if (!filemap_dirty_folio(mapping, folio))
- return false;
- if (!fscache_cookie_valid(cookie))
- return true;
-
- if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
- spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
- inode->i_state |= I_PINNING_FSCACHE_WB;
- need_use = true;
- }
- spin_unlock(&inode->i_lock);
-
- if (need_use)
- fscache_use_cookie(cookie, true);
- }
- return true;
-}
-EXPORT_SYMBOL(fscache_dirty_folio);
-
struct fscache_write_request {
struct netfs_cache_resources cache_resources;
struct address_space *mapping;
@@ -277,7 +237,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
fscache_access_io_write) < 0)
goto abandon_free;
- ret = cres->ops->prepare_write(cres, &start, &len, i_size, false);
+ ret = cres->ops->prepare_write(cres, &start, &len, len, i_size, false);
if (ret < 0)
goto abandon_end;
diff --git a/fs/fscache/main.c b/fs/netfs/fscache_main.c
index dad85fd84f6f..42e98bb523e3 100644
--- a/fs/fscache/main.c
+++ b/fs/netfs/fscache_main.c
@@ -8,18 +8,9 @@
#define FSCACHE_DEBUG_LEVEL CACHE
#include <linux/module.h>
#include <linux/init.h>
-#define CREATE_TRACE_POINTS
#include "internal.h"
-
-MODULE_DESCRIPTION("FS Cache Manager");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
-
-unsigned fscache_debug;
-module_param_named(debug, fscache_debug, uint,
- S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(fscache_debug,
- "FS-Cache debugging mask");
+#define CREATE_TRACE_POINTS
+#include <trace/events/fscache.h>
EXPORT_TRACEPOINT_SYMBOL(fscache_access_cache);
EXPORT_TRACEPOINT_SYMBOL(fscache_access_volume);
@@ -71,7 +62,7 @@ unsigned int fscache_hash(unsigned int salt, const void *data, size_t len)
/*
* initialise the fs caching module
*/
-static int __init fscache_init(void)
+int __init fscache_init(void)
{
int ret = -ENOMEM;
@@ -92,7 +83,7 @@ static int __init fscache_init(void)
goto error_cookie_jar;
}
- pr_notice("Loaded\n");
+ pr_notice("FS-Cache loaded\n");
return 0;
error_cookie_jar:
@@ -103,19 +94,15 @@ error_wq:
return ret;
}
-fs_initcall(fscache_init);
-
/*
* clean up on module removal
*/
-static void __exit fscache_exit(void)
+void __exit fscache_exit(void)
{
_enter("");
kmem_cache_destroy(fscache_cookie_jar);
fscache_proc_cleanup();
destroy_workqueue(fscache_wq);
- pr_notice("Unloaded\n");
+ pr_notice("FS-Cache unloaded\n");
}
-
-module_exit(fscache_exit);
diff --git a/fs/fscache/proc.c b/fs/netfs/fscache_proc.c
index dc3b0e9c8cce..874d951bc390 100644
--- a/fs/fscache/proc.c
+++ b/fs/netfs/fscache_proc.c
@@ -12,41 +12,34 @@
#include "internal.h"
/*
- * initialise the /proc/fs/fscache/ directory
+ * Add files to /proc/fs/netfs/.
*/
int __init fscache_proc_init(void)
{
- if (!proc_mkdir("fs/fscache", NULL))
- goto error_dir;
+ if (!proc_symlink("fs/fscache", NULL, "netfs"))
+ goto error_sym;
- if (!proc_create_seq("fs/fscache/caches", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/caches", S_IFREG | 0444, NULL,
&fscache_caches_seq_ops))
goto error;
- if (!proc_create_seq("fs/fscache/volumes", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/volumes", S_IFREG | 0444, NULL,
&fscache_volumes_seq_ops))
goto error;
- if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/cookies", S_IFREG | 0444, NULL,
&fscache_cookies_seq_ops))
goto error;
-
-#ifdef CONFIG_FSCACHE_STATS
- if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL,
- fscache_stats_show))
- goto error;
-#endif
-
return 0;
error:
remove_proc_entry("fs/fscache", NULL);
-error_dir:
+error_sym:
return -ENOMEM;
}
/*
- * clean up the /proc/fs/fscache/ directory
+ * Clean up the /proc/fs/fscache symlink.
*/
void fscache_proc_cleanup(void)
{
diff --git a/fs/fscache/stats.c b/fs/netfs/fscache_stats.c
index fc94e5e79f1c..add21abdf713 100644
--- a/fs/fscache/stats.c
+++ b/fs/netfs/fscache_stats.c
@@ -48,13 +48,15 @@ atomic_t fscache_n_no_create_space;
EXPORT_SYMBOL(fscache_n_no_create_space);
atomic_t fscache_n_culled;
EXPORT_SYMBOL(fscache_n_culled);
+atomic_t fscache_n_dio_misfit;
+EXPORT_SYMBOL(fscache_n_dio_misfit);
/*
* display the general statistics
*/
-int fscache_stats_show(struct seq_file *m, void *v)
+int fscache_stats_show(struct seq_file *m)
{
- seq_puts(m, "FS-Cache statistics\n");
+ seq_puts(m, "-- FS-Cache statistics --\n");
seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n",
atomic_read(&fscache_n_cookies),
atomic_read(&fscache_n_volumes),
@@ -93,10 +95,9 @@ int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_no_create_space),
atomic_read(&fscache_n_culled));
- seq_printf(m, "IO : rd=%u wr=%u\n",
+ seq_printf(m, "IO : rd=%u wr=%u mis=%u\n",
atomic_read(&fscache_n_read),
- atomic_read(&fscache_n_write));
-
- netfs_stats_show(m);
+ atomic_read(&fscache_n_write),
+ atomic_read(&fscache_n_dio_misfit));
return 0;
}
diff --git a/fs/fscache/volume.c b/fs/netfs/fscache_volume.c
index cdf991bdd9de..cdf991bdd9de 100644
--- a/fs/fscache/volume.c
+++ b/fs/netfs/fscache_volume.c
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 43fac1b14e40..ec7045d24400 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -5,9 +5,13 @@
* Written by David Howells (dhowells@redhat.com)
*/
+#include <linux/slab.h>
+#include <linux/seq_file.h>
#include <linux/netfs.h>
#include <linux/fscache.h>
+#include <linux/fscache-cache.h>
#include <trace/events/netfs.h>
+#include <trace/events/fscache.h>
#ifdef pr_fmt
#undef pr_fmt
@@ -19,6 +23,8 @@
* buffered_read.c
*/
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq);
+int netfs_prefetch_for_write(struct file *file, struct folio *folio,
+ size_t offset, size_t len);
/*
* io.c
@@ -29,6 +35,41 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
* main.c
*/
extern unsigned int netfs_debug;
+extern struct list_head netfs_io_requests;
+extern spinlock_t netfs_proc_lock;
+
+#ifdef CONFIG_PROC_FS
+static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq)
+{
+ spin_lock(&netfs_proc_lock);
+ list_add_tail_rcu(&rreq->proc_link, &netfs_io_requests);
+ spin_unlock(&netfs_proc_lock);
+}
+static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq)
+{
+ if (!list_empty(&rreq->proc_link)) {
+ spin_lock(&netfs_proc_lock);
+ list_del_rcu(&rreq->proc_link);
+ spin_unlock(&netfs_proc_lock);
+ }
+}
+#else
+static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) {}
+static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
+#endif
+
+/*
+ * misc.c
+ */
+#define NETFS_FLAG_PUT_MARK BIT(0)
+#define NETFS_FLAG_PAGECACHE_MARK BIT(1)
+int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
+ struct folio *folio, unsigned int flags,
+ gfp_t gfp_mask);
+int netfs_add_folios_to_buffer(struct xarray *buffer,
+ struct address_space *mapping,
+ pgoff_t index, pgoff_t to, gfp_t gfp_mask);
+void netfs_clear_buffer(struct xarray *buffer);
/*
* objects.c
@@ -50,9 +91,20 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
}
/*
+ * output.c
+ */
+int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
+ enum netfs_write_trace what);
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
+int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end);
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb);
+
+/*
* stats.c
*/
#ifdef CONFIG_NETFS_STATS
+extern atomic_t netfs_n_rh_dio_read;
+extern atomic_t netfs_n_rh_dio_write;
extern atomic_t netfs_n_rh_readahead;
extern atomic_t netfs_n_rh_readpage;
extern atomic_t netfs_n_rh_rreq;
@@ -71,7 +123,15 @@ extern atomic_t netfs_n_rh_write_begin;
extern atomic_t netfs_n_rh_write_done;
extern atomic_t netfs_n_rh_write_failed;
extern atomic_t netfs_n_rh_write_zskip;
+extern atomic_t netfs_n_wh_wstream_conflict;
+extern atomic_t netfs_n_wh_upload;
+extern atomic_t netfs_n_wh_upload_done;
+extern atomic_t netfs_n_wh_upload_failed;
+extern atomic_t netfs_n_wh_write;
+extern atomic_t netfs_n_wh_write_done;
+extern atomic_t netfs_n_wh_write_failed;
+int netfs_stats_show(struct seq_file *m, void *v);
static inline void netfs_stat(atomic_t *stat)
{
@@ -103,6 +163,176 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
#endif
}
+/*
+ * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
+{
+ if (netfs_group)
+ refcount_inc(&netfs_group->ref);
+ return netfs_group;
+}
+
+/*
+ * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline void netfs_put_group(struct netfs_group *netfs_group)
+{
+ if (netfs_group && refcount_dec_and_test(&netfs_group->ref))
+ netfs_group->free(netfs_group);
+}
+
+/*
+ * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
+{
+ if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref))
+ netfs_group->free(netfs_group);
+}
+
+/*
+ * fscache-cache.c
+ */
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_caches_seq_ops;
+#endif
+bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
+void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
+struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache);
+void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where);
+
+static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache)
+{
+ return smp_load_acquire(&cache->state);
+}
+
+static inline bool fscache_cache_is_live(const struct fscache_cache *cache)
+{
+ return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE;
+}
+
+static inline void fscache_set_cache_state(struct fscache_cache *cache,
+ enum fscache_cache_state new_state)
+{
+ smp_store_release(&cache->state, new_state);
+
+}
+
+static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache,
+ enum fscache_cache_state old_state,
+ enum fscache_cache_state new_state)
+{
+ return try_cmpxchg_release(&cache->state, &old_state, new_state);
+}
+
+/*
+ * fscache-cookie.c
+ */
+extern struct kmem_cache *fscache_cookie_jar;
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_cookies_seq_ops;
+#endif
+extern struct timer_list fscache_cookie_lru_timer;
+
+extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
+extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
+ enum fscache_access_trace why);
+
+static inline void fscache_see_cookie(struct fscache_cookie *cookie,
+ enum fscache_cookie_trace where)
+{
+ trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref),
+ where);
+}
+
+/*
+ * fscache-main.c
+ */
+extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len);
+#ifdef CONFIG_FSCACHE
+int __init fscache_init(void);
+void __exit fscache_exit(void);
+#else
+static inline int fscache_init(void) { return 0; }
+static inline void fscache_exit(void) {}
+#endif
+
+/*
+ * fscache-proc.c
+ */
+#ifdef CONFIG_PROC_FS
+extern int __init fscache_proc_init(void);
+extern void fscache_proc_cleanup(void);
+#else
+#define fscache_proc_init() (0)
+#define fscache_proc_cleanup() do {} while (0)
+#endif
+
+/*
+ * fscache-stats.c
+ */
+#ifdef CONFIG_FSCACHE_STATS
+extern atomic_t fscache_n_volumes;
+extern atomic_t fscache_n_volumes_collision;
+extern atomic_t fscache_n_volumes_nomem;
+extern atomic_t fscache_n_cookies;
+extern atomic_t fscache_n_cookies_lru;
+extern atomic_t fscache_n_cookies_lru_expired;
+extern atomic_t fscache_n_cookies_lru_removed;
+extern atomic_t fscache_n_cookies_lru_dropped;
+
+extern atomic_t fscache_n_acquires;
+extern atomic_t fscache_n_acquires_ok;
+extern atomic_t fscache_n_acquires_oom;
+
+extern atomic_t fscache_n_invalidates;
+
+extern atomic_t fscache_n_relinquishes;
+extern atomic_t fscache_n_relinquishes_retire;
+extern atomic_t fscache_n_relinquishes_dropped;
+
+extern atomic_t fscache_n_resizes;
+extern atomic_t fscache_n_resizes_null;
+
+static inline void fscache_stat(atomic_t *stat)
+{
+ atomic_inc(stat);
+}
+
+static inline void fscache_stat_d(atomic_t *stat)
+{
+ atomic_dec(stat);
+}
+
+#define __fscache_stat(stat) (stat)
+
+int fscache_stats_show(struct seq_file *m);
+#else
+
+#define __fscache_stat(stat) (NULL)
+#define fscache_stat(stat) do {} while (0)
+#define fscache_stat_d(stat) do {} while (0)
+
+static inline int fscache_stats_show(struct seq_file *m) { return 0; }
+#endif
+
+/*
+ * fscache-volume.c
+ */
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_volumes_seq_ops;
+#endif
+
+struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where);
+void fscache_put_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where);
+bool fscache_begin_volume_access(struct fscache_volume *volume,
+ struct fscache_cookie *cookie,
+ enum fscache_access_trace why);
+void fscache_create_volume(struct fscache_volume *volume, bool wait);
+
/*****************************************************************************/
/*
* debug tracing
@@ -143,3 +373,57 @@ do { \
#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
#endif
+
+/*
+ * assertions
+ */
+#if 1 /* defined(__KDEBUGALL) */
+
+#define ASSERT(X) \
+do { \
+ if (unlikely(!(X))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTCMP(X, OP, Y) \
+do { \
+ if (unlikely(!((X) OP (Y)))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTIF(C, X) \
+do { \
+ if (unlikely((C) && !(X))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTIFCMP(C, X, OP, Y) \
+do { \
+ if (unlikely((C) && !((X) OP (Y)))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while (0)
+
+#else
+
+#define ASSERT(X) do {} while (0)
+#define ASSERTCMP(X, OP, Y) do {} while (0)
+#define ASSERTIF(C, X) do {} while (0)
+#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
+
+#endif /* assert or not */
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 7f753380e047..e8ff1e61ce79 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -21,12 +21,7 @@
*/
static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
- struct iov_iter iter;
-
- iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
- iov_iter_zero(iov_iter_count(&iter), &iter);
+ iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
}
static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
@@ -46,14 +41,9 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq,
enum netfs_read_from_hole read_hole)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct iov_iter iter;
netfs_stat(&netfs_n_rh_read);
- iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
-
- cres->ops->read(cres, subreq->start, &iter, read_hole,
+ cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole,
netfs_cache_read_terminated, subreq);
}
@@ -88,6 +78,13 @@ static void netfs_read_from_server(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
netfs_stat(&netfs_n_rh_download);
+
+ if (rreq->origin != NETFS_DIO_READ &&
+ iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
+ pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n",
+ rreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->len,
+ subreq->transferred, subreq->flags);
rreq->netfs_ops->issue_read(subreq);
}
@@ -127,9 +124,10 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
/* We might have multiple writes from the same huge
* folio, but we mustn't unlock a folio more than once.
*/
- if (have_unlocked && folio_index(folio) <= unlocked)
+ if (have_unlocked && folio->index <= unlocked)
continue;
- unlocked = folio_index(folio);
+ unlocked = folio_next_index(folio) - 1;
+ trace_netfs_folio(folio, netfs_folio_trace_end_copy);
folio_end_fscache(folio);
have_unlocked = true;
}
@@ -201,7 +199,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
}
ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
- rreq->i_size, true);
+ subreq->len, rreq->i_size, true);
if (ret < 0) {
trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
@@ -260,6 +258,30 @@ static void netfs_rreq_short_read(struct netfs_io_request *rreq,
}
/*
+ * Reset the subrequest iterator prior to resubmission.
+ */
+static void netfs_reset_subreq_iter(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ size_t remaining = subreq->len - subreq->transferred;
+ size_t count = iov_iter_count(&subreq->io_iter);
+
+ if (count == remaining)
+ return;
+
+ _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n",
+ rreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->transferred,
+ subreq->len, rreq->i_size,
+ subreq->io_iter.iter_type);
+
+ if (count < remaining)
+ iov_iter_revert(&subreq->io_iter, remaining - count);
+ else
+ iov_iter_advance(&subreq->io_iter, count - remaining);
+}
+
+/*
* Resubmit any short or failed operations. Returns true if we got the rreq
* ref back.
*/
@@ -287,6 +309,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq)
trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
atomic_inc(&rreq->nr_outstanding);
+ netfs_reset_subreq_iter(rreq, subreq);
netfs_read_from_server(rreq, subreq);
} else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) {
netfs_rreq_short_read(rreq, subreq);
@@ -321,6 +344,43 @@ static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq)
}
/*
+ * Determine how much we can admit to having read from a DIO read.
+ */
+static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ unsigned int i;
+ size_t transferred = 0;
+
+ for (i = 0; i < rreq->direct_bv_count; i++)
+ flush_dcache_page(rreq->direct_bv[i].bv_page);
+
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ if (subreq->error || subreq->transferred == 0)
+ break;
+ transferred += subreq->transferred;
+ if (subreq->transferred < subreq->len)
+ break;
+ }
+
+ for (i = 0; i < rreq->direct_bv_count; i++)
+ flush_dcache_page(rreq->direct_bv[i].bv_page);
+
+ rreq->transferred = transferred;
+ task_io_account_read(transferred);
+
+ if (rreq->iocb) {
+ rreq->iocb->ki_pos += transferred;
+ if (rreq->iocb->ki_complete)
+ rreq->iocb->ki_complete(
+ rreq->iocb, rreq->error ? rreq->error : transferred);
+ }
+ if (rreq->netfs_ops->done)
+ rreq->netfs_ops->done(rreq);
+ inode_dio_end(rreq->inode);
+}
+
+/*
* Assess the state of a read request and decide what to do next.
*
* Note that we could be in an ordinary kernel thread, on a workqueue or in
@@ -340,8 +400,12 @@ again:
return;
}
- netfs_rreq_unlock_folios(rreq);
+ if (rreq->origin != NETFS_DIO_READ)
+ netfs_rreq_unlock_folios(rreq);
+ else
+ netfs_rreq_assess_dio(rreq);
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
@@ -399,9 +463,9 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq = subreq->rreq;
int u;
- _enter("[%u]{%llx,%lx},%zd",
- subreq->debug_index, subreq->start, subreq->flags,
- transferred_or_error);
+ _enter("R=%x[%x]{%llx,%lx},%zd",
+ rreq->debug_id, subreq->debug_index,
+ subreq->start, subreq->flags, transferred_or_error);
switch (subreq->source) {
case NETFS_READ_FROM_CACHE:
@@ -501,15 +565,20 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest
*/
static enum netfs_io_source
netfs_rreq_prepare_read(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
+ struct netfs_io_subrequest *subreq,
+ struct iov_iter *io_iter)
{
- enum netfs_io_source source;
+ enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
+ struct netfs_inode *ictx = netfs_inode(rreq->inode);
+ size_t lsize;
_enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
- source = netfs_cache_prepare_read(subreq, rreq->i_size);
- if (source == NETFS_INVALID_READ)
- goto out;
+ if (rreq->origin != NETFS_DIO_READ) {
+ source = netfs_cache_prepare_read(subreq, rreq->i_size);
+ if (source == NETFS_INVALID_READ)
+ goto out;
+ }
if (source == NETFS_DOWNLOAD_FROM_SERVER) {
/* Call out to the netfs to let it shrink the request to fit
@@ -518,19 +587,52 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
* to make serial calls, it can indicate a short read and then
* we will call it again.
*/
+ if (rreq->origin != NETFS_DIO_READ) {
+ if (subreq->start >= ictx->zero_point) {
+ source = NETFS_FILL_WITH_ZEROES;
+ goto set;
+ }
+ if (subreq->len > ictx->zero_point - subreq->start)
+ subreq->len = ictx->zero_point - subreq->start;
+ }
if (subreq->len > rreq->i_size - subreq->start)
subreq->len = rreq->i_size - subreq->start;
+ if (rreq->rsize && subreq->len > rreq->rsize)
+ subreq->len = rreq->rsize;
if (rreq->netfs_ops->clamp_length &&
!rreq->netfs_ops->clamp_length(subreq)) {
source = NETFS_INVALID_READ;
goto out;
}
+
+ if (subreq->max_nr_segs) {
+ lsize = netfs_limit_iter(io_iter, 0, subreq->len,
+ subreq->max_nr_segs);
+ if (subreq->len > lsize) {
+ subreq->len = lsize;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
+ }
+ }
}
- if (WARN_ON(subreq->len == 0))
+set:
+ if (subreq->len > rreq->len)
+ pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n",
+ rreq->debug_id, subreq->debug_index,
+ subreq->len, rreq->len);
+
+ if (WARN_ON(subreq->len == 0)) {
source = NETFS_INVALID_READ;
+ goto out;
+ }
+ subreq->source = source;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
+ subreq->io_iter = *io_iter;
+ iov_iter_truncate(&subreq->io_iter, subreq->len);
+ iov_iter_advance(io_iter, subreq->len);
out:
subreq->source = source;
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
@@ -541,6 +643,7 @@ out:
* Slice off a piece of a read request and submit an I/O request for it.
*/
static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
+ struct iov_iter *io_iter,
unsigned int *_debug_index)
{
struct netfs_io_subrequest *subreq;
@@ -552,7 +655,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
subreq->debug_index = (*_debug_index)++;
subreq->start = rreq->start + rreq->submitted;
- subreq->len = rreq->len - rreq->submitted;
+ subreq->len = io_iter->count;
_debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
@@ -565,7 +668,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
* (the starts must coincide), in which case, we go around the loop
* again and ask it to download the next piece.
*/
- source = netfs_rreq_prepare_read(rreq, subreq);
+ source = netfs_rreq_prepare_read(rreq, subreq, io_iter);
if (source == NETFS_INVALID_READ)
goto subreq_failed;
@@ -603,6 +706,7 @@ subreq_failed:
*/
int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
{
+ struct iov_iter io_iter;
unsigned int debug_index = 0;
int ret;
@@ -611,50 +715,71 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
if (rreq->len == 0) {
pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_zero_len);
return -EIO;
}
- INIT_WORK(&rreq->work, netfs_rreq_work);
+ if (rreq->origin == NETFS_DIO_READ)
+ inode_dio_begin(rreq->inode);
- if (sync)
- netfs_get_request(rreq, netfs_rreq_trace_get_hold);
+ // TODO: Use bounce buffer if requested
+ rreq->io_iter = rreq->iter;
+
+ INIT_WORK(&rreq->work, netfs_rreq_work);
/* Chop the read into slices according to what the cache and the netfs
* want and submit each one.
*/
+ netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding);
atomic_set(&rreq->nr_outstanding, 1);
+ io_iter = rreq->io_iter;
do {
- if (!netfs_rreq_submit_slice(rreq, &debug_index))
+ _debug("submit %llx + %zx >= %llx",
+ rreq->start, rreq->submitted, rreq->i_size);
+ if (rreq->origin == NETFS_DIO_READ &&
+ rreq->start + rreq->submitted >= rreq->i_size)
+ break;
+ if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index))
+ break;
+ if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
+ test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
break;
} while (rreq->submitted < rreq->len);
+ if (!rreq->submitted) {
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
+ ret = 0;
+ goto out;
+ }
+
if (sync) {
- /* Keep nr_outstanding incremented so that the ref always belongs to
- * us, and the service code isn't punted off to a random thread pool to
- * process.
+ /* Keep nr_outstanding incremented so that the ref always
+ * belongs to us, and the service code isn't punted off to a
+ * random thread pool to process. Note that this might start
+ * further work, such as writing to the cache.
*/
- for (;;) {
- wait_var_event(&rreq->nr_outstanding,
- atomic_read(&rreq->nr_outstanding) == 1);
+ wait_var_event(&rreq->nr_outstanding,
+ atomic_read(&rreq->nr_outstanding) == 1);
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_assess(rreq, false);
- if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
- break;
- cond_resched();
- }
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
+ wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
ret = rreq->error;
- if (ret == 0 && rreq->submitted < rreq->len) {
+ if (ret == 0 && rreq->submitted < rreq->len &&
+ rreq->origin != NETFS_DIO_READ) {
trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
ret = -EIO;
}
- netfs_put_request(rreq, false, netfs_rreq_trace_put_hold);
} else {
/* If we decrement nr_outstanding to 0, the ref belongs to us. */
if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_assess(rreq, false);
- ret = 0;
+ ret = -EIOCBQUEUED;
}
+
+out:
return ret;
}
diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c
index 2ff07ba655a0..b781bbbf1d8d 100644
--- a/fs/netfs/iterator.c
+++ b/fs/netfs/iterator.c
@@ -101,3 +101,100 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
return npages;
}
EXPORT_SYMBOL_GPL(netfs_extract_user_iter);
+
+/*
+ * Select the span of a bvec iterator we're going to use. Limit it by both maximum
+ * size and maximum number of segments. Returns the size of the span in bytes.
+ */
+static size_t netfs_limit_bvec(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ const struct bio_vec *bvecs = iter->bvec;
+ unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0;
+ size_t len, span = 0, n = iter->count;
+ size_t skip = iter->iov_offset + start_offset;
+
+ if (WARN_ON(!iov_iter_is_bvec(iter)) ||
+ WARN_ON(start_offset > n) ||
+ n == 0)
+ return 0;
+
+ while (n && ix < nbv && skip) {
+ len = bvecs[ix].bv_len;
+ if (skip < len)
+ break;
+ skip -= len;
+ n -= len;
+ ix++;
+ }
+
+ while (n && ix < nbv) {
+ len = min3(n, bvecs[ix].bv_len - skip, max_size);
+ span += len;
+ nsegs++;
+ ix++;
+ if (span >= max_size || nsegs >= max_segs)
+ break;
+ skip = 0;
+ n -= len;
+ }
+
+ return min(span, max_size);
+}
+
+/*
+ * Select the span of an xarray iterator we're going to use. Limit it by both
+ * maximum size and maximum number of segments. It is assumed that segments
+ * can be larger than a page in size, provided they're physically contiguous.
+ * Returns the size of the span in bytes.
+ */
+static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ struct folio *folio;
+ unsigned int nsegs = 0;
+ loff_t pos = iter->xarray_start + iter->iov_offset;
+ pgoff_t index = pos / PAGE_SIZE;
+ size_t span = 0, n = iter->count;
+
+ XA_STATE(xas, iter->xarray, index);
+
+ if (WARN_ON(!iov_iter_is_xarray(iter)) ||
+ WARN_ON(start_offset > n) ||
+ n == 0)
+ return 0;
+ max_size = min(max_size, n - start_offset);
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, ULONG_MAX) {
+ size_t offset, flen, len;
+ if (xas_retry(&xas, folio))
+ continue;
+ if (WARN_ON(xa_is_value(folio)))
+ break;
+ if (WARN_ON(folio_test_hugetlb(folio)))
+ break;
+
+ flen = folio_size(folio);
+ offset = offset_in_folio(folio, pos);
+ len = min(max_size, flen - offset);
+ span += len;
+ nsegs++;
+ if (span >= max_size || nsegs >= max_segs)
+ break;
+ }
+
+ rcu_read_unlock();
+ return min(span, max_size);
+}
+
+size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ if (iov_iter_is_bvec(iter))
+ return netfs_limit_bvec(iter, start_offset, max_size, max_segs);
+ if (iov_iter_is_xarray(iter))
+ return netfs_limit_xarray(iter, start_offset, max_size, max_segs);
+ BUG();
+}
+EXPORT_SYMBOL(netfs_limit_iter);
diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c
new file mode 100644
index 000000000000..75dc52a49b3a
--- /dev/null
+++ b/fs/netfs/locking.c
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * I/O and data path helper functionality.
+ *
+ * Borrowed from NFS Copyright (c) 2016 Trond Myklebust
+ */
+
+#include <linux/kernel.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/*
+ * inode_dio_wait_interruptible - wait for outstanding DIO requests to finish
+ * @inode: inode to wait for
+ *
+ * Waits for all pending direct I/O requests to finish so that we can
+ * proceed with a truncate or equivalent operation.
+ *
+ * Must be called under a lock that serializes taking new references
+ * to i_dio_count, usually by inode->i_mutex.
+ */
+static int inode_dio_wait_interruptible(struct inode *inode)
+{
+ if (!atomic_read(&inode->i_dio_count))
+ return 0;
+
+ wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
+ DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
+
+ for (;;) {
+ prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE);
+ if (!atomic_read(&inode->i_dio_count))
+ break;
+ if (signal_pending(current))
+ break;
+ schedule();
+ }
+ finish_wait(wq, &q.wq_entry);
+
+ return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0;
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static int netfs_block_o_direct(struct netfs_inode *ictx)
+{
+ if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags))
+ return 0;
+ clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ return inode_dio_wait_interruptible(&ictx->inode);
+}
+
+/**
+ * netfs_start_io_read - declare the file is being used for buffered reads
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * NETFS_ICTX_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+int netfs_start_io_read(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+
+ /* Be an optimist! */
+ if (down_read_interruptible(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) == 0)
+ return 0;
+ up_read(&inode->i_rwsem);
+
+ /* Slow path.... */
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (netfs_block_o_direct(ictx) < 0) {
+ up_write(&inode->i_rwsem);
+ return -ERESTARTSYS;
+ }
+ downgrade_write(&inode->i_rwsem);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_read);
+
+/**
+ * netfs_end_io_read - declare that the buffered read operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_read(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_read(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_read);
+
+/**
+ * netfs_start_io_write - declare the file is being used for buffered writes
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+int netfs_start_io_write(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (netfs_block_o_direct(ictx) < 0) {
+ up_write(&inode->i_rwsem);
+ return -ERESTARTSYS;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_write);
+
+/**
+ * netfs_end_io_write - declare that the buffered write operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_write(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_write(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_write);
+
+/* Call with exclusively locked inode->i_rwsem */
+static int netfs_block_buffered(struct inode *inode)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+ int ret;
+
+ if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) {
+ set_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ if (inode->i_mapping->nrpages != 0) {
+ unmap_mapping_range(inode->i_mapping, 0, 0, 0);
+ ret = filemap_fdatawait(inode->i_mapping);
+ if (ret < 0) {
+ clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * netfs_start_io_direct - declare the file is being used for direct i/o
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * NETFS_ICTX_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+int netfs_start_io_direct(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+ int ret;
+
+ /* Be an optimist! */
+ if (down_read_interruptible(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) != 0)
+ return 0;
+ up_read(&inode->i_rwsem);
+
+ /* Slow path.... */
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ ret = netfs_block_buffered(inode);
+ if (ret < 0) {
+ up_write(&inode->i_rwsem);
+ return ret;
+ }
+ downgrade_write(&inode->i_rwsem);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_direct);
+
+/**
+ * netfs_end_io_direct - declare that the direct i/o operation is done
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_direct(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_read(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_direct);
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 068568702957..5e77618a7940 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -7,6 +7,8 @@
#include <linux/module.h>
#include <linux/export.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/netfs.h>
@@ -15,6 +17,113 @@ MODULE_DESCRIPTION("Network fs support");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");
+EXPORT_TRACEPOINT_SYMBOL(netfs_sreq);
+
unsigned netfs_debug;
module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
+
+#ifdef CONFIG_PROC_FS
+LIST_HEAD(netfs_io_requests);
+DEFINE_SPINLOCK(netfs_proc_lock);
+
+static const char *netfs_origins[nr__netfs_io_origin] = {
+ [NETFS_READAHEAD] = "RA",
+ [NETFS_READPAGE] = "RP",
+ [NETFS_READ_FOR_WRITE] = "RW",
+ [NETFS_WRITEBACK] = "WB",
+ [NETFS_WRITETHROUGH] = "WT",
+ [NETFS_LAUNDER_WRITE] = "LW",
+ [NETFS_UNBUFFERED_WRITE] = "UW",
+ [NETFS_DIO_READ] = "DR",
+ [NETFS_DIO_WRITE] = "DW",
+};
+
+/*
+ * Generate a list of I/O requests in /proc/fs/netfs/requests
+ */
+static int netfs_requests_seq_show(struct seq_file *m, void *v)
+{
+ struct netfs_io_request *rreq;
+
+ if (v == &netfs_io_requests) {
+ seq_puts(m,
+ "REQUEST OR REF FL ERR OPS COVERAGE\n"
+ "======== == === == ==== === =========\n"
+ );
+ return 0;
+ }
+
+ rreq = list_entry(v, struct netfs_io_request, proc_link);
+ seq_printf(m,
+ "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx",
+ rreq->debug_id,
+ netfs_origins[rreq->origin],
+ refcount_read(&rreq->ref),
+ rreq->flags,
+ rreq->error,
+ atomic_read(&rreq->nr_outstanding),
+ rreq->start, rreq->submitted, rreq->len);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static void *netfs_requests_seq_start(struct seq_file *m, loff_t *_pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return seq_list_start_head(&netfs_io_requests, *_pos);
+}
+
+static void *netfs_requests_seq_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+ return seq_list_next(v, &netfs_io_requests, _pos);
+}
+
+static void netfs_requests_seq_stop(struct seq_file *m, void *v)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+static const struct seq_operations netfs_requests_seq_ops = {
+ .start = netfs_requests_seq_start,
+ .next = netfs_requests_seq_next,
+ .stop = netfs_requests_seq_stop,
+ .show = netfs_requests_seq_show,
+};
+#endif /* CONFIG_PROC_FS */
+
+static int __init netfs_init(void)
+{
+ int ret = -ENOMEM;
+
+ if (!proc_mkdir("fs/netfs", NULL))
+ goto error;
+ if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,
+ &netfs_requests_seq_ops))
+ goto error_proc;
+#ifdef CONFIG_FSCACHE_STATS
+ if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
+ netfs_stats_show))
+ goto error_proc;
+#endif
+
+ ret = fscache_init();
+ if (ret < 0)
+ goto error_proc;
+ return 0;
+
+error_proc:
+ remove_proc_entry("fs/netfs", NULL);
+error:
+ return ret;
+}
+fs_initcall(netfs_init);
+
+static void __exit netfs_exit(void)
+{
+ fscache_exit();
+ remove_proc_entry("fs/netfs", NULL);
+}
+module_exit(netfs_exit);
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
new file mode 100644
index 000000000000..90051ced8e2a
--- /dev/null
+++ b/fs/netfs/misc.c
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Miscellaneous routines.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/swap.h>
+#include "internal.h"
+
+/*
+ * Attach a folio to the buffer and maybe set marks on it to say that we need
+ * to put the folio later and twiddle the pagecache flags.
+ */
+int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
+ struct folio *folio, unsigned int flags,
+ gfp_t gfp_mask)
+{
+ XA_STATE_ORDER(xas, xa, index, folio_order(folio));
+
+retry:
+ xas_lock(&xas);
+ for (;;) {
+ xas_store(&xas, folio);
+ if (!xas_error(&xas))
+ break;
+ xas_unlock(&xas);
+ if (!xas_nomem(&xas, gfp_mask))
+ return xas_error(&xas);
+ goto retry;
+ }
+
+ if (flags & NETFS_FLAG_PUT_MARK)
+ xas_set_mark(&xas, NETFS_BUF_PUT_MARK);
+ if (flags & NETFS_FLAG_PAGECACHE_MARK)
+ xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK);
+ xas_unlock(&xas);
+ return xas_error(&xas);
+}
+
+/*
+ * Create the specified range of folios in the buffer attached to the read
+ * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that
+ * these need freeing later.
+ */
+int netfs_add_folios_to_buffer(struct xarray *buffer,
+ struct address_space *mapping,
+ pgoff_t index, pgoff_t to, gfp_t gfp_mask)
+{
+ struct folio *folio;
+ int ret;
+
+ if (to + 1 == index) /* Page range is inclusive */
+ return 0;
+
+ do {
+ /* TODO: Figure out what order folio can be allocated here */
+ folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0);
+ if (!folio)
+ return -ENOMEM;
+ folio->index = index;
+ ret = netfs_xa_store_and_mark(buffer, index, folio,
+ NETFS_FLAG_PUT_MARK, gfp_mask);
+ if (ret < 0) {
+ folio_put(folio);
+ return ret;
+ }
+
+ index += folio_nr_pages(folio);
+ } while (index <= to && index != 0);
+
+ return 0;
+}
+
+/*
+ * Clear an xarray buffer, putting a ref on the folios that have
+ * NETFS_BUF_PUT_MARK set.
+ */
+void netfs_clear_buffer(struct xarray *buffer)
+{
+ struct folio *folio;
+ XA_STATE(xas, buffer, 0);
+
+ rcu_read_lock();
+ xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) {
+ folio_put(folio);
+ }
+ rcu_read_unlock();
+ xa_destroy(buffer);
+}
+
+/**
+ * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback
+ * @mapping: The mapping the folio belongs to.
+ * @folio: The folio being dirtied.
+ *
+ * Set the dirty flag on a folio and pin an in-use cache object in memory so
+ * that writeback can later write to it. This is intended to be called from
+ * the filesystem's ->dirty_folio() method.
+ *
+ * Return: true if the dirty flag was set on the folio, false otherwise.
+ */
+bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ struct fscache_cookie *cookie = netfs_i_cookie(ictx);
+ bool need_use = false;
+
+ _enter("");
+
+ if (!filemap_dirty_folio(mapping, folio))
+ return false;
+ if (!fscache_cookie_valid(cookie))
+ return true;
+
+ if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+ inode->i_state |= I_PINNING_NETFS_WB;
+ need_use = true;
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (need_use)
+ fscache_use_cookie(cookie, true);
+ }
+ return true;
+}
+EXPORT_SYMBOL(netfs_dirty_folio);
+
+/**
+ * netfs_unpin_writeback - Unpin writeback resources
+ * @inode: The inode on which the cookie resides
+ * @wbc: The writeback control
+ *
+ * Unpin the writeback resources pinned by netfs_dirty_folio(). This is
+ * intended to be called as/by the netfs's ->write_inode() method.
+ */
+int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc)
+{
+ struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
+
+ if (wbc->unpinned_netfs_wb)
+ fscache_unuse_cookie(cookie, NULL, NULL);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_unpin_writeback);
+
+/**
+ * netfs_clear_inode_writeback - Clear writeback resources pinned by an inode
+ * @inode: The inode to clean up
+ * @aux: Auxiliary data to apply to the inode
+ *
+ * Clear any writeback resources held by an inode when the inode is evicted.
+ * This must be called before clear_inode() is called.
+ */
+void netfs_clear_inode_writeback(struct inode *inode, const void *aux)
+{
+ struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
+
+ if (inode->i_state & I_PINNING_NETFS_WB) {
+ loff_t i_size = i_size_read(inode);
+ fscache_unuse_cookie(cookie, aux, &i_size);
+ }
+}
+EXPORT_SYMBOL(netfs_clear_inode_writeback);
+
+/**
+ * netfs_invalidate_folio - Invalidate or partially invalidate a folio
+ * @folio: Folio proposed for release
+ * @offset: Offset of the invalidated region
+ * @length: Length of the invalidated region
+ *
+ * Invalidate part or all of a folio for a network filesystem. The folio will
+ * be removed afterwards if the invalidated region covers the entire folio.
+ */
+void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
+{
+ struct netfs_folio *finfo = NULL;
+ size_t flen = folio_size(folio);
+
+ _enter("{%lx},%zx,%zx", folio->index, offset, length);
+
+ folio_wait_fscache(folio);
+
+ if (!folio_test_private(folio))
+ return;
+
+ finfo = netfs_folio_info(folio);
+
+ if (offset == 0 && length >= flen)
+ goto erase_completely;
+
+ if (finfo) {
+ /* We have a partially uptodate page from a streaming write. */
+ unsigned int fstart = finfo->dirty_offset;
+ unsigned int fend = fstart + finfo->dirty_len;
+ unsigned int end = offset + length;
+
+ if (offset >= fend)
+ return;
+ if (end <= fstart)
+ return;
+ if (offset <= fstart && end >= fend)
+ goto erase_completely;
+ if (offset <= fstart && end > fstart)
+ goto reduce_len;
+ if (offset > fstart && end >= fend)
+ goto move_start;
+ /* A partial write was split. The caller has already zeroed
+ * it, so just absorb the hole.
+ */
+ }
+ return;
+
+erase_completely:
+ netfs_put_group(netfs_folio_group(folio));
+ folio_detach_private(folio);
+ folio_clear_uptodate(folio);
+ kfree(finfo);
+ return;
+reduce_len:
+ finfo->dirty_len = offset + length - finfo->dirty_offset;
+ return;
+move_start:
+ finfo->dirty_len -= offset - finfo->dirty_offset;
+ finfo->dirty_offset = offset;
+}
+EXPORT_SYMBOL(netfs_invalidate_folio);
+
+/**
+ * netfs_release_folio - Try to release a folio
+ * @folio: Folio proposed for release
+ * @gfp: Flags qualifying the release
+ *
+ * Request release of a folio and clean up its private state if it's not busy.
+ * Returns true if the folio can now be released, false if not
+ */
+bool netfs_release_folio(struct folio *folio, gfp_t gfp)
+{
+ struct netfs_inode *ctx = netfs_inode(folio_inode(folio));
+ unsigned long long end;
+
+ end = folio_pos(folio) + folio_size(folio);
+ if (end > ctx->zero_point)
+ ctx->zero_point = end;
+
+ if (folio_test_private(folio))
+ return false;
+ if (folio_test_fscache(folio)) {
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return false;
+ folio_wait_fscache(folio);
+ }
+
+ fscache_note_page_release(netfs_i_cookie(ctx));
+ return true;
+}
+EXPORT_SYMBOL(netfs_release_folio);
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index e17cdf53f6a7..610ceb5bd86c 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -20,14 +20,20 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
struct inode *inode = file ? file_inode(file) : mapping->host;
struct netfs_inode *ctx = netfs_inode(inode);
struct netfs_io_request *rreq;
+ bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE ||
+ origin == NETFS_DIO_READ ||
+ origin == NETFS_DIO_WRITE);
+ bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx);
int ret;
- rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
+ rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
+ GFP_KERNEL);
if (!rreq)
return ERR_PTR(-ENOMEM);
rreq->start = start;
rreq->len = len;
+ rreq->upper_len = len;
rreq->origin = origin;
rreq->netfs_ops = ctx->ops;
rreq->mapping = mapping;
@@ -35,8 +41,14 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
rreq->i_size = i_size_read(inode);
rreq->debug_id = atomic_inc_return(&debug_ids);
INIT_LIST_HEAD(&rreq->subrequests);
+ INIT_WORK(&rreq->work, NULL);
refcount_set(&rreq->ref, 1);
+
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ if (cached)
+ __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+ if (file && file->f_flags & O_NONBLOCK)
+ __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
if (rreq->netfs_ops->init_request) {
ret = rreq->netfs_ops->init_request(rreq, file);
if (ret < 0) {
@@ -45,6 +57,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
}
}
+ trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new);
+ netfs_proc_add_rreq(rreq);
netfs_stat(&netfs_n_rh_rreq);
return rreq;
}
@@ -74,33 +88,47 @@ static void netfs_free_request(struct work_struct *work)
{
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
+ unsigned int i;
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+ netfs_proc_del_rreq(rreq);
netfs_clear_subrequests(rreq, false);
if (rreq->netfs_ops->free_request)
rreq->netfs_ops->free_request(rreq);
if (rreq->cache_resources.ops)
rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
- kfree(rreq);
+ if (rreq->direct_bv) {
+ for (i = 0; i < rreq->direct_bv_count; i++) {
+ if (rreq->direct_bv[i].bv_page) {
+ if (rreq->direct_bv_unpin)
+ unpin_user_page(rreq->direct_bv[i].bv_page);
+ }
+ }
+ kvfree(rreq->direct_bv);
+ }
+ kfree_rcu(rreq, rcu);
netfs_stat_d(&netfs_n_rh_rreq);
}
void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
enum netfs_rreq_ref_trace what)
{
- unsigned int debug_id = rreq->debug_id;
+ unsigned int debug_id;
bool dead;
int r;
- dead = __refcount_dec_and_test(&rreq->ref, &r);
- trace_netfs_rreq_ref(debug_id, r - 1, what);
- if (dead) {
- if (was_async) {
- rreq->work.func = netfs_free_request;
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
- } else {
- netfs_free_request(&rreq->work);
+ if (rreq) {
+ debug_id = rreq->debug_id;
+ dead = __refcount_dec_and_test(&rreq->ref, &r);
+ trace_netfs_rreq_ref(debug_id, r - 1, what);
+ if (dead) {
+ if (was_async) {
+ rreq->work.func = netfs_free_request;
+ if (!queue_work(system_unbound_wq, &rreq->work))
+ BUG();
+ } else {
+ netfs_free_request(&rreq->work);
+ }
}
}
}
@@ -112,8 +140,11 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq
{
struct netfs_io_subrequest *subreq;
- subreq = kzalloc(sizeof(struct netfs_io_subrequest), GFP_KERNEL);
+ subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?:
+ sizeof(struct netfs_io_subrequest),
+ GFP_KERNEL);
if (subreq) {
+ INIT_WORK(&subreq->work, NULL);
INIT_LIST_HEAD(&subreq->rreq_link);
refcount_set(&subreq->ref, 2);
subreq->rreq = rreq;
@@ -140,6 +171,8 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq = subreq->rreq;
trace_netfs_sreq(subreq, netfs_sreq_trace_free);
+ if (rreq->netfs_ops->free_subrequest)
+ rreq->netfs_ops->free_subrequest(subreq);
kfree(subreq);
netfs_stat_d(&netfs_n_rh_sreq);
netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
diff --git a/fs/netfs/output.c b/fs/netfs/output.c
new file mode 100644
index 000000000000..625eb68f3e5a
--- /dev/null
+++ b/fs/netfs/output.c
@@ -0,0 +1,478 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/**
+ * netfs_create_write_request - Create a write operation.
+ * @wreq: The write request this is storing from.
+ * @dest: The destination type
+ * @start: Start of the region this write will modify
+ * @len: Length of the modification
+ * @worker: The worker function to handle the write(s)
+ *
+ * Allocate a write operation, set it up and add it to the list on a write
+ * request.
+ */
+struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq,
+ enum netfs_io_source dest,
+ loff_t start, size_t len,
+ work_func_t worker)
+{
+ struct netfs_io_subrequest *subreq;
+
+ subreq = netfs_alloc_subrequest(wreq);
+ if (subreq) {
+ INIT_WORK(&subreq->work, worker);
+ subreq->source = dest;
+ subreq->start = start;
+ subreq->len = len;
+ subreq->debug_index = wreq->subreq_counter++;
+
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write);
+ break;
+ default:
+ BUG();
+ }
+
+ subreq->io_iter = wreq->io_iter;
+ iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start);
+ iov_iter_truncate(&subreq->io_iter, subreq->len);
+
+ trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+ refcount_read(&subreq->ref),
+ netfs_sreq_trace_new);
+ atomic_inc(&wreq->nr_outstanding);
+ list_add_tail(&subreq->rreq_link, &wreq->subrequests);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+ }
+
+ return subreq;
+}
+EXPORT_SYMBOL(netfs_create_write_request);
+
+/*
+ * Process a completed write request once all the component operations have
+ * been completed.
+ */
+static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+ size_t transferred = 0;
+
+ _enter("R=%x[]", wreq->debug_id);
+
+ trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+ list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
+ if (subreq->error || subreq->transferred == 0)
+ break;
+ transferred += subreq->transferred;
+ if (subreq->transferred < subreq->len)
+ break;
+ }
+ wreq->transferred = transferred;
+
+ list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
+ if (!subreq->error)
+ continue;
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ /* Depending on the type of failure, this may prevent
+ * writeback completion unless we're in disconnected
+ * mode.
+ */
+ if (!wreq->error)
+ wreq->error = subreq->error;
+ break;
+
+ case NETFS_WRITE_TO_CACHE:
+ /* Failure doesn't prevent writeback completion unless
+ * we're in disconnected mode.
+ */
+ if (subreq->error != -ENOBUFS)
+ ctx->ops->invalidate_cache(wreq);
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ if (!wreq->error)
+ wreq->error = -EIO;
+ return;
+ }
+ }
+
+ wreq->cleanup(wreq);
+
+ if (wreq->origin == NETFS_DIO_WRITE &&
+ wreq->mapping->nrpages) {
+ pgoff_t first = wreq->start >> PAGE_SHIFT;
+ pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+ invalidate_inode_pages2_range(wreq->mapping, first, last);
+ }
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_end(wreq->inode);
+
+ _debug("finished");
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
+ clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+ wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+ if (wreq->iocb) {
+ wreq->iocb->ki_pos += transferred;
+ if (wreq->iocb->ki_complete)
+ wreq->iocb->ki_complete(
+ wreq->iocb, wreq->error ? wreq->error : transferred);
+ }
+
+ netfs_clear_subrequests(wreq, was_async);
+ netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete);
+}
+
+/*
+ * Deal with the completion of writing the data to the cache.
+ */
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_io_subrequest *subreq = _op;
+ struct netfs_io_request *wreq = subreq->rreq;
+ unsigned int u;
+
+ _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
+
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_done);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_done);
+ break;
+ case NETFS_INVALID_WRITE:
+ break;
+ default:
+ BUG();
+ }
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ subreq->error = transferred_or_error;
+ trace_netfs_failure(wreq, subreq, transferred_or_error,
+ netfs_fail_write);
+ goto failed;
+ }
+
+ if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+ "Subreq excess write: R%x[%x] %zd > %zu - %zu",
+ wreq->debug_id, subreq->debug_index,
+ transferred_or_error, subreq->len, subreq->transferred))
+ transferred_or_error = subreq->len - subreq->transferred;
+
+ subreq->error = 0;
+ subreq->transferred += transferred_or_error;
+
+ if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
+ pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n",
+ wreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->len,
+ subreq->transferred, subreq->io_iter.iter_type);
+
+ if (subreq->transferred < subreq->len)
+ goto incomplete;
+
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+out:
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ /* If we decrement nr_outstanding to 0, the ref belongs to us. */
+ u = atomic_dec_return(&wreq->nr_outstanding);
+ if (u == 0)
+ netfs_write_terminated(wreq, was_async);
+ else if (u == 1)
+ wake_up_var(&wreq->nr_outstanding);
+
+ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+ return;
+
+incomplete:
+ if (transferred_or_error == 0) {
+ if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
+ subreq->error = -ENODATA;
+ goto failed;
+ }
+ } else {
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ }
+
+ __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
+ goto out;
+
+failed:
+ switch (subreq->source) {
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_failed);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
+ break;
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_failed);
+ set_bit(NETFS_RREQ_FAILED, &wreq->flags);
+ wreq->error = subreq->error;
+ break;
+ default:
+ break;
+ }
+ goto out;
+}
+EXPORT_SYMBOL(netfs_write_subrequest_terminated);
+
+static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct netfs_cache_resources *cres = &wreq->cache_resources;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+
+ cres->ops->write(cres, subreq->start, &subreq->io_iter,
+ netfs_write_subrequest_terminated, subreq);
+}
+
+static void netfs_write_to_cache_op_worker(struct work_struct *work)
+{
+ struct netfs_io_subrequest *subreq =
+ container_of(work, struct netfs_io_subrequest, work);
+
+ netfs_write_to_cache_op(subreq);
+}
+
+/**
+ * netfs_queue_write_request - Queue a write request for attention
+ * @subreq: The write request to be queued
+ *
+ * Queue the specified write request for processing by a worker thread. We
+ * pass the caller's ref on the request to the worker thread.
+ */
+void netfs_queue_write_request(struct netfs_io_subrequest *subreq)
+{
+ if (!queue_work(system_unbound_wq, &subreq->work))
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip);
+}
+EXPORT_SYMBOL(netfs_queue_write_request);
+
+/*
+ * Set up a op for writing to the cache.
+ */
+static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq)
+{
+ struct netfs_cache_resources *cres = &wreq->cache_resources;
+ struct netfs_io_subrequest *subreq;
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+ struct fscache_cookie *cookie = netfs_i_cookie(ctx);
+ loff_t start = wreq->start;
+ size_t len = wreq->len;
+ int ret;
+
+ if (!fscache_cookie_enabled(cookie)) {
+ clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags);
+ return;
+ }
+
+ _debug("write to cache");
+ ret = fscache_begin_write_operation(cres, cookie);
+ if (ret < 0)
+ return;
+
+ ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len,
+ i_size_read(wreq->inode), true);
+ if (ret < 0)
+ return;
+
+ subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len,
+ netfs_write_to_cache_op_worker);
+ if (!subreq)
+ return;
+
+ netfs_write_to_cache_op(subreq);
+}
+
+/*
+ * Begin the process of writing out a chunk of data.
+ *
+ * We are given a write request that holds a series of dirty regions and
+ * (partially) covers a sequence of folios, all of which are present. The
+ * pages must have been marked as writeback as appropriate.
+ *
+ * We need to perform the following steps:
+ *
+ * (1) If encrypting, create an output buffer and encrypt each block of the
+ * data into it, otherwise the output buffer will point to the original
+ * folios.
+ *
+ * (2) If the data is to be cached, set up a write op for the entire output
+ * buffer to the cache, if the cache wants to accept it.
+ *
+ * (3) If the data is to be uploaded (ie. not merely cached):
+ *
+ * (a) If the data is to be compressed, create a compression buffer and
+ * compress the data into it.
+ *
+ * (b) For each destination we want to upload to, set up write ops to write
+ * to that destination. We may need multiple writes if the data is not
+ * contiguous or the span exceeds wsize for a server.
+ */
+int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
+ enum netfs_write_trace what)
+{
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+
+ _enter("R=%x %llx-%llx f=%lx",
+ wreq->debug_id, wreq->start, wreq->start + wreq->len - 1,
+ wreq->flags);
+
+ trace_netfs_write(wreq, what);
+ if (wreq->len == 0 || wreq->iter.count == 0) {
+ pr_err("Zero-sized write [R=%x]\n", wreq->debug_id);
+ return -EIO;
+ }
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_begin(wreq->inode);
+
+ wreq->io_iter = wreq->iter;
+
+ /* ->outstanding > 0 carries a ref */
+ netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
+ atomic_set(&wreq->nr_outstanding, 1);
+
+ /* Start the encryption/compression going. We can do that in the
+ * background whilst we generate a list of write ops that we want to
+ * perform.
+ */
+ // TODO: Encrypt or compress the region as appropriate
+
+ /* We need to write all of the region to the cache */
+ if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
+ netfs_set_up_write_to_cache(wreq);
+
+ /* However, we don't necessarily write all of the region to the server.
+ * Caching of reads is being managed this way also.
+ */
+ if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
+ ctx->ops->create_write_requests(wreq, wreq->start, wreq->len);
+
+ if (atomic_dec_and_test(&wreq->nr_outstanding))
+ netfs_write_terminated(wreq, false);
+
+ if (!may_wait)
+ return -EIOCBQUEUED;
+
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+ return wreq->error;
+}
+
+/*
+ * Begin a write operation for writing through the pagecache.
+ */
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
+{
+ struct netfs_io_request *wreq;
+ struct file *file = iocb->ki_filp;
+
+ wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len,
+ NETFS_WRITETHROUGH);
+ if (IS_ERR(wreq))
+ return wreq;
+
+ trace_netfs_write(wreq, netfs_write_trace_writethrough);
+
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0);
+ wreq->io_iter = wreq->iter;
+
+ /* ->outstanding > 0 carries a ref */
+ netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
+ atomic_set(&wreq->nr_outstanding, 1);
+ return wreq;
+}
+
+static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final)
+{
+ struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ unsigned long long start;
+ size_t len;
+
+ if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
+ return;
+
+ start = wreq->start + wreq->submitted;
+ len = wreq->iter.count - wreq->submitted;
+ if (!final) {
+ len /= wreq->wsize; /* Round to number of maximum packets */
+ len *= wreq->wsize;
+ }
+
+ ictx->ops->create_write_requests(wreq, start, len);
+ wreq->submitted += len;
+}
+
+/*
+ * Advance the state of the write operation used when writing through the
+ * pagecache. Data has been copied into the pagecache that we need to append
+ * to the request. If we've added more than wsize then we need to create a new
+ * subrequest.
+ */
+int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end)
+{
+ _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u",
+ wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end);
+
+ wreq->iter.count += copied;
+ wreq->io_iter.count += copied;
+ if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize)
+ netfs_submit_writethrough(wreq, false);
+
+ return wreq->error;
+}
+
+/*
+ * End a write operation used when writing through the pagecache.
+ */
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb)
+{
+ int ret = -EIOCBQUEUED;
+
+ _enter("ic=%zu sb=%zu ws=%u",
+ wreq->iter.count, wreq->submitted, wreq->wsize);
+
+ if (wreq->submitted < wreq->io_iter.count)
+ netfs_submit_writethrough(wreq, true);
+
+ if (atomic_dec_and_test(&wreq->nr_outstanding))
+ netfs_write_terminated(wreq, false);
+
+ if (is_sync_kiocb(iocb)) {
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+ ret = wreq->error;
+ }
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ return ret;
+}
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index 5510a7a14a40..deeba9f9dcf5 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -9,6 +9,8 @@
#include <linux/seq_file.h>
#include "internal.h"
+atomic_t netfs_n_rh_dio_read;
+atomic_t netfs_n_rh_dio_write;
atomic_t netfs_n_rh_readahead;
atomic_t netfs_n_rh_readpage;
atomic_t netfs_n_rh_rreq;
@@ -27,32 +29,48 @@ atomic_t netfs_n_rh_write_begin;
atomic_t netfs_n_rh_write_done;
atomic_t netfs_n_rh_write_failed;
atomic_t netfs_n_rh_write_zskip;
+atomic_t netfs_n_wh_wstream_conflict;
+atomic_t netfs_n_wh_upload;
+atomic_t netfs_n_wh_upload_done;
+atomic_t netfs_n_wh_upload_failed;
+atomic_t netfs_n_wh_write;
+atomic_t netfs_n_wh_write_done;
+atomic_t netfs_n_wh_write_failed;
-void netfs_stats_show(struct seq_file *m)
+int netfs_stats_show(struct seq_file *m, void *v)
{
- seq_printf(m, "RdHelp : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n",
+ seq_printf(m, "Netfs : DR=%u DW=%u RA=%u RP=%u WB=%u WBZ=%u\n",
+ atomic_read(&netfs_n_rh_dio_read),
+ atomic_read(&netfs_n_rh_dio_write),
atomic_read(&netfs_n_rh_readahead),
atomic_read(&netfs_n_rh_readpage),
atomic_read(&netfs_n_rh_write_begin),
- atomic_read(&netfs_n_rh_write_zskip),
- atomic_read(&netfs_n_rh_rreq),
- atomic_read(&netfs_n_rh_sreq));
- seq_printf(m, "RdHelp : ZR=%u sh=%u sk=%u\n",
+ atomic_read(&netfs_n_rh_write_zskip));
+ seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n",
atomic_read(&netfs_n_rh_zero),
atomic_read(&netfs_n_rh_short_read),
atomic_read(&netfs_n_rh_write_zskip));
- seq_printf(m, "RdHelp : DL=%u ds=%u df=%u di=%u\n",
+ seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n",
atomic_read(&netfs_n_rh_download),
atomic_read(&netfs_n_rh_download_done),
atomic_read(&netfs_n_rh_download_failed),
atomic_read(&netfs_n_rh_download_instead));
- seq_printf(m, "RdHelp : RD=%u rs=%u rf=%u\n",
+ seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n",
atomic_read(&netfs_n_rh_read),
atomic_read(&netfs_n_rh_read_done),
atomic_read(&netfs_n_rh_read_failed));
- seq_printf(m, "RdHelp : WR=%u ws=%u wf=%u\n",
- atomic_read(&netfs_n_rh_write),
- atomic_read(&netfs_n_rh_write_done),
- atomic_read(&netfs_n_rh_write_failed));
+ seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n",
+ atomic_read(&netfs_n_wh_upload),
+ atomic_read(&netfs_n_wh_upload_done),
+ atomic_read(&netfs_n_wh_upload_failed));
+ seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n",
+ atomic_read(&netfs_n_wh_write),
+ atomic_read(&netfs_n_wh_write_done),
+ atomic_read(&netfs_n_wh_write_failed));
+ seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n",
+ atomic_read(&netfs_n_rh_rreq),
+ atomic_read(&netfs_n_rh_sreq),
+ atomic_read(&netfs_n_wh_wstream_conflict));
+ return fscache_stats_show(m);
}
EXPORT_SYMBOL(netfs_stats_show);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 01ac733a6320..f7e32d76e34d 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -169,8 +169,8 @@ config ROOT_NFS
config NFS_FSCACHE
bool "Provide NFS client caching support"
- depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
- select NETFS_SUPPORT
+ depends on NFS_FS=m && NETFS_SUPPORT || NFS_FS=y && NETFS_SUPPORT=y
+ select FSCACHE
help
Say Y here if you want NFS data to be cached locally on disc through
the general filesystem cache manager
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index b05717fe0d4e..2d1bfee225c3 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -274,12 +274,6 @@ static void nfs_netfs_free_request(struct netfs_io_request *rreq)
put_nfs_open_context(rreq->netfs_priv);
}
-static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq)
-{
- return fscache_begin_read_operation(&rreq->cache_resources,
- netfs_i_cookie(netfs_inode(rreq->inode)));
-}
-
static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq)
{
struct nfs_netfs_io_data *netfs;
@@ -387,7 +381,6 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr)
const struct netfs_request_ops nfs_netfs_ops = {
.init_request = nfs_netfs_init_request,
.free_request = nfs_netfs_free_request,
- .begin_cache_operation = nfs_netfs_begin_cache_operation,
.issue_read = nfs_netfs_issue_read,
.clamp_length = nfs_netfs_clamp_length
};
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 5407ab8c8783..e3cb4923316b 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -80,7 +80,7 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
}
static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
{
- netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops);
+ netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false);
}
extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2fa54cfd4882..6dc6340e2852 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -7911,14 +7911,16 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
{
struct file_lock *fl;
int status = false;
- struct nfsd_file *nf = find_any_file(fp);
+ struct nfsd_file *nf;
struct inode *inode;
struct file_lock_context *flctx;
+ spin_lock(&fp->fi_lock);
+ nf = find_any_file_locked(fp);
if (!nf) {
/* Any valid lock stateid should have some sort of access */
WARN_ON_ONCE(1);
- return status;
+ goto out;
}
inode = file_inode(nf->nf_file);
@@ -7934,7 +7936,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
}
spin_unlock(&flctx->flc_lock);
}
- nfsd_file_put(nf);
+out:
+ spin_unlock(&fp->fi_lock);
return status;
}
@@ -7944,10 +7947,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
* @cstate: NFSv4 COMPOUND state
* @u: RELEASE_LOCKOWNER arguments
*
- * The lockowner's so_count is bumped when a lock record is added
- * or when copying a conflicting lock. The latter case is brief,
- * but can lead to fleeting false positives when looking for
- * locks-in-use.
+ * Check if theree are any locks still held and if not - free the lockowner
+ * and any lock state that is owned.
*
* Return values:
* %nfs_ok: lockowner released or not found
@@ -7983,10 +7984,13 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
spin_unlock(&clp->cl_lock);
return nfs_ok;
}
- if (atomic_read(&lo->lo_owner.so_count) != 2) {
- spin_unlock(&clp->cl_lock);
- nfs4_put_stateowner(&lo->lo_owner);
- return nfserr_locks_held;
+
+ list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) {
+ if (check_for_locks(stp->st_stid.sc_file, lo)) {
+ spin_unlock(&clp->cl_lock);
+ nfs4_put_stateowner(&lo->lo_owner);
+ return nfserr_locks_held;
+ }
}
unhash_lockowner_locked(lo);
while (!list_empty(&lo->lo_owner.so_stateids)) {
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 984ffdaeed6c..5764f91d283e 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -18,10 +18,11 @@
struct ovl_lookup_data {
struct super_block *sb;
- struct vfsmount *mnt;
+ const struct ovl_layer *layer;
struct qstr name;
bool is_dir;
bool opaque;
+ bool xwhiteouts;
bool stop;
bool last;
char *redirect;
@@ -201,17 +202,13 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
return real;
}
-static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path)
-{
- return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE);
-}
-
static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d,
const char *name,
struct dentry *base, int len,
bool drop_negative)
{
- struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->mnt), name, base, len);
+ struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name,
+ base, len);
if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
if (drop_negative && ret->d_lockref.count == 1) {
@@ -232,10 +229,13 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
size_t prelen, const char *post,
struct dentry **ret, bool drop_negative)
{
+ struct ovl_fs *ofs = OVL_FS(d->sb);
struct dentry *this;
struct path path;
int err;
bool last_element = !post[0];
+ bool is_upper = d->layer->idx == 0;
+ char val;
this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative);
if (IS_ERR(this)) {
@@ -253,8 +253,8 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
}
path.dentry = this;
- path.mnt = d->mnt;
- if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) {
+ path.mnt = d->layer->mnt;
+ if (ovl_path_is_whiteout(ofs, &path)) {
d->stop = d->opaque = true;
goto put_and_out;
}
@@ -272,7 +272,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
d->stop = true;
goto put_and_out;
}
- err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path, NULL);
+ err = ovl_check_metacopy_xattr(ofs, &path, NULL);
if (err < 0)
goto out_err;
@@ -292,7 +292,12 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
if (d->last)
goto out;
- if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) {
+ /* overlay.opaque=x means xwhiteouts directory */
+ val = ovl_get_opaquedir_val(ofs, &path);
+ if (last_element && !is_upper && val == 'x') {
+ d->xwhiteouts = true;
+ ovl_layer_set_xwhiteouts(ofs, d->layer);
+ } else if (val == 'y') {
d->stop = true;
if (last_element)
d->opaque = true;
@@ -863,7 +868,8 @@ fail:
* Returns next layer in stack starting from top.
* Returns -1 if this is the last layer.
*/
-int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path,
+ const struct ovl_layer **layer)
{
struct ovl_entry *oe = OVL_E(dentry);
struct ovl_path *lowerstack = ovl_lowerstack(oe);
@@ -871,13 +877,16 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
BUG_ON(idx < 0);
if (idx == 0) {
ovl_path_upper(dentry, path);
- if (path->dentry)
+ if (path->dentry) {
+ *layer = &OVL_FS(dentry->d_sb)->layers[0];
return ovl_numlower(oe) ? 1 : -1;
+ }
idx++;
}
BUG_ON(idx > ovl_numlower(oe));
path->dentry = lowerstack[idx - 1].dentry;
- path->mnt = lowerstack[idx - 1].layer->mnt;
+ *layer = lowerstack[idx - 1].layer;
+ path->mnt = (*layer)->mnt;
return (idx < ovl_numlower(oe)) ? idx + 1 : -1;
}
@@ -1055,7 +1064,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
old_cred = ovl_override_creds(dentry->d_sb);
upperdir = ovl_dentry_upper(dentry->d_parent);
if (upperdir) {
- d.mnt = ovl_upper_mnt(ofs);
+ d.layer = &ofs->layers[0];
err = ovl_lookup_layer(upperdir, &d, &upperdentry, true);
if (err)
goto out;
@@ -1111,7 +1120,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
else if (d.is_dir || !ofs->numdatalayer)
d.last = lower.layer->idx == ovl_numlower(roe);
- d.mnt = lower.layer->mnt;
+ d.layer = lower.layer;
err = ovl_lookup_layer(lower.dentry, &d, &this, false);
if (err)
goto out_put;
@@ -1278,6 +1287,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (upperopaque)
ovl_dentry_set_opaque(dentry);
+ if (d.xwhiteouts)
+ ovl_dentry_set_xwhiteouts(dentry);
if (upperdentry)
ovl_dentry_set_upper_alias(dentry);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 5ba11eb43767..ee949f3e7c77 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -50,7 +50,6 @@ enum ovl_xattr {
OVL_XATTR_METACOPY,
OVL_XATTR_PROTATTR,
OVL_XATTR_XWHITEOUT,
- OVL_XATTR_XWHITEOUTS,
};
enum ovl_inode_flag {
@@ -70,6 +69,8 @@ enum ovl_entry_flag {
OVL_E_UPPER_ALIAS,
OVL_E_OPAQUE,
OVL_E_CONNECTED,
+ /* Lower stack may contain xwhiteout entries */
+ OVL_E_XWHITEOUTS,
};
enum {
@@ -477,6 +478,10 @@ bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
bool ovl_dentry_is_whiteout(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry);
+bool ovl_dentry_has_xwhiteouts(struct dentry *dentry);
+void ovl_dentry_set_xwhiteouts(struct dentry *dentry);
+void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs,
+ const struct ovl_layer *layer);
bool ovl_dentry_has_upper_alias(struct dentry *dentry);
void ovl_dentry_set_upper_alias(struct dentry *dentry);
bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags);
@@ -494,11 +499,10 @@ struct file *ovl_path_open(const struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
void ovl_copy_up_end(struct dentry *dentry);
bool ovl_already_copied_up(struct dentry *dentry, int flags);
-bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
- enum ovl_xattr ox);
+char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path,
+ enum ovl_xattr ox);
bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path);
-bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
const struct path *upperpath);
@@ -573,7 +577,13 @@ static inline bool ovl_is_impuredir(struct super_block *sb,
.mnt = ovl_upper_mnt(ofs),
};
- return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE);
+ return ovl_get_dir_xattr_val(ofs, &upperpath, OVL_XATTR_IMPURE) == 'y';
+}
+
+static inline char ovl_get_opaquedir_val(struct ovl_fs *ofs,
+ const struct path *path)
+{
+ return ovl_get_dir_xattr_val(ofs, path, OVL_XATTR_OPAQUE);
}
static inline bool ovl_redirect_follow(struct ovl_fs *ofs)
@@ -680,7 +690,8 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh);
struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
struct dentry *origin, bool verify);
-int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path,
+ const struct ovl_layer **layer);
int ovl_verify_lowerdata(struct dentry *dentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 5fa9c58af65f..cb449ab310a7 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -40,6 +40,8 @@ struct ovl_layer {
int idx;
/* One fsid per unique underlying sb (upper fsid == 0) */
int fsid;
+ /* xwhiteouts were found on this layer */
+ bool has_xwhiteouts;
};
struct ovl_path {
@@ -59,7 +61,7 @@ struct ovl_fs {
unsigned int numfs;
/* Number of data-only lower layers */
unsigned int numdatalayer;
- const struct ovl_layer *layers;
+ struct ovl_layer *layers;
struct ovl_sb *fs;
/* workbasedir is the path at workdir= mount option */
struct dentry *workbasedir;
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index e71156baa7bc..0ca8af060b0c 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -305,8 +305,6 @@ static inline int ovl_dir_read(const struct path *realpath,
if (IS_ERR(realfile))
return PTR_ERR(realfile);
- rdd->in_xwhiteouts_dir = rdd->dentry &&
- ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath);
rdd->first_maybe_whiteout = NULL;
rdd->ctx.pos = 0;
do {
@@ -359,10 +357,13 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
.is_lowest = false,
};
int idx, next;
+ const struct ovl_layer *layer;
for (idx = 0; idx != -1; idx = next) {
- next = ovl_path_next(idx, dentry, &realpath);
+ next = ovl_path_next(idx, dentry, &realpath, &layer);
rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry;
+ rdd.in_xwhiteouts_dir = layer->has_xwhiteouts &&
+ ovl_dentry_has_xwhiteouts(dentry);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 4ab66e3d4cff..2eef6c70b2ae 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1249,6 +1249,7 @@ static struct dentry *ovl_get_root(struct super_block *sb,
struct ovl_entry *oe)
{
struct dentry *root;
+ struct ovl_fs *ofs = OVL_FS(sb);
struct ovl_path *lowerpath = ovl_lowerstack(oe);
unsigned long ino = d_inode(lowerpath->dentry)->i_ino;
int fsid = lowerpath->layer->fsid;
@@ -1270,6 +1271,20 @@ static struct dentry *ovl_get_root(struct super_block *sb,
ovl_set_flag(OVL_IMPURE, d_inode(root));
}
+ /* Look for xwhiteouts marker except in the lowermost layer */
+ for (int i = 0; i < ovl_numlower(oe) - 1; i++, lowerpath++) {
+ struct path path = {
+ .mnt = lowerpath->layer->mnt,
+ .dentry = lowerpath->dentry,
+ };
+
+ /* overlay.opaque=x means xwhiteouts directory */
+ if (ovl_get_opaquedir_val(ofs, &path) == 'x') {
+ ovl_layer_set_xwhiteouts(ofs, lowerpath->layer);
+ ovl_dentry_set_xwhiteouts(root);
+ }
+ }
+
/* Root is always merge -> can have whiteouts */
ovl_set_flag(OVL_WHITEOUTS, d_inode(root));
ovl_dentry_set_flag(OVL_E_CONNECTED, root);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 0217094c23ea..a8e17f14d7a2 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -461,6 +461,33 @@ void ovl_dentry_set_opaque(struct dentry *dentry)
ovl_dentry_set_flag(OVL_E_OPAQUE, dentry);
}
+bool ovl_dentry_has_xwhiteouts(struct dentry *dentry)
+{
+ return ovl_dentry_test_flag(OVL_E_XWHITEOUTS, dentry);
+}
+
+void ovl_dentry_set_xwhiteouts(struct dentry *dentry)
+{
+ ovl_dentry_set_flag(OVL_E_XWHITEOUTS, dentry);
+}
+
+/*
+ * ovl_layer_set_xwhiteouts() is called before adding the overlay dir
+ * dentry to dcache, while readdir of that same directory happens after
+ * the overlay dir dentry is in dcache, so if some cpu observes that
+ * ovl_dentry_is_xwhiteouts(), it will also observe layer->has_xwhiteouts
+ * for the layers where xwhiteouts marker was found in that merge dir.
+ */
+void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs,
+ const struct ovl_layer *layer)
+{
+ if (layer->has_xwhiteouts)
+ return;
+
+ /* Write once to read-mostly layer properties */
+ ofs->layers[layer->idx].has_xwhiteouts = true;
+}
+
/*
* For hard links and decoded file handles, it's possible for ovl_dentry_upper()
* to return positive, while there's no actual upper alias for the inode.
@@ -739,19 +766,6 @@ bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path)
return res >= 0;
}
-bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path)
-{
- struct dentry *dentry = path->dentry;
- int res;
-
- /* xattr.whiteouts must be a directory */
- if (!d_is_dir(dentry))
- return false;
-
- res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0);
- return res >= 0;
-}
-
/*
* Load persistent uuid from xattr into s_uuid if found, or store a new
* random generated value in s_uuid and in xattr.
@@ -811,20 +825,17 @@ fail:
return false;
}
-bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
- enum ovl_xattr ox)
+char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path,
+ enum ovl_xattr ox)
{
int res;
char val;
if (!d_is_dir(path->dentry))
- return false;
+ return 0;
res = ovl_path_getxattr(ofs, path, ox, &val, 1);
- if (res == 1 && val == 'y')
- return true;
-
- return false;
+ return res == 1 ? val : 0;
}
#define OVL_XATTR_OPAQUE_POSTFIX "opaque"
@@ -837,7 +848,6 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
#define OVL_XATTR_METACOPY_POSTFIX "metacopy"
#define OVL_XATTR_PROTATTR_POSTFIX "protattr"
#define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout"
-#define OVL_XATTR_XWHITEOUTS_POSTFIX "whiteouts"
#define OVL_XATTR_TAB_ENTRY(x) \
[x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \
@@ -854,7 +864,6 @@ const char *const ovl_xattr_table[][2] = {
OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT),
- OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS),
};
int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index d64a306a414b..971892620504 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -151,7 +151,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
return -EOPNOTSUPP;
ses = tcon->ses;
- server = ses->server;
+ server = cifs_pick_channel(ses);
cfids = tcon->cfids;
if (!server->ops->new_lease_key)
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index 60027f5aebe8..3e4209f41c18 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -659,6 +659,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
spin_lock(&tcon->stat_lock);
tcon->bytes_read = 0;
tcon->bytes_written = 0;
+ tcon->stats_from_time = ktime_get_real_seconds();
spin_unlock(&tcon->stat_lock);
if (server->ops->clear_stats)
server->ops->clear_stats(tcon);
@@ -737,8 +738,9 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\n%d) %s", i, tcon->tree_name);
if (tcon->need_reconnect)
seq_puts(m, "\tDISCONNECTED ");
- seq_printf(m, "\nSMBs: %d",
- atomic_read(&tcon->num_smbs_sent));
+ seq_printf(m, "\nSMBs: %d since %ptTs UTC",
+ atomic_read(&tcon->num_smbs_sent),
+ &tcon->stats_from_time);
if (server->ops->print_stats)
server->ops->print_stats(m, tcon);
}
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 99b0ade833aa..e902de4e475a 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -430,7 +430,7 @@ static void
cifs_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_state & I_PINNING_FSCACHE_WB)
+ if (inode->i_state & I_PINNING_NETFS_WB)
cifs_fscache_unuse_inode_cookie(inode, true);
cifs_fscache_release_inode_cookie(inode);
clear_inode(inode);
@@ -681,6 +681,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",rasize=%u", cifs_sb->ctx->rasize);
if (tcon->ses->server->min_offload)
seq_printf(s, ",esize=%u", tcon->ses->server->min_offload);
+ if (tcon->ses->server->retrans)
+ seq_printf(s, ",retrans=%u", tcon->ses->server->retrans);
seq_printf(s, ",echo_interval=%lu",
tcon->ses->server->echo_interval / HZ);
@@ -793,8 +795,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)
static int cifs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- fscache_unpin_writeback(wbc, cifs_inode_cookie(inode));
- return 0;
+ return netfs_unpin_writeback(inode, wbc);
}
static int cifs_drop_inode(struct inode *inode)
@@ -1222,7 +1223,7 @@ static int cifs_precopy_set_eof(struct inode *src_inode, struct cifsInodeInfo *s
if (rc < 0)
goto set_failed;
- netfs_resize_file(&src_cifsi->netfs, src_end);
+ netfs_resize_file(&src_cifsi->netfs, src_end, true);
fscache_resize_cookie(cifs_inode_cookie(src_inode), src_end);
return 0;
@@ -1353,7 +1354,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
smb_file_src, smb_file_target, off, len, destoff);
if (rc == 0 && new_size > i_size_read(target_inode)) {
truncate_setsize(target_inode, new_size);
- netfs_resize_file(&target_cifsi->netfs, new_size);
+ netfs_resize_file(&target_cifsi->netfs, new_size, true);
fscache_resize_cookie(cifs_inode_cookie(target_inode),
new_size);
}
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 879d5ef8a66e..20036fb16cec 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -204,6 +204,8 @@ struct cifs_open_info_data {
};
} reparse;
char *symlink_target;
+ struct cifs_sid posix_owner;
+ struct cifs_sid posix_group;
union {
struct smb2_file_all_info fi;
struct smb311_posix_qinfo posix_fi;
@@ -751,6 +753,7 @@ struct TCP_Server_Info {
unsigned int max_read;
unsigned int max_write;
unsigned int min_offload;
+ unsigned int retrans;
__le16 compress_algorithm;
__u16 signing_algorithm;
__le16 cipher_type;
@@ -1207,6 +1210,7 @@ struct cifs_tcon {
__u64 bytes_read;
__u64 bytes_written;
spinlock_t stat_lock; /* protects the two fields above */
+ time64_t stats_from_time;
FILE_SYSTEM_DEVICE_INFO fsDevInfo;
FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */
FILE_SYSTEM_UNIX_INFO fsUnixInfo;
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 3052a208c6ca..bfd568f89710 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -1574,6 +1574,9 @@ static int match_server(struct TCP_Server_Info *server,
if (server->min_offload != ctx->min_offload)
return 0;
+ if (server->retrans != ctx->retrans)
+ return 0;
+
return 1;
}
@@ -1798,6 +1801,7 @@ smbd_connected:
goto out_err_crypto_release;
}
tcp_ses->min_offload = ctx->min_offload;
+ tcp_ses->retrans = ctx->retrans;
/*
* at this point we are the only ones with the pointer
* to the struct since the kernel thread not created yet
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 1b4262aff8fa..90da81d0372a 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -87,7 +87,7 @@ void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len
continue;
if (!folio_test_writeback(folio)) {
WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio_index(folio), end);
+ len, start, folio->index, end);
continue;
}
@@ -120,7 +120,7 @@ void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len
continue;
if (!folio_test_writeback(folio)) {
WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio_index(folio), end);
+ len, start, folio->index, end);
continue;
}
@@ -151,7 +151,7 @@ void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int le
xas_for_each(&xas, folio, end) {
if (!folio_test_writeback(folio)) {
WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio_index(folio), end);
+ len, start, folio->index, end);
continue;
}
@@ -2651,7 +2651,7 @@ static void cifs_extend_writeback(struct address_space *mapping,
continue;
if (xa_is_value(folio))
break;
- if (folio_index(folio) != index)
+ if (folio->index != index)
break;
if (!folio_try_get_rcu(folio)) {
xas_reset(&xas);
@@ -2899,7 +2899,7 @@ redo_folio:
goto skip_write;
}
- if (folio_mapping(folio) != mapping ||
+ if (folio->mapping != mapping ||
!folio_test_dirty(folio)) {
start += folio_size(folio);
folio_unlock(folio);
@@ -5043,27 +5043,13 @@ static void cifs_swap_deactivate(struct file *file)
/* do we need to unpin (or unlock) the file */
}
-/*
- * Mark a page as having been made dirty and thus needing writeback. We also
- * need to pin the cache object to write back to.
- */
-#ifdef CONFIG_CIFS_FSCACHE
-static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
- return fscache_dirty_folio(mapping, folio,
- cifs_inode_cookie(mapping->host));
-}
-#else
-#define cifs_dirty_folio filemap_dirty_folio
-#endif
-
const struct address_space_operations cifs_addr_ops = {
.read_folio = cifs_read_folio,
.readahead = cifs_readahead,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
- .dirty_folio = cifs_dirty_folio,
+ .dirty_folio = netfs_dirty_folio,
.release_folio = cifs_release_folio,
.direct_IO = cifs_direct_io,
.invalidate_folio = cifs_invalidate_folio,
@@ -5087,7 +5073,7 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
- .dirty_folio = cifs_dirty_folio,
+ .dirty_folio = netfs_dirty_folio,
.release_folio = cifs_release_folio,
.invalidate_folio = cifs_invalidate_folio,
.launder_folio = cifs_launder_folio,
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index a3493da12ad1..52cbef2eeb28 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -139,6 +139,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_u32("dir_mode", Opt_dirmode),
fsparam_u32("port", Opt_port),
fsparam_u32("min_enc_offload", Opt_min_enc_offload),
+ fsparam_u32("retrans", Opt_retrans),
fsparam_u32("esize", Opt_min_enc_offload),
fsparam_u32("bsize", Opt_blocksize),
fsparam_u32("rasize", Opt_rasize),
@@ -1064,6 +1065,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_min_enc_offload:
ctx->min_offload = result.uint_32;
break;
+ case Opt_retrans:
+ ctx->retrans = result.uint_32;
+ break;
case Opt_blocksize:
/*
* inode blocksize realistically should never need to be
@@ -1619,6 +1623,8 @@ int smb3_init_fs_context(struct fs_context *fc)
ctx->backupuid_specified = false; /* no backup intent for a user */
ctx->backupgid_specified = false; /* no backup intent for a group */
+ ctx->retrans = 1;
+
/*
* short int override_uid = -1;
* short int override_gid = -1;
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index cf46916286d0..182ce11cbe93 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -118,6 +118,7 @@ enum cifs_param {
Opt_file_mode,
Opt_dirmode,
Opt_min_enc_offload,
+ Opt_retrans,
Opt_blocksize,
Opt_rasize,
Opt_rsize,
@@ -245,6 +246,7 @@ struct smb3_fs_context {
unsigned int rsize;
unsigned int wsize;
unsigned int min_offload;
+ unsigned int retrans;
bool sockopt_tcp_nodelay:1;
/* attribute cache timemout for files and directories in jiffies */
unsigned long acregmax;
diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c
index e5cad149f5a2..c4a3cb736881 100644
--- a/fs/smb/client/fscache.c
+++ b/fs/smb/client/fscache.c
@@ -180,7 +180,7 @@ static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_
if (ret < 0)
return ret;
- ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode),
+ ret = cres.ops->prepare_write(&cres, &start, &len, len, i_size_read(inode),
no_space_allocated_yet);
if (ret == 0)
ret = fscache_write(&cres, start, &iter, NULL, NULL);
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 9f37c1758f73..f0989484f2c6 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -665,8 +665,6 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
/* Fill a cifs_fattr struct with info from POSIX info struct */
static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr,
struct cifs_open_info_data *data,
- struct cifs_sid *owner,
- struct cifs_sid *group,
struct super_block *sb)
{
struct smb311_posix_qinfo *info = &data->posix_fi;
@@ -722,8 +720,8 @@ out_reparse:
fattr->cf_symlink_target = data->symlink_target;
data->symlink_target = NULL;
}
- sid_to_id(cifs_sb, owner, fattr, SIDOWNER);
- sid_to_id(cifs_sb, group, fattr, SIDGROUP);
+ sid_to_id(cifs_sb, &data->posix_owner, fattr, SIDOWNER);
+ sid_to_id(cifs_sb, &data->posix_group, fattr, SIDGROUP);
cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n",
fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
@@ -1070,9 +1068,7 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
const unsigned int xid,
struct cifs_tcon *tcon,
const char *full_path,
- struct cifs_fattr *fattr,
- struct cifs_sid *owner,
- struct cifs_sid *group)
+ struct cifs_fattr *fattr)
{
struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -1117,7 +1113,7 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
}
if (tcon->posix_extensions)
- smb311_posix_info_to_fattr(fattr, data, owner, group, sb);
+ smb311_posix_info_to_fattr(fattr, data, sb);
else
cifs_open_info_to_fattr(fattr, data, sb);
out:
@@ -1171,8 +1167,7 @@ static int cifs_get_fattr(struct cifs_open_info_data *data,
*/
if (cifs_open_data_reparse(data)) {
rc = reparse_info_to_fattr(data, sb, xid, tcon,
- full_path, fattr,
- NULL, NULL);
+ full_path, fattr);
} else {
cifs_open_info_to_fattr(fattr, data, sb);
}
@@ -1317,10 +1312,10 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data,
const unsigned int xid)
{
struct cifs_open_info_data tmp_data = {};
+ struct TCP_Server_Info *server;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon;
struct tcon_link *tlink;
- struct cifs_sid owner, group;
int tmprc;
int rc = 0;
@@ -1328,14 +1323,14 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data,
if (IS_ERR(tlink))
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
/*
* 1. Fetch file metadata if not provided (data)
*/
if (!data) {
- rc = smb311_posix_query_path_info(xid, tcon, cifs_sb,
- full_path, &tmp_data,
- &owner, &group);
+ rc = server->ops->query_path_info(xid, tcon, cifs_sb,
+ full_path, &tmp_data);
data = &tmp_data;
}
@@ -1347,11 +1342,9 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data,
case 0:
if (cifs_open_data_reparse(data)) {
rc = reparse_info_to_fattr(data, sb, xid, tcon,
- full_path, fattr,
- &owner, &group);
+ full_path, fattr);
} else {
- smb311_posix_info_to_fattr(fattr, data,
- &owner, &group, sb);
+ smb311_posix_info_to_fattr(fattr, data, sb);
}
break;
case -EREMOTE:
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index c2137ea3c253..0748d7b757b9 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -140,6 +140,7 @@ tcon_info_alloc(bool dir_leases_enabled)
spin_lock_init(&ret_buf->stat_lock);
atomic_set(&ret_buf->num_local_opens, 0);
atomic_set(&ret_buf->num_remote_opens, 0);
+ ret_buf->stats_from_time = ktime_get_real_seconds();
#ifdef CONFIG_CIFS_DFS_UPCALL
INIT_LIST_HEAD(&ret_buf->dfs_ses_list);
#endif
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index 056cae1ddcce..94255401b38d 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -133,14 +133,14 @@ retry:
* Query dir responses don't provide enough
* information about reparse points other than
* their reparse tags. Save an invalidation by
- * not clobbering the existing mode, size and
- * symlink target (if any) when reparse tag and
- * ctime haven't changed.
+ * not clobbering some existing attributes when
+ * reparse tag and ctime haven't changed.
*/
rc = 0;
if (fattr->cf_cifsattrs & ATTR_REPARSE) {
if (likely(reparse_inode_match(inode, fattr))) {
fattr->cf_mode = inode->i_mode;
+ fattr->cf_rdev = inode->i_rdev;
fattr->cf_eof = CIFS_I(inode)->server_eof;
fattr->cf_symlink_target = NULL;
} else {
@@ -645,10 +645,10 @@ static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode)
static int is_dir_changed(struct file *file)
{
struct inode *inode = file_inode(file);
- struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
+ struct cifsInodeInfo *cifs_inode_info = CIFS_I(inode);
- if (cifsInfo->time == 0)
- return 1; /* directory was changed, perhaps due to unlink */
+ if (cifs_inode_info->time == 0)
+ return 1; /* directory was changed, e.g. unlink or new file */
else
return 0;
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 5053a5550abe..a652200540c8 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -56,6 +56,35 @@ static inline __u32 file_create_options(struct dentry *dentry)
return 0;
}
+/* Parse owner and group from SMB3.1.1 POSIX query info */
+static int parse_posix_sids(struct cifs_open_info_data *data,
+ struct kvec *rsp_iov)
+{
+ struct smb2_query_info_rsp *qi = rsp_iov->iov_base;
+ unsigned int out_len = le32_to_cpu(qi->OutputBufferLength);
+ unsigned int qi_len = sizeof(data->posix_fi);
+ int owner_len, group_len;
+ u8 *sidsbuf, *sidsbuf_end;
+
+ if (out_len <= qi_len)
+ return -EINVAL;
+
+ sidsbuf = (u8 *)qi + le16_to_cpu(qi->OutputBufferOffset) + qi_len;
+ sidsbuf_end = sidsbuf + out_len - qi_len;
+
+ owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end);
+ if (owner_len == -1)
+ return -EINVAL;
+
+ memcpy(&data->posix_owner, sidsbuf, owner_len);
+ group_len = posix_info_sid_size(sidsbuf + owner_len, sidsbuf_end);
+ if (group_len == -1)
+ return -EINVAL;
+
+ memcpy(&data->posix_group, sidsbuf + owner_len, group_len);
+ return 0;
+}
+
/*
* note: If cfile is passed, the reference to it is dropped here.
* So make sure that you do not reuse cfile after return from this func.
@@ -69,7 +98,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
__u32 desired_access, __u32 create_disposition,
__u32 create_options, umode_t mode, struct kvec *in_iov,
int *cmds, int num_cmds, struct cifsFileInfo *cfile,
- __u8 **extbuf, size_t *extbuflen,
struct kvec *out_iov, int *out_buftype)
{
@@ -494,21 +522,9 @@ finished:
&rsp_iov[i + 1], sizeof(idata->posix_fi) /* add SIDs */,
(char *)&idata->posix_fi);
}
- if (rc == 0) {
- unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength);
-
- if (length > sizeof(idata->posix_fi)) {
- char *base = (char *)rsp_iov[i + 1].iov_base +
- le16_to_cpu(qi_rsp->OutputBufferOffset) +
- sizeof(idata->posix_fi);
- *extbuflen = length - sizeof(idata->posix_fi);
- *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL);
- if (!*extbuf)
- rc = -ENOMEM;
- } else {
- rc = -EINVAL;
- }
- }
+ if (rc == 0)
+ rc = parse_posix_sids(idata, &rsp_iov[i + 1]);
+
SMB2_query_info_free(&rqst[num_rqst++]);
if (rc)
trace_smb3_posix_query_info_compound_err(xid, ses->Suid,
@@ -662,7 +678,7 @@ int smb2_query_path_info(const unsigned int xid,
struct smb2_hdr *hdr;
struct kvec in_iov[2], out_iov[3] = {};
int out_buftype[3] = {};
- int cmds[2] = { SMB2_OP_QUERY_INFO, };
+ int cmds[2];
bool islink;
int i, num_cmds;
int rc, rc2;
@@ -670,20 +686,36 @@ int smb2_query_path_info(const unsigned int xid,
data->adjust_tz = false;
data->reparse_point = false;
- if (strcmp(full_path, ""))
- rc = -ENOENT;
- else
- rc = open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid);
- /* If it is a root and its handle is cached then use it */
- if (!rc) {
- if (cfid->file_all_info_is_valid) {
- memcpy(&data->fi, &cfid->file_all_info, sizeof(data->fi));
+ /*
+ * BB TODO: Add support for using cached root handle in SMB3.1.1 POSIX.
+ * Create SMB2_query_posix_info worker function to do non-compounded
+ * query when we already have an open file handle for this. For now this
+ * is fast enough (always using the compounded version).
+ */
+ if (!tcon->posix_extensions) {
+ if (*full_path) {
+ rc = -ENOENT;
} else {
- rc = SMB2_query_info(xid, tcon, cfid->fid.persistent_fid,
- cfid->fid.volatile_fid, &data->fi);
+ rc = open_cached_dir(xid, tcon, full_path,
+ cifs_sb, false, &cfid);
+ }
+ /* If it is a root and its handle is cached then use it */
+ if (!rc) {
+ if (cfid->file_all_info_is_valid) {
+ memcpy(&data->fi, &cfid->file_all_info,
+ sizeof(data->fi));
+ } else {
+ rc = SMB2_query_info(xid, tcon,
+ cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid,
+ &data->fi);
+ }
+ close_cached_dir(cfid);
+ return rc;
}
- close_cached_dir(cfid);
- return rc;
+ cmds[0] = SMB2_OP_QUERY_INFO;
+ } else {
+ cmds[0] = SMB2_OP_POSIX_QUERY_INFO;
}
in_iov[0].iov_base = data;
@@ -693,9 +725,8 @@ int smb2_query_path_info(const unsigned int xid,
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE,
- in_iov, cmds, 1, cfile,
- NULL, NULL, out_iov, out_buftype);
+ create_options, ACL_NO_MODE, in_iov,
+ cmds, 1, cfile, out_iov, out_buftype);
hdr = out_iov[0].iov_base;
/*
* If first iov is unset, then SMB session was dropped or we've got a
@@ -707,6 +738,10 @@ int smb2_query_path_info(const unsigned int xid,
switch (rc) {
case 0:
case -EOPNOTSUPP:
+ /*
+ * BB TODO: When support for special files added to Samba
+ * re-verify this path.
+ */
rc = parse_create_response(data, cifs_sb, &out_iov[0]);
if (rc || !data->reparse_point)
goto out;
@@ -722,8 +757,8 @@ int smb2_query_path_info(const unsigned int xid,
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, in_iov, cmds,
- num_cmds, cfile, NULL, NULL, NULL, NULL);
+ create_options, ACL_NO_MODE, in_iov,
+ cmds, num_cmds, cfile, NULL, NULL);
break;
case -EREMOTE:
break;
@@ -746,101 +781,6 @@ out:
return rc;
}
-int smb311_posix_query_path_info(const unsigned int xid,
- struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct cifs_open_info_data *data,
- struct cifs_sid *owner,
- struct cifs_sid *group)
-{
- int rc;
- __u32 create_options = 0;
- struct cifsFileInfo *cfile;
- struct kvec in_iov[2], out_iov[3] = {};
- int out_buftype[3] = {};
- __u8 *sidsbuf = NULL;
- __u8 *sidsbuf_end = NULL;
- size_t sidsbuflen = 0;
- size_t owner_len, group_len;
- int cmds[2] = { SMB2_OP_POSIX_QUERY_INFO, };
- int i, num_cmds;
-
- data->adjust_tz = false;
- data->reparse_point = false;
-
- /*
- * BB TODO: Add support for using the cached root handle.
- * Create SMB2_query_posix_info worker function to do non-compounded query
- * when we already have an open file handle for this. For now this is fast enough
- * (always using the compounded version).
- */
- in_iov[0].iov_base = data;
- in_iov[0].iov_len = sizeof(*data);
- in_iov[1] = in_iov[0];
-
- cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, in_iov, cmds, 1,
- cfile, &sidsbuf, &sidsbuflen, out_iov, out_buftype);
- /*
- * If first iov is unset, then SMB session was dropped or we've got a
- * cached open file (@cfile).
- */
- if (!out_iov[0].iov_base || out_buftype[0] == CIFS_NO_BUFFER)
- goto out;
-
- switch (rc) {
- case 0:
- case -EOPNOTSUPP:
- /* BB TODO: When support for special files added to Samba re-verify this path */
- rc = parse_create_response(data, cifs_sb, &out_iov[0]);
- if (rc || !data->reparse_point)
- goto out;
-
- if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK) {
- /* symlink already parsed in create response */
- num_cmds = 1;
- } else {
- cmds[1] = SMB2_OP_GET_REPARSE;
- num_cmds = 2;
- }
- create_options |= OPEN_REPARSE_POINT;
- cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, in_iov, cmds,
- num_cmds, cfile, &sidsbuf, &sidsbuflen, NULL, NULL);
- break;
- }
-
-out:
- if (rc == 0) {
- sidsbuf_end = sidsbuf + sidsbuflen;
-
- owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end);
- if (owner_len == -1) {
- rc = -EINVAL;
- goto out;
- }
- memcpy(owner, sidsbuf, owner_len);
-
- group_len = posix_info_sid_size(
- sidsbuf + owner_len, sidsbuf_end);
- if (group_len == -1) {
- rc = -EINVAL;
- goto out;
- }
- memcpy(group, sidsbuf + owner_len, group_len);
- }
-
- kfree(sidsbuf);
- for (i = 0; i < ARRAY_SIZE(out_buftype); i++)
- free_rsp_buf(out_buftype[i], out_iov[i].iov_base);
- return rc;
-}
-
int
smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
struct cifs_tcon *tcon, const char *name,
@@ -848,9 +788,9 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
{
return smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
- CREATE_NOT_FILE, mode, NULL,
- &(int){SMB2_OP_MKDIR}, 1,
- NULL, NULL, NULL, NULL, NULL);
+ CREATE_NOT_FILE, mode,
+ NULL, &(int){SMB2_OP_MKDIR}, 1,
+ NULL, NULL, NULL);
}
void
@@ -875,7 +815,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, ACL_NO_MODE, &in_iov,
&(int){SMB2_OP_SET_INFO}, 1,
- cfile, NULL, NULL, NULL, NULL);
+ cfile, NULL, NULL);
if (tmprc == 0)
cifs_i->cifsAttrs = dosattrs;
}
@@ -887,8 +827,9 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
drop_cached_dir_by_name(xid, tcon, name, cifs_sb);
return smb2_compound_op(xid, tcon, cifs_sb, name,
DELETE, FILE_OPEN, CREATE_NOT_FILE,
- ACL_NO_MODE, NULL, &(int){SMB2_OP_RMDIR}, 1,
- NULL, NULL, NULL, NULL, NULL);
+ ACL_NO_MODE, NULL,
+ &(int){SMB2_OP_RMDIR}, 1,
+ NULL, NULL, NULL);
}
int
@@ -897,8 +838,9 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
{
return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- ACL_NO_MODE, NULL, &(int){SMB2_OP_DELETE}, 1,
- NULL, NULL, NULL, NULL, NULL);
+ ACL_NO_MODE, NULL,
+ &(int){SMB2_OP_DELETE}, 1,
+ NULL, NULL, NULL);
}
static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
@@ -919,8 +861,8 @@ static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
in_iov.iov_base = smb2_to_name;
in_iov.iov_len = 2 * UniStrnlen((wchar_t *)smb2_to_name, PATH_MAX);
rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
- FILE_OPEN, create_options, ACL_NO_MODE, &in_iov,
- &command, 1, cfile, NULL, NULL, NULL, NULL);
+ FILE_OPEN, create_options, ACL_NO_MODE,
+ &in_iov, &command, 1, cfile, NULL, NULL);
smb2_rename_path:
kfree(smb2_to_name);
return rc;
@@ -971,7 +913,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
FILE_WRITE_DATA, FILE_OPEN,
0, ACL_NO_MODE, &in_iov,
&(int){SMB2_OP_SET_EOF}, 1,
- cfile, NULL, NULL, NULL, NULL);
+ cfile, NULL, NULL);
}
int
@@ -999,8 +941,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_WRITE_ATTRIBUTES, FILE_OPEN,
0, ACL_NO_MODE, &in_iov,
- &(int){SMB2_OP_SET_INFO}, 1, cfile,
- NULL, NULL, NULL, NULL);
+ &(int){SMB2_OP_SET_INFO}, 1,
+ cfile, NULL, NULL);
cifs_put_tlink(tlink);
return rc;
}
@@ -1035,7 +977,7 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
da, cd, co, ACL_NO_MODE, in_iov,
- cmds, 2, cfile, NULL, NULL, NULL, NULL);
+ cmds, 2, cfile, NULL, NULL);
if (!rc) {
rc = smb311_posix_get_inode_info(&new, full_path,
data, sb, xid);
@@ -1045,7 +987,7 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
da, cd, co, ACL_NO_MODE, in_iov,
- cmds, 2, cfile, NULL, NULL, NULL, NULL);
+ cmds, 2, cfile, NULL, NULL);
if (!rc) {
rc = cifs_get_inode_info(&new, full_path,
data, sb, xid, NULL);
@@ -1072,8 +1014,8 @@ int smb2_query_reparse_point(const unsigned int xid,
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_READ_ATTRIBUTES, FILE_OPEN,
OPEN_REPARSE_POINT, ACL_NO_MODE, &in_iov,
- &(int){SMB2_OP_GET_REPARSE}, 1, cfile,
- NULL, NULL, NULL, NULL);
+ &(int){SMB2_OP_GET_REPARSE}, 1,
+ cfile, NULL, NULL);
if (rc)
goto out;
diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c
index 1a90dd78b238..ac1895358908 100644
--- a/fs/smb/client/smb2maperror.c
+++ b/fs/smb/client/smb2maperror.c
@@ -1210,6 +1210,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_INVALID_TASK_INDEX, -EIO, "STATUS_INVALID_TASK_INDEX"},
{STATUS_THREAD_ALREADY_IN_TASK, -EIO, "STATUS_THREAD_ALREADY_IN_TASK"},
{STATUS_CALLBACK_BYPASS, -EIO, "STATUS_CALLBACK_BYPASS"},
+ {STATUS_SERVER_UNAVAILABLE, -EAGAIN, "STATUS_SERVER_UNAVAILABLE"},
+ {STATUS_FILE_NOT_AVAILABLE, -EAGAIN, "STATUS_FILE_NOT_AVAILABLE"},
{STATUS_PORT_CLOSED, -EIO, "STATUS_PORT_CLOSED"},
{STATUS_MESSAGE_LOST, -EIO, "STATUS_MESSAGE_LOST"},
{STATUS_INVALID_MESSAGE, -EIO, "STATUS_INVALID_MESSAGE"},
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 01a5bd7e6a30..d9553c2556a2 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -614,7 +614,8 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
"multichannel not available\n"
"Empty network interface list returned by server %s\n",
ses->server->hostname);
- rc = -EINVAL;
+ rc = -EOPNOTSUPP;
+ ses->iface_last_update = jiffies;
goto out;
}
@@ -712,7 +713,6 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
ses->iface_count++;
spin_unlock(&ses->iface_lock);
- ses->iface_last_update = jiffies;
next_iface:
nb_iface++;
next = le32_to_cpu(p->Next);
@@ -734,11 +734,7 @@ next_iface:
if ((bytes_left > 8) || p->Next)
cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
-
- if (!ses->iface_count) {
- rc = -EINVAL;
- goto out;
- }
+ ses->iface_last_update = jiffies;
out:
/*
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index bd25c34dc398..288199f0b987 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -156,6 +156,57 @@ out:
return;
}
+/* helper function for code reuse */
+static int
+cifs_chan_skip_or_disable(struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
+ bool from_reconnect)
+{
+ struct TCP_Server_Info *pserver;
+ unsigned int chan_index;
+
+ if (SERVER_IS_CHAN(server)) {
+ cifs_dbg(VFS,
+ "server %s does not support multichannel anymore. Skip secondary channel\n",
+ ses->server->hostname);
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ goto skip_terminate;
+ }
+
+ ses->chans[chan_index].server = NULL;
+ spin_unlock(&ses->chan_lock);
+
+ /*
+ * the above reference of server by channel
+ * needs to be dropped without holding chan_lock
+ * as cifs_put_tcp_session takes a higher lock
+ * i.e. cifs_tcp_ses_lock
+ */
+ cifs_put_tcp_session(server, from_reconnect);
+
+ server->terminate = true;
+ cifs_signal_cifsd_for_reconnect(server, false);
+
+ /* mark primary server as needing reconnect */
+ pserver = server->primary_server;
+ cifs_signal_cifsd_for_reconnect(pserver, false);
+skip_terminate:
+ mutex_unlock(&ses->session_mutex);
+ return -EHOSTDOWN;
+ }
+
+ cifs_server_dbg(VFS,
+ "server does not support multichannel anymore. Disable all other channels\n");
+ cifs_disable_secondary_channels(ses);
+
+
+ return 0;
+}
+
static int
smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
struct TCP_Server_Info *server, bool from_reconnect)
@@ -164,8 +215,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
struct nls_table *nls_codepage = NULL;
struct cifs_ses *ses;
int xid;
- struct TCP_Server_Info *pserver;
- unsigned int chan_index;
/*
* SMB2s NegProt, SessSetup, Logoff do not have tcon yet so
@@ -310,44 +359,11 @@ again:
*/
if (ses->chan_count > 1 &&
!(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
- if (SERVER_IS_CHAN(server)) {
- cifs_dbg(VFS, "server %s does not support " \
- "multichannel anymore. skipping secondary channel\n",
- ses->server->hostname);
-
- spin_lock(&ses->chan_lock);
- chan_index = cifs_ses_get_chan_index(ses, server);
- if (chan_index == CIFS_INVAL_CHAN_INDEX) {
- spin_unlock(&ses->chan_lock);
- goto skip_terminate;
- }
-
- ses->chans[chan_index].server = NULL;
- spin_unlock(&ses->chan_lock);
-
- /*
- * the above reference of server by channel
- * needs to be dropped without holding chan_lock
- * as cifs_put_tcp_session takes a higher lock
- * i.e. cifs_tcp_ses_lock
- */
- cifs_put_tcp_session(server, from_reconnect);
-
- server->terminate = true;
- cifs_signal_cifsd_for_reconnect(server, false);
-
- /* mark primary server as needing reconnect */
- pserver = server->primary_server;
- cifs_signal_cifsd_for_reconnect(pserver, false);
-
-skip_terminate:
+ rc = cifs_chan_skip_or_disable(ses, server,
+ from_reconnect);
+ if (rc) {
mutex_unlock(&ses->session_mutex);
- rc = -EHOSTDOWN;
goto out;
- } else {
- cifs_server_dbg(VFS, "does not support " \
- "multichannel anymore. disabling all other channels\n");
- cifs_disable_secondary_channels(ses);
}
}
@@ -395,20 +411,35 @@ skip_sess_setup:
rc = SMB3_request_interfaces(xid, tcon, false);
free_xid(xid);
- if (rc)
+ if (rc == -EOPNOTSUPP) {
+ /*
+ * some servers like Azure SMB server do not advertise
+ * that multichannel has been disabled with server
+ * capabilities, rather return STATUS_NOT_IMPLEMENTED.
+ * treat this as server not supporting multichannel
+ */
+
+ rc = cifs_chan_skip_or_disable(ses, server,
+ from_reconnect);
+ goto skip_add_channels;
+ } else if (rc)
cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
__func__, rc);
if (ses->chan_max > ses->chan_count &&
+ ses->iface_count &&
!SERVER_IS_CHAN(server)) {
if (ses->chan_count == 1)
cifs_server_dbg(VFS, "supports multichannel now\n");
cifs_try_adding_channels(ses);
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
}
} else {
mutex_unlock(&ses->session_mutex);
}
+skip_add_channels:
if (smb2_command != SMB2_INTERNAL_CMD)
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
@@ -1958,10 +1989,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
__le16 *unc_path = NULL;
int flags = 0;
unsigned int total_len;
- struct TCP_Server_Info *server;
-
- /* always use master channel */
- server = ses->server;
+ struct TCP_Server_Info *server = cifs_pick_channel(ses);
cifs_dbg(FYI, "TCON\n");
@@ -2094,6 +2122,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
struct smb2_tree_disconnect_req *req; /* response is trivial */
int rc = 0;
struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *server = cifs_pick_channel(ses);
int flags = 0;
unsigned int total_len;
struct kvec iov[1];
@@ -2116,7 +2145,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
invalidate_all_cached_dirs(tcon);
- rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server,
+ rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, server,
(void **) &req,
&total_len);
if (rc)
@@ -2134,7 +2163,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
rqst.rq_iov = iov;
rqst.rq_nvec = 1;
- rc = cifs_send_recv(xid, ses, ses->server,
+ rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
if (rc) {
@@ -2279,7 +2308,7 @@ int smb2_parse_contexts(struct TCP_Server_Info *server,
noff = le16_to_cpu(cc->NameOffset);
nlen = le16_to_cpu(cc->NameLength);
- if (noff + nlen >= doff)
+ if (noff + nlen > doff)
return -EINVAL;
name = (char *)cc + noff;
@@ -3918,7 +3947,7 @@ void smb2_reconnect_server(struct work_struct *work)
struct cifs_ses *ses, *ses2;
struct cifs_tcon *tcon, *tcon2;
struct list_head tmp_list, tmp_ses_list;
- bool tcon_exist = false, ses_exist = false;
+ bool ses_exist = false;
bool tcon_selected = false;
int rc;
bool resched = false;
@@ -3964,7 +3993,7 @@ void smb2_reconnect_server(struct work_struct *work)
if (tcon->need_reconnect || tcon->need_reopen_files) {
tcon->tc_count++;
list_add_tail(&tcon->rlist, &tmp_list);
- tcon_selected = tcon_exist = true;
+ tcon_selected = true;
}
}
/*
@@ -3973,7 +4002,7 @@ void smb2_reconnect_server(struct work_struct *work)
*/
if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) {
list_add_tail(&ses->tcon_ipc->rlist, &tmp_list);
- tcon_selected = tcon_exist = true;
+ tcon_selected = true;
cifs_smb_ses_inc_refcount(ses);
}
/*
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 343ada691e76..0034b537b0b3 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -299,9 +299,7 @@ int smb311_posix_query_path_info(const unsigned int xid,
struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const char *full_path,
- struct cifs_open_info_data *data,
- struct cifs_sid *owner,
- struct cifs_sid *group);
+ struct cifs_open_info_data *data);
int posix_info_parse(const void *beg, const void *end,
struct smb2_posix_info_parsed *out);
int posix_info_sid_size(const void *beg, const void *end);
diff --git a/fs/smb/client/smb2status.h b/fs/smb/client/smb2status.h
index a9e958166fc5..9c6d79b0bd49 100644
--- a/fs/smb/client/smb2status.h
+++ b/fs/smb/client/smb2status.h
@@ -982,6 +982,8 @@ struct ntstatus {
#define STATUS_INVALID_TASK_INDEX cpu_to_le32(0xC0000501)
#define STATUS_THREAD_ALREADY_IN_TASK cpu_to_le32(0xC0000502)
#define STATUS_CALLBACK_BYPASS cpu_to_le32(0xC0000503)
+#define STATUS_SERVER_UNAVAILABLE cpu_to_le32(0xC0000466)
+#define STATUS_FILE_NOT_AVAILABLE cpu_to_le32(0xC0000467)
#define STATUS_PORT_CLOSED cpu_to_le32(0xC0000700)
#define STATUS_MESSAGE_LOST cpu_to_le32(0xC0000701)
#define STATUS_INVALID_MESSAGE cpu_to_le32(0xC0000702)
diff --git a/fs/smb/server/asn1.c b/fs/smb/server/asn1.c
index 4a4b2b03ff33..b931a99ab9c8 100644
--- a/fs/smb/server/asn1.c
+++ b/fs/smb/server/asn1.c
@@ -214,10 +214,15 @@ static int ksmbd_neg_token_alloc(void *context, size_t hdrlen,
{
struct ksmbd_conn *conn = context;
+ if (!vlen)
+ return -EINVAL;
+
conn->mechToken = kmemdup_nul(value, vlen, GFP_KERNEL);
if (!conn->mechToken)
return -ENOMEM;
+ conn->mechTokenLen = (unsigned int)vlen;
+
return 0;
}
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index d311c2ee10bd..09e1e7771592 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -416,13 +416,7 @@ static void stop_sessions(void)
again:
down_read(&conn_list_lock);
list_for_each_entry(conn, &conn_list, conns_list) {
- struct task_struct *task;
-
t = conn->transport;
- task = t->handler;
- if (task)
- ksmbd_debug(CONN, "Stop session handler %s/%d\n",
- task->comm, task_pid_nr(task));
ksmbd_conn_set_exiting(conn);
if (t->ops->shutdown) {
up_read(&conn_list_lock);
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 3c005246a32e..0e04cf8b1d89 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -88,6 +88,7 @@ struct ksmbd_conn {
__u16 dialect;
char *mechToken;
+ unsigned int mechTokenLen;
struct ksmbd_conn_ops *conn_ops;
@@ -134,7 +135,6 @@ struct ksmbd_transport_ops {
struct ksmbd_transport {
struct ksmbd_conn *conn;
struct ksmbd_transport_ops *ops;
- struct task_struct *handler;
};
#define KSMBD_TCP_RECV_TIMEOUT (7 * HZ)
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 001926d3b348..53dfaac425c6 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -1197,6 +1197,12 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid,
bool prev_op_has_lease;
__le32 prev_op_state = 0;
+ /* Only v2 leases handle the directory */
+ if (S_ISDIR(file_inode(fp->filp)->i_mode)) {
+ if (!lctx || lctx->version != 2)
+ return 0;
+ }
+
opinfo = alloc_opinfo(work, pid, tid);
if (!opinfo)
return -ENOMEM;
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 3143819935dc..ba7a72a6a4f4 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -1414,7 +1414,10 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn,
char *name;
unsigned int name_off, name_len, secbuf_len;
- secbuf_len = le16_to_cpu(req->SecurityBufferLength);
+ if (conn->use_spnego && conn->mechToken)
+ secbuf_len = conn->mechTokenLen;
+ else
+ secbuf_len = le16_to_cpu(req->SecurityBufferLength);
if (secbuf_len < sizeof(struct authenticate_message)) {
ksmbd_debug(SMB, "blob len %d too small\n", secbuf_len);
return NULL;
@@ -1505,7 +1508,10 @@ static int ntlm_authenticate(struct ksmbd_work *work,
struct authenticate_message *authblob;
authblob = user_authblob(conn, req);
- sz = le16_to_cpu(req->SecurityBufferLength);
+ if (conn->use_spnego && conn->mechToken)
+ sz = conn->mechTokenLen;
+ else
+ sz = le16_to_cpu(req->SecurityBufferLength);
rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, conn, sess);
if (rc) {
set_user_flag(sess->user, KSMBD_USER_FLAG_BAD_PASSWORD);
@@ -1778,8 +1784,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
negblob_off = le16_to_cpu(req->SecurityBufferOffset);
negblob_len = le16_to_cpu(req->SecurityBufferLength);
- if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer) ||
- negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) {
+ if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer)) {
rc = -EINVAL;
goto out_err;
}
@@ -1788,8 +1793,15 @@ int smb2_sess_setup(struct ksmbd_work *work)
negblob_off);
if (decode_negotiation_token(conn, negblob, negblob_len) == 0) {
- if (conn->mechToken)
+ if (conn->mechToken) {
negblob = (struct negotiate_message *)conn->mechToken;
+ negblob_len = conn->mechTokenLen;
+ }
+ }
+
+ if (negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) {
+ rc = -EINVAL;
+ goto out_err;
}
if (server_conf.auth_mechs & conn->auth_mechs) {
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index c5629a68c8b7..8faa25c6e129 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -2039,6 +2039,7 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs)
static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
{
struct smb_direct_transport *t;
+ struct task_struct *handler;
int ret;
if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) {
@@ -2056,11 +2057,11 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
if (ret)
goto out_err;
- KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop,
- KSMBD_TRANS(t)->conn, "ksmbd:r%u",
- smb_direct_port);
- if (IS_ERR(KSMBD_TRANS(t)->handler)) {
- ret = PTR_ERR(KSMBD_TRANS(t)->handler);
+ handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn, "ksmbd:r%u",
+ smb_direct_port);
+ if (IS_ERR(handler)) {
+ ret = PTR_ERR(handler);
pr_err("Can't start thread\n");
goto out_err;
}
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index eff7a1d793f0..9d4222154dcc 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -185,6 +185,7 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk)
struct sockaddr *csin;
int rc = 0;
struct tcp_transport *t;
+ struct task_struct *handler;
t = alloc_transport(client_sk);
if (!t) {
@@ -199,13 +200,13 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk)
goto out_error;
}
- KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop,
- KSMBD_TRANS(t)->conn,
- "ksmbd:%u",
- ksmbd_tcp_get_port(csin));
- if (IS_ERR(KSMBD_TRANS(t)->handler)) {
+ handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn,
+ "ksmbd:%u",
+ ksmbd_tcp_get_port(csin));
+ if (IS_ERR(handler)) {
pr_err("cannot start conn thread\n");
- rc = PTR_ERR(KSMBD_TRANS(t)->handler);
+ rc = PTR_ERR(handler);
free_transport(t);
}
return rc;
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 6795fda2af19..6b211522a13e 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -34,7 +34,15 @@ static DEFINE_MUTEX(eventfs_mutex);
/* Choose something "unique" ;-) */
#define EVENTFS_FILE_INODE_INO 0x12c4e37
-#define EVENTFS_DIR_INODE_INO 0x134b2f5
+
+/* Just try to make something consistent and unique */
+static int eventfs_dir_ino(struct eventfs_inode *ei)
+{
+ if (!ei->ino)
+ ei->ino = get_next_ino();
+
+ return ei->ino;
+}
/*
* The eventfs_inode (ei) itself is protected by SRCU. It is released from
@@ -396,7 +404,7 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent
inode->i_fop = &eventfs_file_operations;
/* All directories will have the same inode number */
- inode->i_ino = EVENTFS_DIR_INODE_INO;
+ inode->i_ino = eventfs_dir_ino(ei);
ti = get_tracefs(inode);
ti->flags |= TRACEFS_EVENT_INODE;
@@ -802,7 +810,7 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx)
name = ei_child->name;
- ino = EVENTFS_DIR_INODE_INO;
+ ino = eventfs_dir_ino(ei_child);
if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR))
goto out_dec;
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 12b7d0150ae9..45397df9bb65 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -55,6 +55,10 @@ struct eventfs_inode {
struct eventfs_attr *entry_attrs;
struct eventfs_attr attr;
void *data;
+ unsigned int is_freed:1;
+ unsigned int is_events:1;
+ unsigned int nr_entries:30;
+ unsigned int ino;
/*
* Union - used for deletion
* @llist: for calling dput() if needed after RCU
@@ -64,9 +68,6 @@ struct eventfs_inode {
struct llist_node llist;
struct rcu_head rcu;
};
- unsigned int is_freed:1;
- unsigned int is_events:1;
- unsigned int nr_entries:30;
};
static inline struct tracefs_inode *get_tracefs(const struct inode *inode)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 98aaca933bdd..f362345467fa 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3277,7 +3277,7 @@ xfs_bmap_alloc_account(
struct xfs_bmalloca *ap)
{
bool isrt = XFS_IS_REALTIME_INODE(ap->ip) &&
- (ap->flags & XFS_BMAPI_ATTRFORK);
+ !(ap->flags & XFS_BMAPI_ATTRFORK);
uint fld;
if (ap->flags & XFS_BMAPI_COWFORK) {