diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-08-31 19:41:22 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-08-31 19:41:22 +0300 |
commit | 87045e6546078dae215d1bd3b2bc82b3ada3ca77 (patch) | |
tree | f3d816b9834ca959514f6e399fb9f505871d2729 /fs/btrfs/tree-log.c | |
parent | 9c849ce86e0fa93a218614eac562ace44053d7ce (diff) | |
parent | 0d977e0eba234e01a60bdde27314dc21374201b3 (diff) | |
download | linux-87045e6546078dae215d1bd3b2bc82b3ada3ca77.tar.xz |
Merge tag 'for-5.15-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"The highlights of this round are integrations with fs-verity and
idmapped mounts; the rest is the usual mix of minor improvements, speedups
and cleanups.
There are some patches outside of btrfs, namely updating some VFS
interfaces, all straightforward and acked.
Features:
- fs-verity support, using standard ioctls; backward compatible, with a
read-only limitation on inodes with previously enabled fs-verity
- idmapped mount support
- make mount with rescue=ibadroots more tolerant to partially damaged
trees
- allow raid0 on a single device and raid10 on two devices,
degenerate cases but might be useful as an intermediate step during
conversion to other profiles
- zoned mode block group auto reclaim can be disabled via sysfs knob
Performance improvements:
- continue readahead of node siblings even if target node is in
memory, could speed up full send (on sample test +11%)
- batching of delayed items can speed up creating many files
- fsync/tree-log speedups
- avoid unnecessary work (gains +2% throughput, -2% run time on
sample load)
- reduced lock contention on renames (on dbench +4% throughput,
up to -30% latency)
Fixes:
- various zoned mode fixes
- preemptive flushing threshold tuning, avoid excessive work on
almost full filesystems
Core:
- continued subpage support, preparation for implementing remaining
features like compression and defragmentation; with some
limitations, write is now enabled on 64K page systems with 4K
sectors, still considered experimental
- no readahead on compressed reads
- inline extents disabled
- disabled raid56 profile conversion and mount
- improved flushing logic, fixing early ENOSPC on some workloads
- inode flags have been internally split into read-only and read-write
incompat bit parts, used by fs-verity
- new tree items for fs-verity
- descriptor item
- Merkle tree item
- inode operations extended to be namespace-aware
- cleanups and refactoring
Generic code changes:
- fs: new export filemap_fdatawrite_wbc
- fs: removed sync_inode
- block: bio_trim argument type fixups
- vfs: add namespace-aware lookup"
* tag 'for-5.15-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (114 commits)
btrfs: reset replace target device to allocation state on close
btrfs: zoned: fix ordered extent boundary calculation
btrfs: do not do preemptive flushing if the majority is global rsv
btrfs: reduce the preemptive flushing threshold to 90%
btrfs: tree-log: check btrfs_lookup_data_extent return value
btrfs: avoid unnecessarily logging directories that had no changes
btrfs: allow idmapped mount
btrfs: handle ACLs on idmapped mounts
btrfs: allow idmapped INO_LOOKUP_USER ioctl
btrfs: allow idmapped SUBVOL_SETFLAGS ioctl
btrfs: allow idmapped SET_RECEIVED_SUBVOL ioctls
btrfs: relax restrictions for SNAP_DESTROY_V2 with subvolids
btrfs: allow idmapped SNAP_DESTROY ioctls
btrfs: allow idmapped SNAP_CREATE/SUBVOL_CREATE ioctls
btrfs: check whether fsgid/fsuid are mapped during subvolume creation
btrfs: allow idmapped permission inode op
btrfs: allow idmapped setattr inode op
btrfs: allow idmapped tmpfile inode op
btrfs: allow idmapped symlink inode op
btrfs: allow idmapped mkdir inode op
...
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r-- | fs/btrfs/tree-log.c | 102 |
1 files changed, 76 insertions, 26 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e6430ac9bbe8..f7efc26aa82a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -753,7 +753,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, */ ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); - if (ret == 0) { + if (ret < 0) { + goto out; + } else if (ret == 0) { btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, ins.objectid, ins.offset, 0); @@ -3039,8 +3041,6 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, list_del_init(&ctx->list); ctx->log_ret = error; } - - INIT_LIST_HEAD(&root->log_ctxs[index]); } /* @@ -3328,10 +3328,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } - mutex_lock(&root->log_mutex); - if (root->last_log_commit < log_transid) - root->last_log_commit = log_transid; - mutex_unlock(&root->log_mutex); + /* + * We know there can only be one task here, since we have not yet set + * root->log_commit[index1] to 0 and any task attempting to sync the + * log must wait for the previous log transaction to commit if it's + * still in progress or wait for the current log transaction commit if + * someone else already started it. We use <= and not < because the + * first log transaction has an ID of 0. + */ + ASSERT(root->last_log_commit <= log_transid); + root->last_log_commit = log_transid; out_wake_log_root: mutex_lock(&log_root_tree->log_mutex); @@ -3417,14 +3423,10 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, } /* - * Check if an inode was logged in the current transaction. We can't always rely - * on an inode's logged_trans value, because it's an in-memory only field and - * therefore not persisted. 
This means that its value is lost if the inode gets - * evicted and loaded again from disk (in which case it has a value of 0, and - * certainly it is smaller then any possible transaction ID), when that happens - * the full_sync flag is set in the inode's runtime flags, so on that case we - * assume eviction happened and ignore the logged_trans value, assuming the - * worst case, that the inode was logged before in the current transaction. + * Check if an inode was logged in the current transaction. This may often + * return some false positives, because logged_trans is an in memory only field, + * not persisted anywhere. This is meant to be used in contexts where a false + * positive has no functional consequences. */ static bool inode_logged(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) @@ -3432,8 +3434,17 @@ static bool inode_logged(struct btrfs_trans_handle *trans, if (inode->logged_trans == trans->transid) return true; - if (inode->last_trans == trans->transid && - test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && + /* + * The inode's logged_trans is always 0 when we load it (because it is + * not persisted in the inode item or elsewhere). So if it is 0, the + * inode was last modified in the current transaction then the inode may + * have been logged before in the current transaction, then evicted and + * loaded again in the current transaction - or may have never been logged + * in the current transaction, but since we can not be sure, we have to + * assume it was, otherwise our callers can leave an inconsistent log. 
+ */ + if (inode->logged_trans == 0 && + inode->last_trans == trans->transid && !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) return true; @@ -3913,6 +3924,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, u64 logged_isize) { struct btrfs_map_token token; + u64 flags; btrfs_init_map_token(&token, leaf); @@ -3962,20 +3974,49 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); - btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); + flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, + BTRFS_I(inode)->ro_flags); + btrfs_set_token_inode_flags(&token, item, flags); btrfs_set_token_inode_block_group(&token, item, 0); } static int log_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, - struct btrfs_inode *inode) + struct btrfs_inode *inode, bool inode_item_dropped) { struct btrfs_inode_item *inode_item; int ret; - ret = btrfs_insert_empty_item(trans, log, path, - &inode->location, sizeof(*inode_item)); - if (ret && ret != -EEXIST) + /* + * If we are doing a fast fsync and the inode was logged before in the + * current transaction, then we know the inode was previously logged and + * it exists in the log tree. For performance reasons, in this case use + * btrfs_search_slot() directly with ins_len set to 0 so that we never + * attempt a write lock on the leaf's parent, which adds unnecessary lock + * contention in case there are concurrent fsyncs for other inodes of the + * same subvolume. Using btrfs_insert_empty_item() when the inode item + * already exists can also result in unnecessarily splitting a leaf. 
+ */ + if (!inode_item_dropped && inode->logged_trans == trans->transid) { + ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); + ASSERT(ret <= 0); + if (ret > 0) + ret = -ENOENT; + } else { + /* + * This means it is the first fsync in the current transaction, + * so the inode item is not in the log and we need to insert it. + * We can never get -EEXIST because we are only called for a fast + * fsync and in case an inode eviction happens after the inode was + * logged before in the current transaction, when we load again + * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime + * flags and set ->logged_trans to 0. + */ + ret = btrfs_insert_empty_item(trans, log, path, &inode->location, + sizeof(*inode_item)); + ASSERT(ret != -EEXIST); + } + if (ret) return ret; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); @@ -4160,7 +4201,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, static int extent_cmp(void *priv, const struct list_head *a, const struct list_head *b) { - struct extent_map *em1, *em2; + const struct extent_map *em1, *em2; em1 = list_entry(a, struct extent_map, list); em2 = list_entry(b, struct extent_map, list); @@ -5053,8 +5094,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, /* * Check the inode's logged_trans only instead of * btrfs_inode_in_log(). This is because the last_log_commit of - * the inode is not updated when we only log that it exists and - * it has the full sync bit set (see btrfs_log_inode()). + * the inode is not updated when we only log that it exists (see + * btrfs_log_inode()). 
*/ if (BTRFS_I(inode)->logged_trans == trans->transid) { spin_unlock(&BTRFS_I(inode)->lock); @@ -5299,6 +5340,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, bool need_log_inode_item = true; bool xattrs_logged = false; bool recursive_logging = false; + bool inode_item_dropped = true; path = btrfs_alloc_path(); if (!path) @@ -5433,6 +5475,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { if (inode_only == LOG_INODE_ALL) fast_search = true; + inode_item_dropped = false; goto log_extents; } @@ -5466,7 +5509,7 @@ log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (need_log_inode_item) { - err = log_inode_item(trans, log, dst_path, inode); + err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); if (err) goto out_unlock; /* @@ -5573,6 +5616,13 @@ static bool need_log_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { /* + * If a directory was not modified, no dentries added or removed, we can + * and should avoid logging it. + */ + if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) + return false; + + /* * If this inode does not have new/updated/deleted xattrs since the last * time it was logged and is flagged as logged in the current transaction, * we can skip logging it. As for new/deleted names, those are updated in |