From 07a3bb95ea1f7fa77e360cf3b0f06dff41dbf391 Mon Sep 17 00:00:00 2001
From: Julia Lawall
Date: Fri, 23 Jun 2023 23:14:43 +0200
Subject: btrfs: zoned: use vcalloc instead of vzalloc in btrfs_get_dev_zone_info

Use vcalloc, which checks for potential multiplication overflows. The
changes were done using a Coccinelle semantic patch.

Reviewed-by: Naohiro Aota
Reviewed-by: Johannes Thumshirn
Signed-off-by: Julia Lawall
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/zoned.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 72b90bc19a19..b43c4536e685 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -465,8 +465,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
	 * use the cache.
	 */
	if (populate_cache && bdev_is_zoned(device->bdev)) {
-		zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
-						zone_info->nr_zones);
+		zone_info->zone_cache = vcalloc(zone_info->nr_zones,
+						sizeof(struct blk_zone));
		if (!zone_info->zone_cache) {
			btrfs_err_in_rcu(device->fs_info,
				"zoned: failed to allocate zone cache for %s",
-- cgit v1.2.3

From cf4ac2b9049be459dfbaddbeb9322366f1ac2dc6 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Wed, 21 Jun 2023 14:09:37 +0100
Subject: btrfs: scrub: remove redundant division of stripe_nr

Variable stripe_nr is being divided by map->num_stripes; however, the
result is never read. The division and assignment are redundant and can
be removed. Cleans up clang scan build warning:

fs/btrfs/scrub.c:1264:3: warning: Value stored to 'stripe_nr' is
never read [deadcode.DeadStores]

The code is a leftover from 6ded22c1bfe6 ("btrfs: reduce div64 calls by
limiting the number of stripes of a chunk to u32") that converted div64
to normal division; the result is the same, but the previous version did
not trigger a warning.

Signed-off-by: Colin Ian King
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/scrub.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 7289f5bff397..fe1c5bc7f601 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1262,7 +1262,6 @@ static int get_raid56_logic_offset(u64 physical, int num,

	/* Work out the disk rotation on this stripe-set */
	rot = stripe_nr % map->num_stripes;
-	stripe_nr /= map->num_stripes;
	/* calculate which stripe this data locates */
	rot += i;
	stripe_index = rot % map->num_stripes;
-- cgit v1.2.3

From 966de47ff0c9e64d74e1719e4480b7c34f6190fa Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Thu, 22 Jun 2023 08:54:30 +0100
Subject: btrfs: remove redundant initialization of variables in log_new_ancestors

The variables leaf and slot are initialized when declared, but the
values assigned to them are never read, as they are re-assigned later
on. The initializations are redundant and can be removed. Cleans up
clang scan build warnings:

fs/btrfs/tree-log.c:6797:25: warning: Value stored to 'leaf' during
its initialization is never read [deadcode.DeadStores]
fs/btrfs/tree-log.c:6798:7: warning: Value stored to 'slot' during
its initialization is never read [deadcode.DeadStores]

It's been there since b8aa330d2acb ("Btrfs: improve performance on fsync
of files with multiple hardlinks") without any usage, so it's safe to
remove.
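For readers unfamiliar with scan-build's dead-store category, here is a
minimal standalone sketch of the flagged pattern (a hypothetical
userspace reduction, not the btrfs code):

#include <stdio.h>

static int first_positive(const int *values, int count)
{
	int v = 0;	/* dead store: overwritten before any read, or never read at all */

	for (int i = 0; i < count; i++) {
		v = values[i];	/* re-assigned here, so the initializer above is never read */
		if (v > 0)
			return v;
	}
	return 0;
}

int main(void)
{
	int values[] = { -3, 0, 7 };

	printf("%d\n", first_positive(values, 3));	/* prints 7 */
	return 0;
}

The initializer costs nothing semantically, but removing it documents
that the variable's first meaningful value comes from inside the loop,
which is exactly what the patch does for leaf and slot.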
Signed-off-by: Colin Ian King
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/tree-log.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 365a1cc0a3c3..8ad7e7e38d18 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -6794,8 +6794,8 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
	while (true) {
		struct btrfs_fs_info *fs_info = root->fs_info;
-		struct extent_buffer *leaf = path->nodes[0];
-		int slot = path->slots[0];
+		struct extent_buffer *leaf;
+		int slot;
		struct btrfs_key search_key;
		struct inode *inode;
		u64 ino;
-- cgit v1.2.3

From ed3764f726b24099396734aa1a5b41ae7e0fac60 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 27 Jun 2023 15:34:31 +0800
Subject: btrfs: add comments for btrfs_map_block()

The function btrfs_map_block() is a critical part of the btrfs storage
layer, which handles mapping of logical ranges to physical ranges. Thus
it's better to have some basic explanation, especially on the following
points:

- Segment split by various boundaries
  A continuous logical range may be split into different segments due to
  various factors like zones and RAID0/5/6/10 boundaries.

- The meaning of @mirror_num

- The possible single stripe optimization

- One deprecated parameter @need_raid_map
  Just explicitly mark it deprecated so we're aware of the problem.

Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/volumes.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6aa9bf3661ac..f47a9c42431e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6219,6 +6219,45 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup
		stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
 }

+/*
+ * Map one logical range to one or more physical ranges.
+ *
+ * @length:		(Mandatory) mapped length of this run.
+ *			One logical range can be split into different segments
+ *			due to factors like zones and RAID0/5/6/10 stripe
+ *			boundaries.
+ *
+ * @bioc_ret:		(Mandatory) returned btrfs_io_context structure,
+ *			which has one or more physical ranges (btrfs_io_stripe)
+ *			recorded inside.
+ *			Caller should call btrfs_put_bioc() to free it after use.
+ *
+ * @smap:		(Optional) single physical range optimization.
+ *			If the map request can be fulfilled by one single
+ *			physical range, and this parameter is not NULL,
+ *			then @bioc_ret would be NULL, and @smap would be
+ *			updated.
+ *
+ * @mirror_num_ret:	(Mandatory) returned mirror number if the original
+ *			value is 0.
+ *
+ *			Mirror number 0 means to choose any live mirrors.
+ *
+ *			For non-RAID56 profiles, non-zero mirror_num means
+ *			the Nth mirror. (e.g. mirror_num 1 means the first
+ *			copy).
+ *
+ *			For RAID56 profile, mirror 1 means rebuild from P and
+ *			the remaining data stripes.
+ *
+ *			For RAID6 profile, mirror > 2 means mark another
+ *			data/P stripe error and rebuild from the remaining
+ *			stripes.
+ *
+ * @need_raid_map:	(Used only for integrity checker) whether the map wants
+ *			a full stripe map (including all data and P/Q stripes)
+ *			for RAID56. Should always be 1 except integrity checker.
+ */ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, -- cgit v1.2.3 From 3a3c7a7f6506338bcc4ebe6c5a2cbfec016ce50c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 28 Jun 2023 16:11:15 +0800 Subject: btrfs: raid56: remove unused BTRFS_RBIO_REBUILD_MISSING Commit aca43fe839e4 ("btrfs: remove unused raid56 functions which were dedicated for scrub") removed the special handling of RAID56 scrub for missing device. As scrub goes full mirror_num based recovery, that means if it hits a missing device in RAID56, it would just try the next mirror, which would go through the BTRFS_RBIO_READ_REBUILD operation. This means there is no longer any use of BTRFS_RBIO_REBUILD_MISSING operation and we can safely remove it. Reviewed-by: Christoph Hellwig Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 17 +++++------------ fs/btrfs/raid56.h | 1 - 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0249ea52bb80..35fd6e152731 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -584,8 +584,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - if (last->operation == BTRFS_RBIO_REBUILD_MISSING || - last->operation == BTRFS_RBIO_READ_REBUILD) + if (last->operation == BTRFS_RBIO_READ_REBUILD) return 0; return 1; @@ -784,10 +783,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) spin_unlock(&rbio->bio_list_lock); spin_unlock(&h->lock); - if (next->operation == BTRFS_RBIO_READ_REBUILD) - start_async_work(next, recover_rbio_work_locked); - else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { - steal_rbio(rbio, next); + if (next->operation == BTRFS_RBIO_READ_REBUILD) { start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); @@ -1698,8 +1694,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, * If we're rebuilding a read, we have to use pages from the * bio list if possible. */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); @@ -1763,8 +1758,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, * If we're rebuilding a read, we have to use pages from the * bio list if possible. 
	 */
-	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
-	     rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
+	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
	} else {
		sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
	}
@@ -1897,8 +1891,7 @@ static int recover_sectors(struct btrfs_raid_bio *rbio)
		goto out;
	}

-	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
-	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
+	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		spin_lock(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock(&rbio->bio_list_lock);
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 0e84c9c9293f..45e6ff78316f 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -14,7 +14,6 @@ enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
-	BTRFS_RBIO_REBUILD_MISSING,
 };

 struct btrfs_raid_bio {
-- cgit v1.2.3

From 070bb0011ccfd00c7de25dce2489fbe9cce5ba44 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 20 Jun 2023 16:55:09 +0800
Subject: btrfs: sysfs: show if ACL support has been compiled in

ACL support depends on the compile-time configuration option
CONFIG_BTRFS_FS_POSIX_ACL. Prior to mounting a btrfs filesystem, it is
not possible to determine whether ACL support has been compiled in. To
address this, add a sysfs interface, /sys/fs/btrfs/features/acl, which
can be used to check for ACL support in the system's btrfs.

A value of 0 indicates ACL is not supported:

  $ cat /sys/fs/btrfs/features/acl
  0

A value of 1 indicates ACL is supported:

  $ cat /sys/fs/btrfs/features/acl
  1

IMO, this is a better approach, as it also tells us whether the kernel
is older. On an older kernel:

  $ ls /sys/fs/btrfs/features/acl
  ls: cannot access '/sys/fs/btrfs/features/acl': No such file or directory

Mount a btrfs filesystem:

  $ cat /proc/self/mounts | grep btrfs | grep -q noacl
  $ echo $?
  0

Signed-off-by: Anand Jain
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/sysfs.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 25294e624851..b1d1ac25237b 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -414,6 +414,12 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
 BTRFS_ATTR(static_feature, supported_sectorsizes,
	   supported_sectorsizes_show);

+static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf)
+{
+	return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL));
+}
+BTRFS_ATTR(static_feature, acl, acl_show);
+
 /*
  * Features which only depend on kernel version.
  *
@@ -421,6 +427,7 @@ BTRFS_ATTR(static_feature, supported_sectorsizes,
  * btrfs_supported_feature_attrs.
  */
 static struct attribute *btrfs_supported_static_feature_attrs[] = {
+	BTRFS_ATTR_PTR(static_feature, acl),
	BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
	BTRFS_ATTR_PTR(static_feature, supported_checksums),
	BTRFS_ATTR_PTR(static_feature, send_stream_version),
-- cgit v1.2.3

From dbb6ecb328cb5f974ff0a04ad31af332be5a122f Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 20 Jun 2023 15:06:05 +0800
Subject: btrfs: tracepoints: simplify raid56 events

After commit 6bfd0133bee2 ("btrfs: raid56: switch scrub path to use a
single function"), the raid56 implementation no longer uses different
endio functions for RMW/recover/scrub. All read operations end in
submit_read_wait_bio_list(), while all write operations end in
submit_write_bios().
This means quite some trace events are out-of-date and no longer utilized. This patch would unify the trace events into just two: - trace_raid56_read() Replaces trace_raid56_read_partial(), trace_raid56_scrub_read() and trace_raid56_scrub_read_recover(). - trace_raid56_write() Replaces trace_raid56_write_stripe() and trace_raid56_scrub_write_stripe(). Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 8 ++++---- include/trace/events/btrfs.h | 29 ++--------------------------- 2 files changed, 6 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 35fd6e152731..f97fc62fbab2 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1513,11 +1513,11 @@ static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, while ((bio = bio_list_pop(bio_list))) { bio->bi_end_io = raid_wait_read_end_io; - if (trace_raid56_scrub_read_recover_enabled()) { + if (trace_raid56_read_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_read_recover(rbio, bio, &trace_info); + trace_raid56_read(rbio, bio, &trace_info); } submit_bio(bio); } @@ -2191,11 +2191,11 @@ static void submit_write_bios(struct btrfs_raid_bio *rbio, while ((bio = bio_list_pop(bio_list))) { bio->bi_end_io = raid_wait_write_end_io; - if (trace_raid56_write_stripe_enabled()) { + if (trace_raid56_write_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_write_stripe(rbio, bio, &trace_info); + trace_raid56_write(rbio, bio, &trace_info); } submit_bio(bio); } diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index a8206f5332e9..a76a279c5f0f 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2482,7 +2482,7 @@ DECLARE_EVENT_CLASS(btrfs_raid56_bio, __entry->offset, __entry->opf, __entry->physical, __entry->len) ); -DEFINE_EVENT(btrfs_raid56_bio, raid56_read_partial, +DEFINE_EVENT(btrfs_raid56_bio, raid56_read, TP_PROTO(const struct btrfs_raid_bio *rbio, const struct bio *bio, const struct raid56_bio_trace_info *trace_info), @@ -2490,32 +2490,7 @@ DEFINE_EVENT(btrfs_raid56_bio, raid56_read_partial, TP_ARGS(rbio, bio, trace_info) ); -DEFINE_EVENT(btrfs_raid56_bio, raid56_write_stripe, - TP_PROTO(const struct btrfs_raid_bio *rbio, - const struct bio *bio, - const struct raid56_bio_trace_info *trace_info), - - TP_ARGS(rbio, bio, trace_info) -); - - -DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_write_stripe, - TP_PROTO(const struct btrfs_raid_bio *rbio, - const struct bio *bio, - const struct raid56_bio_trace_info *trace_info), - - TP_ARGS(rbio, bio, trace_info) -); - -DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read, - TP_PROTO(const struct btrfs_raid_bio *rbio, - const struct bio *bio, - const struct raid56_bio_trace_info *trace_info), - - TP_ARGS(rbio, bio, trace_info) -); - -DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read_recover, +DEFINE_EVENT(btrfs_raid56_bio, raid56_write, TP_PROTO(const struct btrfs_raid_bio *rbio, const struct bio *bio, const struct raid56_bio_trace_info *trace_info), -- cgit v1.2.3 From 28f60894902ea40ed9a74cbbfbe812b979d9aa66 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 30 Jun 2023 16:03:45 +0100 Subject: btrfs: update documentation for add_new_free_space() The documentation for add_new_free_space() is stale and no longer correct: 1) It's no longer used only when caching a block group. 
It's also called when creating a block group (btrfs_make_block_group()),
when reading a block group at mount time (read_one_block_group()) and
when reading the free space tree for a block group (typically the first
time we attempt to allocate from the block group);

2) It has nothing to do with pinned extents. It only deals with the
excluded extents io tree, which is used to track the locations of super
blocks in order to make sure we never add the location of a super block
to the free space cache of a block group.

So update the documentation and also add a description of the arguments
and return values.

Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/block-group.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 82324c327a50..5ea515fc8c16 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -504,10 +504,17 @@ static void fragment_free_space(struct btrfs_block_group *block_group)
 #endif

 /*
- * This is only called by btrfs_cache_block_group, since we could have freed
- * extents we need to check the pinned_extents for any extents that can't be
- * used yet since their free space will be released as soon as the transaction
- * commits.
+ * Add a free space range to the in memory free space cache of a block group.
+ * This checks if the range contains super block locations and any such
+ * locations are not added to the free space cache.
+ *
+ * @block_group:      The target block group.
+ * @start:            Start offset of the range.
+ * @end:              End offset of the range (exclusive).
+ * @total_added_ret:  Optional pointer to return the total amount of space
+ *                    added to the block group's free space cache.
+ *
+ * Returns 0 on success or < 0 on error.
*/ -int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end, - u64 *total_added_ret) +int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, + u64 end, u64 *total_added_ret) { struct btrfs_fs_info *info = block_group->fs_info; u64 extent_start, extent_end, size; @@ -806,8 +806,8 @@ next: key.type == BTRFS_METADATA_ITEM_KEY) { u64 space_added; - ret = add_new_free_space(block_group, last, key.objectid, - &space_added); + ret = btrfs_add_new_free_space(block_group, last, + key.objectid, &space_added); if (ret) goto out; total_found += space_added; @@ -828,9 +828,9 @@ next: path->slots[0]++; } - ret = add_new_free_space(block_group, last, - block_group->start + block_group->length, - NULL); + ret = btrfs_add_new_free_space(block_group, last, + block_group->start + block_group->length, + NULL); out: btrfs_free_path(path); return ret; @@ -2326,8 +2326,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, btrfs_free_excluded_extents(cache); } else if (cache->used == 0) { cache->cached = BTRFS_CACHE_FINISHED; - ret = add_new_free_space(cache, cache->start, - cache->start + cache->length, NULL); + ret = btrfs_add_new_free_space(cache, cache->start, + cache->start + cache->length, NULL); btrfs_free_excluded_extents(cache); if (ret) goto error; @@ -2774,7 +2774,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } - ret = add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); + ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); btrfs_free_excluded_extents(cache); if (ret) { btrfs_put_block_group(cache); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 74b61e663028..2bdbcb834f95 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -291,8 +291,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); void btrfs_put_caching_control(struct btrfs_caching_control *ctl); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); -int add_new_free_space(struct btrfs_block_group *block_group, - u64 start, u64 end, u64 *total_added_ret); +int btrfs_add_new_free_space(struct btrfs_block_group *block_group, + u64 start, u64 end, u64 *total_added_ret); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index f169378e2ca6..c0e734082dcc 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1517,8 +1517,10 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } else if (prev_bit == 1 && bit == 0) { u64 space_added; - ret = add_new_free_space(block_group, extent_start, - offset, &space_added); + ret = btrfs_add_new_free_space(block_group, + extent_start, + offset, + &space_added); if (ret) goto out; total_found += space_added; @@ -1533,7 +1535,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } } if (prev_bit == 1) { - ret = add_new_free_space(block_group, extent_start, end, NULL); + ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL); if (ret) goto out; extent_count++; @@ -1590,8 +1592,9 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); ASSERT(key.objectid < end && key.objectid + key.offset <= end); - ret = add_new_free_space(block_group, key.objectid, - 
key.objectid + key.offset, &space_added);
+		ret = btrfs_add_new_free_space(block_group, key.objectid,
+					       key.objectid + key.offset,
+					       &space_added);
		if (ret)
			goto out;
		total_found += space_added;
-- cgit v1.2.3

From aec5716c3e51a34ae8d98d287613b219ba267105 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 30 Jun 2023 16:03:47 +0100
Subject: btrfs: make btrfs_destroy_marked_extents() return void

Currently btrfs_destroy_marked_extents() is returning the value of the
last call to find_first_extent_bit(), which returns a value of 1 meaning
no more ranges were found in the dirty pages io tree. This value is
useless to the single caller of btrfs_destroy_marked_extents(), which
ignores any return value from btrfs_destroy_marked_extents(). This is
because it's only used in the transaction abort path, where we can't
even deal with any errors since we are in a critical situation already
and cleanup of resources is done in a best effort fashion.

So make btrfs_destroy_marked_extents() return void.

Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a9a2c5446c18..1957bb1dfb02 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4716,21 +4716,16 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
	spin_unlock(&fs_info->delalloc_root_lock);
 }

-static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
-					struct extent_io_tree *dirty_pages,
-					int mark)
+static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
+					 struct extent_io_tree *dirty_pages,
+					 int mark)
 {
-	int ret;
	struct extent_buffer *eb;
	u64 start = 0;
	u64 end;

-	while (1) {
-		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
-					    mark, NULL);
-		if (ret)
-			break;
-
+	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+				      mark, NULL)) {
		clear_extent_bits(dirty_pages, start, end, mark);
		while (start <= end) {
			eb = find_extent_buffer(fs_info, start);
@@ -4746,8 +4741,6 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
			free_extent_buffer_stale(eb);
		}
	}
-
-	return ret;
 }

 static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
-- cgit v1.2.3

From 46d81ebd4a521797481ef193397c099eaf97e83f Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 30 Jun 2023 16:03:48 +0100
Subject: btrfs: make btrfs_destroy_pinned_extent() return void

Currently btrfs_destroy_pinned_extent() is always returning 0 no matter
what and its caller ignores its return value (as does everything up in
the call chain). This is because it is called in the transaction abort
path, where we can't even deal with any errors since we are in a
critical situation already and cleanup of resources is done in a best
effort fashion.

So make btrfs_destroy_pinned_extent() return void.
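Both of these cleanups fold the status check into the loop condition,
which is worth seeing in miniature. A hypothetical userspace sketch of
the same transformation (stand-in names, not the btrfs code):

#include <stdio.h>

/* Stand-in for find_first_extent_bit(): returns 0 while ranges remain. */
static int find_next(int *cursor, int end)
{
	if (*cursor >= end)
		return 1;	/* nothing left to find */
	(*cursor)++;
	return 0;
}

/* Before: the final status is returned, but every caller ignores it. */
static int drain_all_old(int end)
{
	int ret, cursor = 0;

	while (1) {
		ret = find_next(&cursor, end);
		if (ret)
			break;
		printf("processing up to %d\n", cursor);
	}
	return ret;	/* always 1 here, so it carries no information */
}

/* After: fold the check into the loop condition and return nothing. */
static void drain_all_new(int end)
{
	int cursor = 0;

	while (!find_next(&cursor, end))
		printf("processing up to %d\n", cursor);
}

int main(void)
{
	drain_all_old(2);
	drain_all_new(2);
	return 0;
}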
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1957bb1dfb02..f7b678b94666 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4743,12 +4743,11 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, } } -static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, - struct extent_io_tree *unpin) +static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, + struct extent_io_tree *unpin) { u64 start; u64 end; - int ret; while (1) { struct extent_state *cached_state = NULL; @@ -4760,9 +4759,8 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, * the same extent range. */ mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state); - if (ret) { + if (find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } @@ -4773,8 +4771,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } - - return 0; } static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) -- cgit v1.2.3 From e5860f8207ed341502ecee80688cef93e367b615 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 30 Jun 2023 16:03:49 +0100 Subject: btrfs: make find_first_extent_bit() return a boolean Currently find_first_extent_bit() returns a 0 if it found a range in the given io tree and 1 if it didn't find any. There's no need to return any errors, so make the return value a boolean and invert the logic to make more sense: return true if it found a range and false if it didn't find any range. 
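A toy model shows why the inverted boolean reads better at call sites
(hypothetical code, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for find_first_extent_bit(): find the next set bit. */
static bool find_first_set(unsigned int mask, unsigned int from,
			   unsigned int *pos)
{
	for (unsigned int i = from; i < 32; i++) {
		if (mask & (1u << i)) {
			*pos = i;
			return true;	/* found, matching the new convention */
		}
	}
	return false;		/* nothing found */
}

int main(void)
{
	unsigned int pos = 0;

	/* With a bool, the loop condition reads naturally... */
	while (find_first_set(0x50, pos, &pos)) {
		printf("found bit %u\n", pos);	/* prints 4, then 6 */
		pos++;
	}
	/* ...whereas the old 0-means-found convention forced "while (!...)". */
	return 0;
}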
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 9 ++++----- fs/btrfs/dev-replace.c | 6 +++--- fs/btrfs/disk-io.c | 10 +++++----- fs/btrfs/extent-io-tree.c | 14 +++++++------- fs/btrfs/extent-io-tree.h | 6 +++--- fs/btrfs/extent-tree.c | 5 ++--- fs/btrfs/free-space-cache.c | 7 +++---- fs/btrfs/relocation.c | 10 ++++++---- fs/btrfs/transaction.c | 8 ++++---- fs/btrfs/volumes.c | 6 +++--- 10 files changed, 40 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 7bd1eee9367b..9479c9e1507d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -527,11 +527,10 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, *total_added_ret = 0; while (start < end) { - ret = find_first_extent_bit(&info->excluded_extents, start, - &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE, - NULL); - if (ret) + if (!find_first_extent_bit(&info->excluded_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY | EXTENT_UPTODATE, + NULL)) break; if (extent_start <= start) { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5f10965fd72b..fff22ed55c42 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -792,9 +792,9 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, lockdep_assert_held(&srcdev->fs_info->chunk_mutex); - while (!find_first_extent_bit(&srcdev->alloc_state, start, - &found_start, &found_end, - CHUNK_ALLOCATED, &cached_state)) { + while (find_first_extent_bit(&srcdev->alloc_state, start, + &found_start, &found_end, + CHUNK_ALLOCATED, &cached_state)) { ret = set_extent_bit(&tgtdev->alloc_state, found_start, found_end, CHUNK_ALLOCATED, NULL); if (ret) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f7b678b94666..e7a096a8c0f6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4228,7 +4228,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) u64 found_end; found = true; - while (!find_first_extent_bit(&trans->dirty_pages, cur, + while (find_first_extent_bit(&trans->dirty_pages, cur, &found_start, &found_end, EXTENT_DIRTY, &cached)) { dirty_bytes += found_end + 1 - found_start; cur = found_end + 1; @@ -4724,8 +4724,8 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (!find_first_extent_bit(dirty_pages, start, &start, &end, - mark, NULL)) { + while (find_first_extent_bit(dirty_pages, start, &start, &end, + mark, NULL)) { clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { eb = find_extent_buffer(fs_info, start); @@ -4759,8 +4759,8 @@ static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, * the same extent range. */ mutex_lock(&fs_info->unused_bg_unpin_mutex); - if (find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state)) { + if (!find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index a2315a4b8b75..ff8e117a1ace 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -831,15 +831,15 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t * * Note: If there are multiple bits set in @bits, any of them will match. * - * Return 0 if we find something, and update @start_ret and @end_ret. - * Return 1 if we found nothing. 
+ * Return true if we find something, and update @start_ret and @end_ret. + * Return false if we found nothing. */ -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state) +bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state) { struct extent_state *state; - int ret = 1; + bool ret = false; spin_lock(&tree->lock); if (cached_state && *cached_state) { @@ -863,7 +863,7 @@ got_it: cache_state_if_flags(state, cached_state, 0); *start_ret = state->start; *end_ret = state->end; - ret = 0; + ret = true; } out: spin_unlock(&tree->lock); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index fbd3b275ab1c..28c23a23d121 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -182,9 +182,9 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u32 clear_bits, struct extent_state **cached_state); -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state); +bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state); void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f396a9afa403..ab7b588baa7e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2751,9 +2751,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) struct extent_state *cached_state = NULL; mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state); - if (ret) { + if (!find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 880800418075..bfc01352351c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1219,10 +1219,9 @@ static noinline_for_stack int write_pinned_extent_entries( start = block_group->start; while (start < block_group->start + block_group->length) { - ret = find_first_extent_bit(unpin, start, - &extent_start, &extent_end, - EXTENT_DIRTY, NULL); - if (ret) + if (!find_first_extent_bit(unpin, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL)) return 0; /* This pinned extent is out of our range */ diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 46c3c1d57266..8ff0e9dcf85c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3498,6 +3498,8 @@ int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, last = rc->block_group->start + rc->block_group->length; while (1) { + bool block_found; + cond_resched(); if (rc->search_start >= last) { ret = 1; @@ -3548,11 +3550,11 @@ next: goto next; } - ret = find_first_extent_bit(&rc->processed_blocks, - key.objectid, &start, &end, - EXTENT_DIRTY, NULL); + block_found = find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY, NULL); - if (ret == 0 && start <= key.objectid) { + if (block_found && start <= key.objectid) { btrfs_release_path(path); rc->search_start = end + 1; } else { diff --git 
a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 91b6c2fdc420..4bb9716ad24a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1060,8 +1060,8 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
	u64 start = 0;
	u64 end;

-	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
-				      mark, &cached_state)) {
+	while (find_first_extent_bit(dirty_pages, start, &start, &end,
+				     mark, &cached_state)) {
		bool wait_writeback = false;

		err = convert_extent_bit(dirty_pages, start, end,
@@ -1114,8 +1114,8 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
	u64 start = 0;
	u64 end;

-	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
-				      EXTENT_NEED_WAIT, &cached_state)) {
+	while (find_first_extent_bit(dirty_pages, start, &start, &end,
+				     EXTENT_NEED_WAIT, &cached_state)) {
		/*
		 * Ignore -ENOMEM errors returned by clear_extent_bit().
		 * When committing the transaction, we'll remove any entries
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f47a9c42431e..6d3aa26186a3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1424,9 +1424,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,

	lockdep_assert_held(&device->fs_info->chunk_mutex);

-	if (!find_first_extent_bit(&device->alloc_state, *start,
-				   &physical_start, &physical_end,
-				   CHUNK_ALLOCATED, NULL)) {
+	if (find_first_extent_bit(&device->alloc_state, *start,
+				  &physical_start, &physical_end,
+				  CHUNK_ALLOCATED, NULL)) {

		if (in_range(physical_start, *start, len) ||
		    in_range(*start, physical_start,
-- cgit v1.2.3

From b1c8f527feb858927f50a136d13196f107c63631 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 30 Jun 2023 16:03:50 +0100
Subject: btrfs: open code trivial btrfs_add_excluded_extent()

The code for btrfs_add_excluded_extent() is trivial; it's just a
set_extent_bit() call. However, it's defined in extent-tree.c even
though it is only used (twice) in block-group.c. So open code it in
block-group.c, reducing the need to export a trivial function.

Also, since the only caller of btrfs_add_excluded_extent() is prepared
to deal with errors, stop ignoring errors from the set_extent_bit()
call.
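One detail worth keeping in mind when open coding these calls: the
extent io tree works with inclusive end offsets, so a (start, num_bytes)
pair becomes [start, start + num_bytes - 1], as in the two
set_extent_bit() calls in the diff below. A trivial standalone
illustration of that convention:

#include <assert.h>

/* Convert (start, length) to the inclusive [first, last] range form. */
static void range_to_inclusive(unsigned long start, unsigned long len,
			       unsigned long *first, unsigned long *last)
{
	*first = start;
	*last = start + len - 1;	/* inclusive end, hence the -1 */
}

int main(void)
{
	unsigned long first, last;

	range_to_inclusive(65536, 4096, &first, &last);
	assert(first == 65536 && last == 69631);	/* 65536 + 4096 - 1 */
	return 0;
}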
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 10 ++++++---- fs/btrfs/extent-tree.c | 9 --------- fs/btrfs/extent-tree.h | 2 -- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9479c9e1507d..3cbd80b6f765 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2104,8 +2104,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) if (cache->start < BTRFS_SUPER_INFO_OFFSET) { stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; cache->bytes_super += stripe_len; - ret = btrfs_add_excluded_extent(fs_info, cache->start, - stripe_len); + ret = set_extent_bit(&fs_info->excluded_extents, cache->start, + cache->start + stripe_len - 1, + EXTENT_UPTODATE, NULL); if (ret) return ret; } @@ -2131,8 +2132,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) cache->start + cache->length - logical[nr]); cache->bytes_super += len; - ret = btrfs_add_excluded_extent(fs_info, logical[nr], - len); + ret = set_extent_bit(&fs_info->excluded_extents, logical[nr], + logical[nr] + len - 1, + EXTENT_UPTODATE, NULL); if (ret) { kfree(logical); return ret; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ab7b588baa7e..1a0fb156f602 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -69,15 +69,6 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits) return (cache->flags & bits) == bits; } -int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes) -{ - u64 end = start + num_bytes - 1; - set_extent_bit(&fs_info->excluded_extents, start, end, - EXTENT_UPTODATE, NULL); - return 0; -} - void btrfs_free_excluded_extents(struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 429d5c570061..3b2f265f4653 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -96,8 +96,6 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); -int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes); void btrfs_free_excluded_extents(struct btrfs_block_group *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, -- cgit v1.2.3 From 98b5a8fd2af7005ba5a1a5cd9079c98266be7499 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 30 Jun 2023 16:03:51 +0100 Subject: btrfs: move btrfs_free_excluded_extents() into block-group.c The function btrfs_free_excluded_extents() is only used by block-group.c, so move it into block-group.c and make it static. Also removed unnecessary variables that are used only once. 
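The removal of the single-use locals is a small pattern of its own; a
hedged sketch of the shape of the change (simplified types, not the
btrfs code):

struct group {
	unsigned long start;
	unsigned long length;
};

static void clear_bits(unsigned long first, unsigned long last)
{
	/* Stand-in for clear_extent_bits(); the body is irrelevant here. */
	(void)first;
	(void)last;
}

/* Before: locals that each carry a value used exactly once. */
static void free_range_old(const struct group *g)
{
	unsigned long start, end;

	start = g->start;
	end = start + g->length - 1;
	clear_bits(start, end);
}

/* After: fold the single-use values directly into the call. */
static void free_range_new(const struct group *g)
{
	clear_bits(g->start, g->start + g->length - 1);
}

int main(void)
{
	struct group g = { 4096, 8192 };

	free_range_old(&g);
	free_range_new(&g);	/* both clear [4096, 12287] */
	return 0;
}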
Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/block-group.c | 6 ++++++
 fs/btrfs/extent-tree.c | 12 ------------
 fs/btrfs/extent-tree.h | 1 -
 3 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 3cbd80b6f765..a127865f49f9 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -835,6 +835,12 @@ out:
	return ret;
 }

+static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
+{
+	clear_extent_bits(&bg->fs_info->excluded_extents, bg->start,
+			  bg->start + bg->length - 1, EXTENT_UPTODATE);
+}
+
 static noinline void caching_thread(struct btrfs_work *work)
 {
	struct btrfs_block_group *block_group;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1a0fb156f602..d674e421ea75 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -69,18 +69,6 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
	return (cache->flags & bits) == bits;
 }

-void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
-{
-	struct btrfs_fs_info *fs_info = cache->fs_info;
-	u64 start, end;
-
-	start = cache->start;
-	end = start + cache->length - 1;
-
-	clear_extent_bits(&fs_info->excluded_extents, start, end,
-			  EXTENT_UPTODATE);
-}
-
 /* simple helper to search for an existing data extent at a given offset */
 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
 {
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 3b2f265f4653..b9e148adcd28 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -96,7 +96,6 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     enum btrfs_inline_ref_type is_data);
 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);

-void btrfs_free_excluded_extents(struct btrfs_block_group *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count);
 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
				       struct btrfs_delayed_ref_root *delayed_refs,
-- cgit v1.2.3

From 16c3a47648383efa625f771999182db9f34fb8d9 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 26 Jun 2023 17:55:25 +0800
Subject: btrfs: deprecate integrity checker feature

The integrity checker feature needs to be enabled at compile time
(BTRFS_FS_CHECK_INTEGRITY) and then enabled by mount options check_int*.

Although it provides some unique features which cannot be provided by
any other sanity checks like tree-checker, it not only has high CPU and
memory overhead, but is also a maintenance burden. For example, it's the
only caller of btrfs_map_block() with @need_raid_map = 0.

Considering most btrfs developers are not even testing this feature, I'm
here to propose deprecation of this feature.

For now only warning messages will be printed; the feature itself still
works. Removal time has been set to the 6.7 release.
Reviewed-by: Christoph Hellwig Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 4 +++- fs/btrfs/super.c | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 66fa9ab2c046..3282adc84d52 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -49,9 +49,11 @@ config BTRFS_FS_POSIX_ACL If you don't know what Access Control Lists are, say N config BTRFS_FS_CHECK_INTEGRITY - bool "Btrfs with integrity check tool compiled in (DANGEROUS)" + bool "Btrfs with integrity check tool compiled in (DEPRECATED)" depends on BTRFS_FS help + This feature has been deprecated and will be removed in 6.7. + Adds code that examines all block write requests (including writes of the super block). The goal is to verify that the state of the filesystem on disk is always consistent, i.e., diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f1dd172d8d5b..cffdd6f7f8e8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -709,12 +709,16 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY case Opt_check_integrity_including_extent_data: + btrfs_warn(info, + "integrity checker is deprecated and will be removed in 6.7"); btrfs_info(info, "enabling check integrity including extent data"); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity: + btrfs_warn(info, + "integrity checker is deprecated and will be removed in 6.7"); btrfs_info(info, "enabling check integrity"); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; @@ -727,6 +731,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; } info->check_integrity_print_mask = intarg; + btrfs_warn(info, + "integrity checker is deprecated and will be removed in 6.7"); btrfs_info(info, "check_integrity_print_mask 0x%x", info->check_integrity_print_mask); break; -- cgit v1.2.3 From 98efb4eb310d0d72e663924adc7b5b6e14813f9d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 13 Jul 2023 16:10:26 +0200 Subject: btrfs: use helper sizeof_field in struct accessors There's a helper for obtaining size of a struct member, we can use it instead of open coding the pointer magic. 
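For reference, sizeof_field() lives in include/linux/stddef.h and is a
named form of the same null-pointer trick; a small userspace
demonstration of the equivalence (hypothetical struct, same macro
shape):

#include <assert.h>
#include <stdint.h>

/* Same shape as the kernel helper in include/linux/stddef.h. */
#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))

struct dev_item {
	uint64_t total_bytes;
	uint32_t io_width;
};

int main(void)
{
	/* Equivalent to the open-coded sizeof(((struct dev_item *)0)->total_bytes). */
	assert(sizeof_field(struct dev_item, total_bytes) == sizeof(uint64_t));
	assert(sizeof_field(struct dev_item, io_width) == sizeof(uint32_t));
	return 0;
}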
Reviewed-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index ceadfc5d6c66..8cfc8214109c 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -3,6 +3,8 @@ #ifndef BTRFS_ACCESSORS_H #define BTRFS_ACCESSORS_H +#include + struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; @@ -34,13 +36,13 @@ static inline void put_unaligned_le8(u8 val, void *p) read_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ - sizeof(((type *)0)->member))) + sizeof_field(type, member))) #define write_eb_member(eb, ptr, type, member, result) (\ write_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ - sizeof(((type *)0)->member))) + sizeof_field(type, member))) #define DECLARE_BTRFS_SETGET_BITS(bits) \ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ @@ -62,25 +64,25 @@ DECLARE_BTRFS_SETGET_BITS(64) static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ const type *s) \ { \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ u##bits val) \ { \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ const type *s) \ { \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ return btrfs_get_token_##bits(token, s, offsetof(type, member));\ } \ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ type *s, u##bits val) \ { \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } @@ -111,17 +113,14 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item *)0))->total_bytes); - return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, - total_bytes)); + static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes)); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes)); } static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s, u64 val) { - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item *)0))->total_bytes); + static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes)); WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); } -- cgit v1.2.3 From 7b365a2a3d31cc523ad24f2147285c3707297ce8 Mon Sep 17 00:00:00 2001 From: Minjie Du Date: Mon, 17 Jul 2023 15:16:22 +0800 Subject: btrfs: use folio_next_index() helper in extent_write_cache_pages Simplify code pattern of 'folio->index + folio_nr_pages(folio)' by using the existing helper folio_next_index(). 
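For context, the helper is defined in include/linux/pagemap.h as exactly
the expression being replaced, so this is a pure readability change (the
definition below is quoted from memory and worth double-checking against
the tree):

static inline pgoff_t folio_next_index(struct folio *folio)
{
	return folio->index + folio_nr_pages(folio);
}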
Reviewed-by: Johannes Thumshirn Signed-off-by: Minjie Du Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 90ad3006ef3a..91fba78c9996 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2150,7 +2150,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - done_index = folio->index + folio_nr_pages(folio); + done_index = folio_next_index(folio); /* * At this point we hold neither the i_pages lock nor * the page lock: the page may be truncated or -- cgit v1.2.3 From 17353a3447927455b8242aa371488b7af01c5e14 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 14 Jun 2023 14:39:55 +0800 Subject: btrfs: scrub: remove unused btrfs_path in scrub_simple_mirror() The @path in scrub_simple_mirror() is no longer utilized after commit e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure"). Before that commit, we call find_first_extent_item() directly, which needs a path and that path can be reused. But after that switch commit, the extent search is done inside queue_scrub_stripe(), which will no longer accept a path from outside. So the @path variable can be safely removed. Reviewed-by: Christoph Hellwig Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba [ remove the stale comment ] Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fe1c5bc7f601..8381174bda15 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1957,16 +1957,12 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; const u64 logical_end = logical_start + logical_length; - /* An artificial limit, inherit from old scrub behavior */ - struct btrfs_path path = { 0 }; u64 cur_logical = logical_start; int ret; /* The range must be inside the bg */ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); - path.search_commit_root = 1; - path.skip_locking = 1; /* Go through each extent items inside the logical range */ while (cur_logical < logical_end) { u64 cur_physical = physical + cur_logical - logical_start; @@ -2010,7 +2006,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, /* Don't hold CPU for too long time */ cond_resched(); } - btrfs_release_path(&path); return ret; } -- cgit v1.2.3 From b2cc440058baf3f4783a687a2caa35b31a1da553 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jun 2023 08:13:24 +0200 Subject: btrfs: simplify the no-bioc fast path condition in btrfs_map_block nr_alloc_stripes can't be one if we are writing to a replacement device, as it is incremented for that case right above. Remove the duplicate checks. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6d3aa26186a3..8af0e54099c2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6432,9 +6432,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * I/O context structure. 
*/ if (smap && num_alloc_stripes == 1 && - !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && - (op == BTRFS_MAP_READ || !dev_replace_is_ongoing || - !dev_replace->tgtdev)) { + !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) { set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); if (mirror_num_ret) *mirror_num_ret = mirror_num; -- cgit v1.2.3 From 52ea5bfbfa6d5432ed46e0a18b9e5ca4e4f91852 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 9 Jul 2023 15:08:18 +0800 Subject: btrfs: move eb subpage preallocation out of the loop Initially we preallocate btrfs_subpage structure in the main loop of alloc_extent_buffer(). But later commit fbca46eb46ec ("btrfs: make nodesize >= PAGE_SIZE case to reuse the non-subpage routine") has made sure we only go subpage routine if our nodesize is smaller than PAGE_SIZE. This means for that case, we only need to allocate the subpage structure once anyway. So this patch would make the preallocation out of the main loop. This would slightly reduce the workload when we hold the page lock, and make code a little easier to read. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 91fba78c9996..662d991fb45f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3559,6 +3559,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *exists = NULL; struct page *p; struct address_space *mapping = fs_info->btree_inode->i_mapping; + struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; int uptodate = 1; int ret; @@ -3595,36 +3596,30 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++, index++) { - struct btrfs_subpage *prealloc = NULL; + /* + * Preallocate page->private for subpage case, so that we won't + * allocate memory with private_lock nor page lock hold. + * + * The memory will be freed by attach_extent_buffer_page() or freed + * manually if we exit earlier. + */ + if (fs_info->nodesize < PAGE_SIZE) { + prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (IS_ERR(prealloc)) { + exists = ERR_CAST(prealloc); + goto free_eb; + } + } + + for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); if (!p) { exists = ERR_PTR(-ENOMEM); + btrfs_free_subpage(prealloc); goto free_eb; } - /* - * Preallocate page->private for subpage case, so that we won't - * allocate memory with private_lock hold. The memory will be - * freed by attach_extent_buffer_page() or freed manually if - * we exit earlier. - * - * Although we have ensured one subpage eb can only have one - * page, but it may change in the future for 16K page size - * support, so we still preallocate the memory in the loop. 
- */ - if (fs_info->nodesize < PAGE_SIZE) { - prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); - if (IS_ERR(prealloc)) { - ret = PTR_ERR(prealloc); - unlock_page(p); - put_page(p); - exists = ERR_PTR(ret); - goto free_eb; - } - } - spin_lock(&mapping->private_lock); exists = grab_extent_buffer(fs_info, p); if (exists) { -- cgit v1.2.3 From 75d305c55b130bae5077ec262440240fec4e6281 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 14 Jul 2023 10:18:19 -0500 Subject: btrfs: remove duplicate free_async_extent_pages() on reservation error While performing compressed writes, if the extent reservation fails, the async extent pages are first freed in the error check for return value ret, and then again at out_free label. Remove the first call to free_async_extent_pages(). Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aa090b0b5d29..5508597be614 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1217,7 +1217,6 @@ static int submit_one_async_extent(struct btrfs_inode *inode, async_extent->compressed_size, 0, *alloc_hint, &ins, 1, 1); if (ret) { - free_async_extent_pages(async_extent); /* * Here we used to try again by going back to non-compressed * path for ENOSPC. But we can't reserve space even for -- cgit v1.2.3 From e28b02118b94e42be3355458a2406c6861e2dd32 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 21 Jul 2023 09:02:06 -0700 Subject: btrfs: free qgroup rsv on io failure If we do a write whose bio suffers an error, we will never reclaim the qgroup reserved space for it. We allocate the space in the write_iter codepath, then release the reservation as we allocate the ordered extent, but we only create a delayed ref if the ordered extent finishes. If it has an error, we simply leak the rsv. This is apparent in running any error injecting (dmerror) fstests like btrfs/146 or btrfs/160. Such tests fail due to dmesg on umount complaining about the leaked qgroup data space. When we clean up other aspects of space on failed ordered_extents, also free the qgroup rsv. Reviewed-by: Josef Bacik CC: stable@vger.kernel.org # 5.10+ Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5508597be614..8e53df3516c2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3358,6 +3358,13 @@ out: btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, 1); + /* + * Actually free the qgroup rsv which was released when + * the ordered extent was created. + */ + btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid, + ordered_extent->qgroup_rsv, + BTRFS_QGROUP_RSV_DATA); } } -- cgit v1.2.3 From a6496849671a5bc9218ecec25a983253b34351b1 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 21 Jul 2023 09:02:07 -0700 Subject: btrfs: fix start transaction qgroup rsv double free btrfs_start_transaction reserves metadata space of the PERTRANS type before it identifies a transaction to start/join. This allows flushing when reserving that space without a deadlock. However, it results in a race which temporarily breaks qgroup rsv accounting. 
T1                                        T2
start_transaction
do_stuff
                                          start_transaction
                                          qgroup_reserve_meta_pertrans
commit_transaction
qgroup_free_meta_all_pertrans
                                          hit an error starting txn
                                          goto reserve_fail
                                          qgroup_free_meta_pertrans (already freed!)

The basic issue is that there is nothing preventing another commit from committing before start_transaction finishes (in fact sometimes we intentionally wait for it) so any error path that frees the reserve is at risk of this race.

While this exact space was getting freed anyway, and it's not a huge deal to double free it (just a warning, the free code catches this), it can result in incorrectly freeing some other pertrans reservation in this same reservation, which could then lead to spuriously granting reservations we might not have the space for. Therefore, I do believe it is worth fixing.

To fix it, use the existing prealloc->pertrans conversion mechanism. When we first reserve the space, we reserve prealloc space and only when we are sure we have a transaction do we convert it to pertrans. This way any racing commits do not blow away our reservation, but we still get a pertrans reservation that is freed when _this_ transaction gets committed.

This issue can be reproduced by running generic/269 with either qgroups or squotas enabled via mkfs on the scratch device.

Reviewed-by: Josef Bacik CC: stable@vger.kernel.org # 5.10+ Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4bb9716ad24a..815f61d6b506 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -591,8 +591,13 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, u64 delayed_refs_bytes = 0; qgroup_reserved = num_items * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved, - enforce_qgroups); + /* + * Use prealloc for now, as there might be a currently running + * transaction that could free this reserved space prematurely + * by committing. + */ + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserved, + enforce_qgroups, false); if (ret) return ERR_PTR(ret); @@ -705,6 +710,14 @@ again: h->reloc_reserved = reloc_reserved; } + /* + * Now that we have found a transaction to be a part of, convert the + * qgroup reservation from prealloc to pertrans. A different transaction + * can't race in and free our pertrans out from under us. + */ + if (qgroup_reserved) + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + got_it: if (!current->journal_info) current->journal_info = h; @@ -752,7 +765,7 @@ alloc_fail: btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, num_bytes, NULL); reserve_fail: - btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved); + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); return ERR_PTR(ret); }
-- cgit v1.2.3

From ba9145add52422779935f6d1b645a31a50c02af2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:22 +0200 Subject: btrfs: pass a flags argument to cow_file_range

The int used as bool unlock is not a very good way to describe the behavior, and the next patch will have to add another behavior modifier. We'll do that with two bool parameters instead of adding bit flags. The new keep_locked parameter specifies that the pages should always be kept locked; this is the inverse of the old unlock argument.
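To see the readability argument outside the kernel, compare the two calling conventions in a minimal user-space sketch (hypothetical names, not the btrfs code; the real cow_file_range takes many more parameters):

#include <stdbool.h>
#include <stdio.h>

/* Old convention: a bare int whose meaning is invisible at the call site. */
static void process_range_old(unsigned long start, unsigned long end, int unlock)
{
	printf("range [%lu, %lu]: pages %s\n", start, end,
	       unlock ? "unlocked" : "kept locked");
}

/* New convention: a named bool with the inverted sense. */
static void process_range_new(unsigned long start, unsigned long end,
			      bool keep_locked)
{
	printf("range [%lu, %lu]: pages %s\n", start, end,
	       keep_locked ? "kept locked" : "unlocked");
}

int main(void)
{
	process_range_old(0, 4095, 0);    /* what does the 0 mean here? */
	process_range_new(0, 4095, true); /* intent is readable at the call site */
	return 0;
}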
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba [ switch flags to bool ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 48 +++++++++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8e53df3516c2..820444fbb564 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -124,11 +124,12 @@ static struct kmem_cache *btrfs_inode_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); + static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock, - u64 *done_offset); + unsigned long *nr_written, u64 *done_offset, + bool keep_locked); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, @@ -1148,7 +1149,7 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, * can directly submit them without interruption. */ ret = cow_file_range(inode, locked_page, start, end, &page_started, - &nr_written, 0, NULL); + &nr_written, NULL, true); /* Inline extent inserted, page gets unlocked and everything is done */ if (page_started) return 0; @@ -1361,25 +1362,18 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * locked_page is the page that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * *page_started is set to one if we unlock locked_page and do everything - * required to start IO on it. It may be clean and already done with - * IO when we return. - * - * When unlock == 1, we unlock the pages in successfully allocated regions. - * When unlock == 0, we leave them locked for writing them out. + * When this function fails, it unlocks all pages except @locked_page. * - * However, we unlock all the pages except @locked_page in case of failure. + * When this function successfully creates an inline extent, it sets page_started + * to 1 and unlocks all pages including locked_page and starts I/O on them. + * (In reality inline extents are limited to a single page, so locked_page is + * the only page handled anyway). * - * In summary, page locking state will be as follow: + * When this function succeed and creates a normal extent, the page locking + * status depends on the passed in flags: * - * - page_started == 1 (return value) - * - All the pages are unlocked. IO is started. - * - Note that this can happen only on success - * - unlock == 1 - * - All the pages except @locked_page are unlocked in any case - * - unlock == 0 - * - On success, all the pages are locked for writing out them - * - On failure, all the pages except @locked_page are unlocked + * - If @keep_locked is set, all pages are kept locked. + * - Else all pages except for @locked_page are unlocked. 
 * * When a failure happens in the second or later iteration of the while-loop, the ordered extents created in previous iterations are kept * intact. So, the caller must clean them up by calling * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for * example. */ @@ -1390,8 +1384,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock, - u64 *done_offset) + unsigned long *nr_written, u64 *done_offset, + bool keep_locked) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1557,7 +1551,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * Do set the Ordered (Private2) bit so we know this page was * properly setup for writepage. */ - page_ops = unlock ? PAGE_UNLOCK : 0; + page_ops = (keep_locked ? 0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, @@ -1626,10 +1620,10 @@ out_unlock: * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup * function. * - * However, in case of unlock == 0, we still need to unlock the pages + * However, in case of @keep_locked, we still need to unlock the pages * (except @locked_page) to ensure all the pages are unlocked. */ - if (!unlock && orig_start < start) { + if (keep_locked && orig_start < start) { if (!locked_page) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, @@ -1835,7 +1829,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, while (start <= end) { ret = cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 0, &done_offset); + nr_written, &done_offset, true); if (ret && ret != -EAGAIN) return ret; @@ -1955,7 +1949,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, } return cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 1, NULL); + nr_written, NULL, false); } struct can_nocow_file_extent_args { @@ -2432,7 +2426,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page page_started, nr_written, wbc); else ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1, NULL); + page_started, nr_written, NULL, false); out: ASSERT(ret <= 0);
-- cgit v1.2.3

From 53ffb30a784d30db4d4cd074b4da46fa2472ab60 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:23 +0200 Subject: btrfs: don't create inline extents in fallback_to_cow

For NOCOW files, run_delalloc_nocow can still fall back to COW allocations when required and calls the fallback_to_cow helper for that. For such an allocation we can have multiple ordered_extents for existing extents that NOCOW overwrites and new allocations that fallback_to_cow creates. If one of the new extents is an inline extent, the writepages code would have to avoid normal page writeback for them as indicated by the page_started return argument, which run_delalloc_nocow can't return. Fix this by never creating inline extents from fallback_to_cow.
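The shape of the fix can be sketched in user-space C (toy names and sizes, not the btrfs implementation): the allocation helper takes a no_inline flag, and the fallback path always sets it, so a mix of inline and regular extents cannot arise:

#include <stdbool.h>
#include <stdio.h>

/* Toy allocator: may use an "inline" representation unless forbidden. */
static int alloc_extent(unsigned long start, unsigned long len, bool no_inline)
{
	if (!no_inline && start == 0 && len <= 4096) {
		printf("inline extent, len %lu (range already written out)\n", len);
		return 1;	/* caller must not write the pages again */
	}
	printf("regular extent at %lu, len %lu\n", start, len);
	return 0;
}

/* Fallback path: forbid inline extents so every range is handled alike. */
static int fallback_to_cow(unsigned long start, unsigned long len)
{
	return alloc_extent(start, len, true);
}

int main(void)
{
	alloc_extent(0, 1024, false);	/* may go inline */
	fallback_to_cow(0, 1024);	/* never goes inline */
	return 0;
}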
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/inode.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 820444fbb564..aa1cda22a23a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -129,7 +129,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, u64 *done_offset, - bool keep_locked); + bool keep_locked, bool no_inline); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, @@ -1149,7 +1149,7 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, * can directly submit them without interruption. */ ret = cow_file_range(inode, locked_page, start, end, &page_started, - &nr_written, NULL, true); + &nr_written, NULL, true, false); /* Inline extent inserted, page gets unlocked and everything is done */ if (page_started) return 0; @@ -1385,7 +1385,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, u64 *done_offset, - bool keep_locked) + bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1424,7 +1424,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * This means we can trigger inline extent even if we didn't want to. * So here we skip inline extent creation completely. */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { + if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) { u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); @@ -1829,7 +1829,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, while (start <= end) { ret = cow_file_range(inode, locked_page, start, end, page_started, - nr_written, &done_offset, true); + nr_written, &done_offset, true, false); if (ret && ret != -EAGAIN) return ret; @@ -1887,15 +1887,17 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, } static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end, - int *page_started, unsigned long *nr_written) + const u64 start, const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; + int page_started = 0; + unsigned long nr_written; u64 range_start = start; u64 count; + int ret; /* * If EXTENT_NORESERVE is set it means that when the buffered write was @@ -1948,8 +1950,15 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, NULL); } - return cow_file_range(inode, locked_page, start, end, page_started, - nr_written, NULL, false); + /* + * Don't try to create inline extents, as a mix of inline extent that + * is written out and unlocked directly and a normal NOCOW extent + * doesn't work. 
+ */ + ret = cow_file_range(inode, locked_page, start, end, &page_started, + &nr_written, NULL, false, true); + ASSERT(!page_started); + return ret; } struct can_nocow_file_extent_args { @@ -2098,9 +2107,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end, - int *page_started, - unsigned long *nr_written) + const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -2268,8 +2275,7 @@ out_check: */ if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_page, - cow_start, found_key.offset - 1, - page_started, nr_written); + cow_start, found_key.offset - 1); if (ret) goto error; cow_start = (u64)-1; @@ -2350,8 +2356,7 @@ out_check: if (cow_start != (u64)-1) { cur_offset = end; - ret = fallback_to_cow(inode, locked_page, cow_start, end, - page_started, nr_written); + ret = fallback_to_cow(inode, locked_page, cow_start, end); if (ret) goto error; } @@ -2410,8 +2415,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page * preallocated inodes. */ ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); - ret = run_delalloc_nocow(inode, locked_page, start, end, - page_started, nr_written); + ret = run_delalloc_nocow(inode, locked_page, start, end); goto out; } @@ -2426,7 +2430,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page page_started, nr_written, wbc); else ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, NULL, false); + page_started, nr_written, NULL, false, false); out: ASSERT(ret <= 0); -- cgit v1.2.3 From ef4e88e6a55babd825aed2940a72a0562f49ca7e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:24 +0200 Subject: btrfs: split page locking out of __process_pages_contig There is a lot of complexity in __process_pages_contig to deal with the PAGE_LOCK case that can return an error unlike all the other actions. Open code the page iteration for page locking in lock_delalloc_pages and remove all the now unused code from __process_pages_contig. Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 151 ++++++++++++++++++++------------------------------- fs/btrfs/extent_io.h | 1 - 2 files changed, 59 insertions(+), 93 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 662d991fb45f..7925c8ba3571 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -197,18 +197,9 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) } } -/* - * Process one page for __process_pages_contig(). - * - * Return >0 if we hit @page == @locked_page. - * Return 0 if we updated the page status. - * Return -EGAIN if the we need to try again. 
- * (For PAGE_LOCK case but got dirty page or page not belong to mapping) - */ -static int process_one_page(struct btrfs_fs_info *fs_info, - struct address_space *mapping, - struct page *page, struct page *locked_page, - unsigned long page_ops, u64 start, u64 end) +static void process_one_page(struct btrfs_fs_info *fs_info, + struct page *page, struct page *locked_page, + unsigned long page_ops, u64 start, u64 end) { u32 len; @@ -224,94 +215,36 @@ static int process_one_page(struct btrfs_fs_info *fs_info, if (page_ops & PAGE_END_WRITEBACK) btrfs_page_clamp_clear_writeback(fs_info, page, start, len); - if (page == locked_page) - return 1; - - if (page_ops & PAGE_LOCK) { - int ret; - - ret = btrfs_page_start_writer_lock(fs_info, page, start, len); - if (ret) - return ret; - if (!PageDirty(page) || page->mapping != mapping) { - btrfs_page_end_writer_lock(fs_info, page, start, len); - return -EAGAIN; - } - } - if (page_ops & PAGE_UNLOCK) + if (page != locked_page && (page_ops & PAGE_UNLOCK)) btrfs_page_end_writer_lock(fs_info, page, start, len); - return 0; } -static int __process_pages_contig(struct address_space *mapping, - struct page *locked_page, - u64 start, u64 end, unsigned long page_ops, - u64 *processed_end) +static void __process_pages_contig(struct address_space *mapping, + struct page *locked_page, u64 start, u64 end, + unsigned long page_ops) { struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; - unsigned long pages_processed = 0; struct folio_batch fbatch; - int err = 0; int i; - if (page_ops & PAGE_LOCK) { - ASSERT(page_ops == PAGE_LOCK); - ASSERT(processed_end && *processed_end == start); - } - folio_batch_init(&fbatch); while (index <= end_index) { int found_folios; found_folios = filemap_get_folios_contig(mapping, &index, end_index, &fbatch); - - if (found_folios == 0) { - /* - * Only if we're going to lock these pages, we can find - * nothing at @index. - */ - ASSERT(page_ops & PAGE_LOCK); - err = -EAGAIN; - goto out; - } - for (i = 0; i < found_folios; i++) { - int process_ret; struct folio *folio = fbatch.folios[i]; - process_ret = process_one_page(fs_info, mapping, - &folio->page, locked_page, page_ops, - start, end); - if (process_ret < 0) { - err = -EAGAIN; - folio_batch_release(&fbatch); - goto out; - } - pages_processed += folio_nr_pages(folio); + + process_one_page(fs_info, &folio->page, locked_page, + page_ops, start, end); } folio_batch_release(&fbatch); cond_resched(); } -out: - if (err && processed_end) { - /* - * Update @processed_end. I know this is awful since it has - * two different return value patterns (inclusive vs exclusive). - * - * But the exclusive pattern is necessary if @start is 0, or we - * underflow and check against processed_end won't work as - * expected. 
- */ - if (pages_processed) - *processed_end = min(end, - ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1); - else - *processed_end = start; - } - return err; } static noinline void __unlock_for_delalloc(struct inode *inode, @@ -326,29 +259,63 @@ return; __process_pages_contig(inode->i_mapping, locked_page, start, end, - PAGE_UNLOCK, NULL); + PAGE_UNLOCK); } static noinline int lock_delalloc_pages(struct inode *inode, struct page *locked_page, - u64 delalloc_start, - u64 delalloc_end) + u64 start, + u64 end) { - unsigned long index = delalloc_start >> PAGE_SHIFT; - unsigned long end_index = delalloc_end >> PAGE_SHIFT; - u64 processed_end = delalloc_start; - int ret; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + pgoff_t index = start_index; + u64 processed_end = start; + struct folio_batch fbatch; - ASSERT(locked_page); if (index == locked_page->index && index == end_index) return 0; - ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start, - delalloc_end, PAGE_LOCK, &processed_end); - if (ret == -EAGAIN && processed_end > delalloc_start) - __unlock_for_delalloc(inode, locked_page, delalloc_start, - processed_end); - return ret; + folio_batch_init(&fbatch); + while (index <= end_index) { + unsigned int found_folios, i; + + found_folios = filemap_get_folios_contig(mapping, &index, + end_index, &fbatch); + if (found_folios == 0) + goto out; + + for (i = 0; i < found_folios; i++) { + struct page *page = &fbatch.folios[i]->page; + u32 len = end + 1 - start; + + if (page == locked_page) + continue; + + if (btrfs_page_start_writer_lock(fs_info, page, start, + len)) + goto out; + + if (!PageDirty(page) || page->mapping != mapping) { + btrfs_page_end_writer_lock(fs_info, page, start, + len); + goto out; + } + + processed_end = page_offset(page) + PAGE_SIZE - 1; + } + folio_batch_release(&fbatch); + cond_resched(); + } + + return 0; +out: + folio_batch_release(&fbatch); + if (processed_end > start) + __unlock_for_delalloc(inode, locked_page, start, processed_end); + return -EAGAIN; } /* @@ -467,7 +434,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL); __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, - start, end, page_ops, NULL); + start, end, page_ops); } static bool btrfs_verify_page(struct page *page, u64 start) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c5fae3a7d911..285754154fdc 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -40,7 +40,6 @@ enum { ENUM_BIT(PAGE_START_WRITEBACK), ENUM_BIT(PAGE_END_WRITEBACK), ENUM_BIT(PAGE_SET_ORDERED), - ENUM_BIT(PAGE_LOCK), }; /*
-- cgit v1.2.3

From 6648cedd86135db197410e56b5372b2945f2b311 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:25 +0200 Subject: btrfs: remove btrfs_writepage_endio_finish_ordered

btrfs_writepage_endio_finish_ordered is a small wrapper around btrfs_mark_ordered_io_finished that just changes the argument passing slightly, and adds a tracepoint. Move the tracepoint to btrfs_mark_ordered_io_finished, which means it now also covers the error handling in btrfs_cleanup_ordered_extent, and switch all callers to just call btrfs_mark_ordered_io_finished directly.
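The underlying pattern is generic: when a wrapper exists only to add a trace event, moving the event into the callee covers every caller, including error paths. A minimal sketch with made-up names (not the kernel tracing infrastructure):

#include <stdio.h>

/* stand-in for the tracepoint */
static void trace_io_finished(unsigned long start, unsigned long len)
{
	printf("trace: finished %lu+%lu\n", start, len);
}

/* After the change the callee traces itself, so no wrapper is needed. */
static void mark_io_finished(unsigned long start, unsigned long len)
{
	trace_io_finished(start, len);
	/* ... ordered-extent accounting would go here ... */
}

int main(void)
{
	mark_io_finished(0, 4096);	/* normal completion */
	mark_io_finished(4096, 4096);	/* cleanup/error path is now traced too */
	return 0;
}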
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 --- fs/btrfs/extent_io.c | 17 ++++++++--------- fs/btrfs/inode.c | 9 --------- fs/btrfs/ordered-data.c | 4 ++++ 4 files changed, 12 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d47a927b3504..90e60ad9db62 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -501,9 +501,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page); -void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, - struct page *page, u64 start, - u64 end, bool uptodate); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7925c8ba3571..6a7128318fbe 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -471,17 +471,15 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) struct btrfs_inode *inode; const bool uptodate = (err == 0); int ret = 0; + u32 len = end + 1 - start; + ASSERT(end + 1 - start <= U32_MAX); ASSERT(page && page->mapping); inode = BTRFS_I(page->mapping->host); - btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate); + btrfs_mark_ordered_io_finished(inode, page, start, len, uptodate); if (!uptodate) { const struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 len; - - ASSERT(end + 1 - start <= U32_MAX); - len = end + 1 - start; btrfs_page_clear_uptodate(fs_info, page, start, len); ret = err < 0 ? err : -EIO; @@ -1349,6 +1347,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, bio_ctrl->end_io_func = end_bio_extent_writepage; while (cur <= end) { + u32 len = end - cur + 1; u64 disk_bytenr; u64 em_end; u64 dirty_range_start = cur; @@ -1356,8 +1355,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u32 iosize; if (cur >= i_size) { - btrfs_writepage_endio_finish_ordered(inode, page, cur, - end, true); + btrfs_mark_ordered_io_finished(inode, page, cur, len, + true); /* * This range is beyond i_size, thus we don't need to * bother writing back. @@ -1366,7 +1365,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. 
 */ - btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur); + btrfs_page_clear_dirty(fs_info, page, cur, len); break; } @@ -1377,7 +1376,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, continue; } - em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); + em = btrfs_get_extent(inode, NULL, 0, cur, len); if (IS_ERR(em)) { ret = PTR_ERR_OR_ZERO(em); goto out_error; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aa1cda22a23a..f231e7623f18 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3388,15 +3388,6 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) return btrfs_finish_one_ordered(ordered); } -void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, - struct page *page, u64 start, - u64 end, bool uptodate) -{ - trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - - btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate); -} - /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a629532283bc..109e80ed25b6 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -410,6 +410,10 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, unsigned long flags; u64 cur = file_offset; + trace_btrfs_writepage_end_io_hook(inode, file_offset, + file_offset + num_bytes - 1, + uptodate); + spin_lock_irqsave(&tree->lock, flags); while (cur < file_offset + num_bytes) { u64 entry_end;
-- cgit v1.2.3

From 9783e4deed7291996459858a1a16f41a8988dd60 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:26 +0200 Subject: btrfs: remove end_extent_writepage

end_extent_writepage is a small helper that combines a call to btrfs_mark_ordered_io_finished with conditional error-only calls to btrfs_page_clear_uptodate and mapping_set_error, with a somewhat unfortunate calling convention that passes an inclusive end instead of the len expected by the underlying functions. Remove end_extent_writepage and open code it in the 4 callers. Out of those, two are already error-only and thus don't need the extra conditional, and one already has the mapping_set_error, so a duplicate call can be avoided.

Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 44 +++++++++++++++----------------------------- fs/btrfs/extent_io.h | 2 -- fs/btrfs/inode.c | 42 ++++++++++++++++++++++-------------------- 3 files changed, 37 insertions(+), 51 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6a7128318fbe..b99081a407a4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -464,29 +464,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_subpage_end_reader(fs_info, page, start, len); } -/* lots and lots of room for performance fixes in the end_bio funcs */ - -void end_extent_writepage(struct page *page, int err, u64 start, u64 end) -{ - struct btrfs_inode *inode; - const bool uptodate = (err == 0); - int ret = 0; - u32 len = end + 1 - start; - - ASSERT(end + 1 - start <= U32_MAX); - ASSERT(page && page->mapping); - inode = BTRFS_I(page->mapping->host); - btrfs_mark_ordered_io_finished(inode, page, start, len, uptodate); - - if (!uptodate) { - const struct btrfs_fs_info *fs_info = inode->root->fs_info; - - btrfs_page_clear_uptodate(fs_info, page, start, len); - ret = err < 0 ?
err : -EIO; - mapping_set_error(page->mapping, ret); - } -} - /* * after a writepage IO is done, we need to: * clear the uptodate bits on error @@ -1452,7 +1429,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; const u64 page_start = page_offset(page); - const u64 page_end = page_start + PAGE_SIZE - 1; int ret; int nr = 0; size_t pg_offset; @@ -1496,8 +1472,13 @@ done: set_page_writeback(page); end_page_writeback(page); } - if (ret) - end_extent_writepage(page, ret, page_start, page_end); + if (ret) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start, + PAGE_SIZE, !ret); + btrfs_page_clear_uptodate(btrfs_sb(inode->i_sb), page, + page_start, PAGE_SIZE); + mapping_set_error(page->mapping, ret); + } unlock_page(page); ASSERT(ret <= 0); return ret; @@ -2222,6 +2203,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, while (cur <= end) { u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + u32 cur_len = cur_end + 1 - cur; struct page *page; int nr = 0; @@ -2245,9 +2227,13 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, set_page_writeback(page); end_page_writeback(page); } - if (ret) - end_extent_writepage(page, ret, cur, cur_end); - btrfs_page_unlock_writer(fs_info, page, cur, cur_end + 1 - cur); + if (ret) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, + cur, cur_len, !ret); + btrfs_page_clear_uptodate(fs_info, page, cur, cur_len); + mapping_set_error(page->mapping, ret); + } + btrfs_page_unlock_writer(fs_info, page, cur, cur_len); if (ret < 0) { found_error = true; first_error = ret; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 285754154fdc..8d11e17c0be9 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -276,8 +276,6 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); -void end_extent_writepage(struct page *page, int err, u64 start, u64 end); - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f231e7623f18..eb06c7c8d6c1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -424,11 +424,10 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, while (index <= end_index) { /* - * For locked page, we will call end_extent_writepage() on it - * in run_delalloc_range() for the error handling. That - * end_extent_writepage() function will call - * btrfs_mark_ordered_io_finished() to clear page Ordered and - * run the ordered extent accounting. + * For locked page, we will call btrfs_mark_ordered_io_finished + * through btrfs_mark_ordered_io_finished() on it + * in run_delalloc_range() for the error handling, which will + * clear page Ordered and run the ordered extent accounting. 
 * * Here we can't just clear the Ordered bit, or * btrfs_mark_ordered_io_finished() would skip the accounting @@ -1158,11 +1157,16 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); if (locked_page) { const u64 page_start = page_offset(locked_page); - const u64 page_end = page_start + PAGE_SIZE - 1; set_page_writeback(locked_page); end_page_writeback(locked_page); - end_extent_writepage(locked_page, ret, page_start, page_end); + btrfs_mark_ordered_io_finished(inode, locked_page, + page_start, PAGE_SIZE, + !ret); + btrfs_page_clear_uptodate(inode->root->fs_info, + locked_page, page_start, + PAGE_SIZE); + mapping_set_error(locked_page->mapping, ret); unlock_page(locked_page); } return ret; @@ -2837,23 +2841,19 @@ struct btrfs_writepage_fixup { static void btrfs_writepage_fixup_worker(struct btrfs_work *work) { - struct btrfs_writepage_fixup *fixup; + struct btrfs_writepage_fixup *fixup = + container_of(work, struct btrfs_writepage_fixup, work); struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - struct page *page; - struct btrfs_inode *inode; - u64 page_start; - u64 page_end; + struct page *page = fixup->page; + struct btrfs_inode *inode = fixup->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 page_start = page_offset(page); + u64 page_end = page_offset(page) + PAGE_SIZE - 1; int ret = 0; bool free_delalloc_space = true; - fixup = container_of(work, struct btrfs_writepage_fixup, work); - page = fixup->page; - inode = fixup->inode; - page_start = page_offset(page); - page_end = page_offset(page) + PAGE_SIZE - 1; - /* * This is similar to page_mkwrite, we need to reserve the space before * we take the page lock. @@ -2946,10 +2946,12 @@ out_page: * to reflect the errors and clean the page. */ mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); + btrfs_mark_ordered_io_finished(inode, page, page_start, + PAGE_SIZE, !ret); + btrfs_page_clear_uptodate(fs_info, page, page_start, PAGE_SIZE); clear_page_dirty_for_io(page); } - btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE); + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); unlock_page(page); put_page(page); kfree(fixup);
-- cgit v1.2.3

From 84f262f00910996f6cfd27094b9bca9422c630d4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:27 +0200 Subject: btrfs: reduce debug spam from submit_compressed_extents

Move the printk that is supposed to help debug failures in submit_one_async_extent into submit_one_async_extent and make it conditional on actually having an error condition instead of spamming the log unconditionally.
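A minimal illustration of the logging change (user-space C, invented names): the submit helper owns its failure message and prints it only when something actually failed:

#include <stdio.h>

static int do_submit(unsigned long start, unsigned long len)
{
	/* pretend I/O past the first 4 KiB fails with -EIO */
	return (start + len > 4096) ? -5 : 0;
}

static void submit_one(unsigned long start, unsigned long len)
{
	int ret = do_submit(start, len);

	if (ret)	/* log on error only, instead of for every extent */
		fprintf(stderr, "submit failed start=%lu len=%lu ret=%d\n",
			start, len, ret);
}

int main(void)
{
	submit_one(0, 4096);	/* silent */
	submit_one(4096, 4096);	/* logged */
	return 0;
}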
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index eb06c7c8d6c1..bd62985c3520 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1179,11 +1179,11 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, return ret; } -static int submit_one_async_extent(struct btrfs_inode *inode, - struct async_chunk *async_chunk, - struct async_extent *async_extent, - u64 *alloc_hint) +static void submit_one_async_extent(struct async_chunk *async_chunk, + struct async_extent *async_extent, + u64 *alloc_hint) { + struct btrfs_inode *inode = async_chunk->inode; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1276,7 +1276,7 @@ done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); kfree(async_extent); - return ret; + return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1290,7 +1290,13 @@ out_free: PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); free_async_extent_pages(async_extent); - goto done; + if (async_chunk->blkcg_css) + kthread_associate_blkcg(NULL); + btrfs_debug(fs_info, +"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", + root->root_key.objectid, btrfs_ino(inode), start, + async_extent->ram_size, ret); + kfree(async_extent); } /* @@ -1300,28 +1306,15 @@ out_free: */ static noinline void submit_compressed_extents(struct async_chunk *async_chunk) { - struct btrfs_inode *inode = async_chunk->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_extent *async_extent; u64 alloc_hint = 0; - int ret = 0; while (!list_empty(&async_chunk->extents)) { - u64 extent_start; - u64 ram_size; - async_extent = list_entry(async_chunk->extents.next, struct async_extent, list); list_del(&async_extent->list); - extent_start = async_extent->start; - ram_size = async_extent->ram_size; - ret = submit_one_async_extent(inode, async_chunk, async_extent, - &alloc_hint); - btrfs_debug(fs_info, -"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", - inode->root->root_key.objectid, - btrfs_ino(inode), extent_start, ram_size, ret); + submit_one_async_extent(async_chunk, async_extent, &alloc_hint); } } -- cgit v1.2.3 From ff20d6a4a9ed1b6e0ed78953830cff9fa8b9dbfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:28 +0200 Subject: btrfs: remove the return value from submit_uncompressed_range The return value from submit_uncompressed_range is ignored, and that's fine because the error reporting happens through the mapping and ordered_extent. 
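The same idea in a self-contained sketch (hypothetical names, not the kernel interfaces): once errors travel through a side channel, the function can return void and no caller is tempted to check a meaningless value:

#include <stdio.h>

static int mapping_error;	/* stand-in for mapping_set_error() state */

/* Errors are recorded in the side channel instead of being returned. */
static void submit_range(unsigned long len)
{
	int ret = (len == 0) ? -22 /* -EINVAL */ : 0;

	if (ret)
		mapping_error = ret;	/* surfaces later, e.g. at fsync time */
}

int main(void)
{
	submit_range(4096);
	submit_range(0);
	printf("deferred error: %d\n", mapping_error);
	return 0;
}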
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bd62985c3520..dad0a6113a04 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1124,9 +1124,9 @@ static void free_async_extent_pages(struct async_extent *async_extent) async_extent->pages = NULL; } -static int submit_uncompressed_range(struct btrfs_inode *inode, - struct async_extent *async_extent, - struct page *locked_page) +static void submit_uncompressed_range(struct btrfs_inode *inode, + struct async_extent *async_extent, + struct page *locked_page) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1151,7 +1151,7 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, &nr_written, NULL, true, false); /* Inline extent inserted, page gets unlocked and everything is done */ if (page_started) - return 0; + return; if (ret < 0) { btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); @@ -1169,14 +1169,13 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, mapping_set_error(locked_page->mapping, ret); unlock_page(locked_page); } - return ret; + return; } /* All pages will be unlocked, including @locked_page */ wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = extent_write_locked_range(&inode->vfs_inode, start, end, &wbc); + extent_write_locked_range(&inode->vfs_inode, start, end, &wbc); wbc_detach_inode(&wbc); - return ret; } static void submit_one_async_extent(struct async_chunk *async_chunk, @@ -1213,7 +1212,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, /* We have fall back to uncompressed write */ if (!async_extent->pages) { - ret = submit_uncompressed_range(inode, async_extent, locked_page); + submit_uncompressed_range(inode, async_extent, locked_page); goto done; } -- cgit v1.2.3 From 0835d1e66e7f22428bbc2ea56cb488a7759b51e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:29 +0200 Subject: btrfs: remove the return value from extent_write_locked_range The return value from extent_write_locked_range is ignored, and that's fine because the error reporting happens through the mapping and ordered_extent. Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 13 +++---------- fs/btrfs/extent_io.h | 4 ++-- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b99081a407a4..4981058d3877 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2180,11 +2180,10 @@ retry: * already been ran (aka, ordered extent inserted) and all pages are still * locked. 
 */ -int extent_write_locked_range(struct inode *inode, u64 start, u64 end, - struct writeback_control *wbc) +void extent_write_locked_range(struct inode *inode, u64 start, u64 end, + struct writeback_control *wbc) { bool found_error = false; - int first_error = 0; int ret = 0; struct address_space *mapping = inode->i_mapping; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2234,20 +2233,14 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, mapping_set_error(page->mapping, ret); } btrfs_page_unlock_writer(fs_info, page, cur, cur_len); - if (ret < 0) { + if (ret < 0) found_error = true; - first_error = ret; - } next_page: put_page(page); cur = cur_end + 1; } submit_write_bio(&bio_ctrl, found_error ? ret : 0); - - if (found_error) - return first_error; - return ret; } int extent_writepages(struct address_space *mapping, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 8d11e17c0be9..0312022bbf4b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -177,8 +177,8 @@ int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); -int extent_write_locked_range(struct inode *inode, u64 start, u64 end, - struct writeback_control *wbc); +void extent_write_locked_range(struct inode *inode, u64 start, u64 end, + struct writeback_control *wbc); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping,
-- cgit v1.2.3

From 2c73162d649670045415537dc746f7524cda1405 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:30 +0200 Subject: btrfs: improve the delalloc_to_write calculation in writepage_delalloc

Currently writepage_delalloc adds to delalloc_to_write in every loop operation. That is not only more work than doing it once after the loop, but can also over-increment the counter due to rounding errors when a new loop iteration starts with an offset into a page. Add a new page_start variable instead of recalculating that value over and over, move the delalloc_to_write calculation out of the loop, use the DIV_ROUND_UP helper instead of open coding it, and remove the pointless found local variable.
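The over-increment is easy to reproduce outside the kernel. In this sketch (assuming 4 KiB pages; not the kernel code) a delalloc region is found as two ranges that split inside the first page, so per-iteration rounding counts the shared page twice:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* one contiguous region found as two ranges, split mid-page */
	unsigned long r1_start = 0,    r1_end = 2047;	/* ends inclusive */
	unsigned long r2_start = 2048, r2_end = 8191;
	unsigned long per_iter = 0, once;

	/* old way: round up once per loop iteration */
	per_iter += (r1_end - r1_start + PAGE_SIZE) / PAGE_SIZE;	/* 1 */
	per_iter += (r2_end - r2_start + PAGE_SIZE) / PAGE_SIZE;	/* 2 */

	/* new way: one rounded division over the whole span */
	once = DIV_ROUND_UP(r2_end + 1 - r1_start, PAGE_SIZE);

	/* prints 3 vs 2: the shared page was counted twice */
	printf("per-iteration: %lu, once after the loop: %lu\n",
	       per_iter, once);
	return 0;
}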
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4981058d3877..148d289f70f0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1185,8 +1185,10 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc) { - const u64 page_end = page_offset(page) + PAGE_SIZE - 1; - u64 delalloc_start = page_offset(page); + const u64 page_start = page_offset(page); + const u64 page_end = page_start + PAGE_SIZE - 1; + u64 delalloc_start = page_start; + u64 delalloc_end = page_end; u64 delalloc_to_write = 0; /* How many pages are started by btrfs_run_delalloc_range() */ unsigned long nr_written = 0; @@ -1194,13 +1196,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, int page_started = 0; while (delalloc_start < page_end) { - u64 delalloc_end = page_end; - bool found; - - found = find_lock_delalloc_range(&inode->vfs_inode, page, - &delalloc_start, - &delalloc_end); - if (!found) { + delalloc_end = page_end; + if (!find_lock_delalloc_range(&inode->vfs_inode, page, - &delalloc_start, &delalloc_end)) { + &delalloc_start, &delalloc_end)) { delalloc_start = delalloc_end + 1; continue; } @@ -1209,14 +1207,15 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, if (ret) return ret; - /* - * delalloc_end is already one less than the total length, so - * we don't subtract one from PAGE_SIZE - */ - delalloc_to_write += (delalloc_end - delalloc_start + - PAGE_SIZE) >> PAGE_SHIFT; delalloc_start = delalloc_end + 1; } + + /* + * delalloc_end is already one less than the total length, so + * we don't subtract one from PAGE_SIZE + */ + delalloc_to_write += + DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); if (wbc->nr_to_write < delalloc_to_write) { int thresh = 8192;
-- cgit v1.2.3

From c56cbe90599338c0aaa84cab0e99a9e8a8501a38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:31 +0200 Subject: btrfs: reduce the number of arguments to btrfs_run_delalloc_range

Instead of a separate page_started argument that tells the callers that btrfs_run_delalloc_range already started writeback by itself, overload the return value with a positive 1 in addition to 0 and a negative error code to indicate that it has already started writeback, and remove the nr_written argument as the caller can calculate it directly based on the range, and in fact already does so for the case where writeback wasn't started yet.
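A minimal sketch of the new contract (hypothetical names): the return value alone distinguishes "writeback already started" from plain success and from errors:

#include <stdio.h>

/*
 * Returns 1 if writeback was already started (pages unlocked),
 * 0 on success with pages still locked, or a negative errno.
 */
static int run_range(unsigned long start, unsigned long end)
{
	if (end < start)
		return -22;	/* -EINVAL */
	if (start == 0)
		return 1;	/* pretend we started writeback ourselves */
	return 0;
}

int main(void)
{
	int ret = run_range(0, 8191);

	if (ret < 0)
		fprintf(stderr, "error: %d\n", ret);
	else if (ret == 1)
		printf("writeback already started, nothing more to do\n");
	else
		printf("caller continues and writes the pages itself\n");
	return 0;
}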
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 +- fs/btrfs/extent_io.c | 30 +++++++++--------- fs/btrfs/inode.c | 85 +++++++++++++++++--------------------------------- 3 files changed, 43 insertions(+), 75 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 90e60ad9db62..bda1fdbba666 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -498,8 +498,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, struct writeback_control *wbc); + u64 start, u64 end, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 148d289f70f0..08fb26d58172 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1190,10 +1190,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 delalloc_start = page_start; u64 delalloc_end = page_end; u64 delalloc_to_write = 0; - /* How many pages are started by btrfs_run_delalloc_range() */ - unsigned long nr_written = 0; - int ret; - int page_started = 0; + int ret = 0; while (delalloc_start < page_end) { delalloc_end = page_end; @@ -1202,9 +1199,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, delalloc_start = delalloc_end + 1; continue; } + ret = btrfs_run_delalloc_range(inode, page, delalloc_start, - delalloc_end, &page_started, &nr_written, wbc); - if (ret) + delalloc_end, wbc); + if (ret < 0) return ret; delalloc_start = delalloc_end + 1; @@ -1216,6 +1214,16 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, */ delalloc_to_write += DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); + + /* + * If btrfs_run_dealloc_range() already started I/O and unlocked + * the pages, we just need to account for them here. + */ + if (ret == 1) { + wbc->nr_to_write -= delalloc_to_write; + return 1; + } + if (wbc->nr_to_write < delalloc_to_write) { int thresh = 8192; @@ -1225,16 +1233,6 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, thresh); } - /* Did btrfs_run_dealloc_range() already unlock and start the IO? */ - if (page_started) { - /* - * We've unlocked the page, so we can't update the mapping's - * writeback index, just update nr_to_write. 
- */ - wbc->nr_to_write -= nr_written; - return 1; - } - return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dad0a6113a04..ada70af9c8a2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -127,8 +127,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, u64 *done_offset, + u64 start, u64 end, u64 *done_offset, bool keep_locked, bool no_inline); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, @@ -1130,8 +1129,6 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; - unsigned long nr_written = 0; - int page_started = 0; int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -1147,10 +1144,9 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, * Also we call cow_file_range() with @unlock_page == 0, so that we * can directly submit them without interruption. */ - ret = cow_file_range(inode, locked_page, start, end, &page_started, - &nr_written, NULL, true, false); + ret = cow_file_range(inode, locked_page, start, end, NULL, true, false); /* Inline extent inserted, page gets unlocked and everything is done */ - if (page_started) + if (ret == 1) return; if (ret < 0) { @@ -1360,8 +1356,8 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * * When this function fails, it unlocks all pages except @locked_page. * - * When this function successfully creates an inline extent, it sets page_started - * to 1 and unlocks all pages including locked_page and starts I/O on them. + * When this function successfully creates an inline extent, it returns 1 and + * unlocks all pages including locked_page and starts I/O on them. * (In reality inline extents are limited to a single page, so locked_page is * the only page handled anyway). * @@ -1378,9 +1374,8 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, u64 *done_offset, + struct page *locked_page, u64 start, u64 end, + u64 *done_offset, bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; @@ -1440,9 +1435,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - *nr_written = *nr_written + - (end - start + PAGE_SIZE) / PAGE_SIZE; - *page_started = 1; /* * locked_page is locked by the caller of * writepage_delalloc(), not locked by @@ -1452,11 +1444,11 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * as it doesn't have any subpage::writers recorded. * * Here we manually unlock the page, since the caller - * can't use page_started to determine if it's an - * inline extent or a compressed extent. + * can't determine if it's an inline extent or a + * compressed extent. 
*/ unlock_page(locked_page); - goto out; + return 1; } else if (ret < 0) { goto out_unlock; } @@ -1570,7 +1562,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } -out: return ret; out_drop_extent_cache: @@ -1721,10 +1712,8 @@ static noinline void async_cow_free(struct btrfs_work *work) } static bool run_delalloc_compressed(struct btrfs_inode *inode, - struct writeback_control *wbc, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written) + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); @@ -1806,34 +1795,25 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); - *nr_written += nr_pages; start = cur_end + 1; } - *page_started = 1; return true; } static noinline int run_delalloc_zoned(struct btrfs_inode *inode, struct page *locked_page, u64 start, - u64 end, int *page_started, - unsigned long *nr_written, - struct writeback_control *wbc) + u64 end, struct writeback_control *wbc) { u64 done_offset = end; int ret; bool locked_page_done = false; while (start <= end) { - ret = cow_file_range(inode, locked_page, start, end, page_started, - nr_written, &done_offset, true, false); + ret = cow_file_range(inode, locked_page, start, end, &done_offset, + true, false); if (ret && ret != -EAGAIN) return ret; - if (*page_started) { - ASSERT(ret == 0); - return 0; - } - if (ret == 0) done_offset = end; @@ -1854,9 +1834,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, start = done_offset + 1; } - *page_started = 1; - - return 0; + return 1; } static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, @@ -1889,8 +1867,6 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; - int page_started = 0; - unsigned long nr_written; u64 range_start = start; u64 count; int ret; @@ -1951,9 +1927,8 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * is written out and unlocked directly and a normal NOCOW extent * doesn't work. */ - ret = cow_file_range(inode, locked_page, start, end, &page_started, - &nr_written, NULL, false, true); - ASSERT(!page_started); + ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); + ASSERT(ret != 1); return ret; } @@ -2389,15 +2364,14 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) * being touched for the first time. */ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, unsigned long *nr_written, - struct writeback_control *wbc) + u64 start, u64 end, struct writeback_control *wbc) { - int ret = 0; const bool zoned = btrfs_is_zoned(inode->root->fs_info); + int ret; /* - * The range must cover part of the @locked_page, or the returned - * @page_started can confuse the caller. + * The range must cover part of the @locked_page, or a return of 1 + * can confuse the caller. 
*/ ASSERT(!(end <= page_offset(locked_page) || start >= page_offset(locked_page) + PAGE_SIZE)); @@ -2417,20 +2391,17 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, wbc, locked_page, start, - end, page_started, nr_written)) - goto out; + run_delalloc_compressed(inode, locked_page, start, end, wbc)) + return 1; if (zoned) - ret = run_delalloc_zoned(inode, locked_page, start, end, - page_started, nr_written, wbc); + ret = run_delalloc_zoned(inode, locked_page, start, end, wbc); else - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, NULL, false, false); + ret = cow_file_range(inode, locked_page, start, end, NULL, + false, false); out: - ASSERT(ret <= 0); - if (ret) + if (ret < 0) btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); return ret; -- cgit v1.2.3 From 67583468086c2a12602b915184118c4863402b02 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:32 +0200 Subject: btrfs: clean up the check for uncompressed ranges in submit_one_async_extent Instead of checking for a NULL !pages and explaining this with a cryptic comment, just check the compression type for BTRFS_COMPRESS_NONE to make the check self-explanatory. Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ada70af9c8a2..27fdfd08a3ef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1206,8 +1206,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, } lock_extent(io_tree, start, end, NULL); - /* We have fall back to uncompressed write */ - if (!async_extent->pages) { + if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { submit_uncompressed_range(inode, async_extent, locked_page); goto done; } -- cgit v1.2.3 From 3134508e47df8f721d1bc6a31980d10cb71f3f18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:33 +0200 Subject: btrfs: don't clear async_chunk->inode in async_cow_start Now that the ->inode check isn't needed in submit_compressed_extents any more, there is no reason to clear the field early. Always keep the inode around until the work item is finished and remove the special casing, and the counting of compressed extents in compress_file_range. Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 27fdfd08a3ef..76496b084f5e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -830,7 +830,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, * are written in the same order that the flusher thread sent them * down. 
*/ -static noinline int compress_file_range(struct async_chunk *async_chunk) +static noinline void compress_file_range(struct async_chunk *async_chunk) { struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -848,7 +848,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) int i; int will_compress; int compress_type = fs_info->compress_type; - int compressed_extents = 0; int redirty = 0; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); @@ -1025,7 +1024,7 @@ cont: } kfree(pages); } - return 0; + return; } } @@ -1044,8 +1043,6 @@ cont: */ total_in = round_up(total_in, fs_info->sectorsize); if (total_compressed + blocksize <= total_in) { - compressed_extents++; - /* * The async work queues will take care of doing actual * allocation on disk for these compressed pages, and @@ -1061,7 +1058,7 @@ cont: cond_resched(); goto again; } - return compressed_extents; + return; } } if (pages) { @@ -1102,9 +1099,6 @@ cleanup_and_bail_uncompressed: extent_range_redirty_for_io(&inode->vfs_inode, start, end); add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); - compressed_extents++; - - return compressed_extents; } static void free_async_extent_pages(struct async_extent *async_extent) @@ -1655,15 +1649,9 @@ out_unlock: static noinline void async_cow_start(struct btrfs_work *work) { struct async_chunk *async_chunk; - int compressed_extents; async_chunk = container_of(work, struct async_chunk, work); - - compressed_extents = compress_file_range(async_chunk); - if (compressed_extents == 0) { - btrfs_add_delayed_iput(async_chunk->inode); - async_chunk->inode = NULL; - } + compress_file_range(async_chunk); } /* @@ -1700,8 +1688,7 @@ static noinline void async_cow_free(struct btrfs_work *work) struct async_cow *async_cow; async_chunk = container_of(work, struct async_chunk, work); - if (async_chunk->inode) - btrfs_add_delayed_iput(async_chunk->inode); + btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); -- cgit v1.2.3 From c15d8cf29551ea35a42b4c16cfd08388680be59c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:34 +0200 Subject: btrfs: merge async_cow_start and compress_file_range There is no good reason to have the simple async_cow_start wrapper, merge the argument conversion into the main compress_file_range function. Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 43 ++++++++++++++++--------------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 76496b084f5e..065d29efc7a0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -814,24 +814,22 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, } /* - * we create compressed extents in two phases. The first - * phase compresses a range of pages that have already been - * locked (both pages and state bits are locked). + * Work queue call back to started compression on a file and pages. * - * This is done inside an ordered work queue, and the compression - * is spread across many cpus. The actual IO submission is step - * two, and the ordered work queue takes care of making sure that - * happens in the same order things were put onto the queue by - * writepages and friends. + * This is done inside an ordered work queue, and the compression is spread + * across many cpus. 
The actual IO submission is step two, and the ordered work + * queue takes care of making sure that happens in the same order things were + * put onto the queue by writepages and friends. * - * If this code finds it can't get good compression, it puts an - * entry onto the work queue to write the uncompressed bytes. This - * makes sure that both compressed inodes and uncompressed inodes - * are written in the same order that the flusher thread sent them - * down. + * If this code finds it can't get good compression, it puts an entry onto the + * work queue to write the uncompressed bytes. This makes sure that both + * compressed inodes and uncompressed inodes are written in the same order that + * the flusher thread sent them down. */ -static noinline void compress_file_range(struct async_chunk *async_chunk) +static void compress_file_range(struct btrfs_work *work) { + struct async_chunk *async_chunk = + container_of(work, struct async_chunk, work); struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; @@ -1644,18 +1642,9 @@ out_unlock: } /* - * work queue call back to started compression on a file and pages - */ -static noinline void async_cow_start(struct btrfs_work *work) -{ - struct async_chunk *async_chunk; - - async_chunk = container_of(work, struct async_chunk, work); - compress_file_range(async_chunk); -} - -/* - * work queue call back to submit previously compressed pages + * Phase two of compressed writeback. This is the ordered portion of the code, + * which only gets called in the order the work was queued. We walk all the + * async extents created by compress_file_range and send them down to the disk. */ static noinline void async_cow_submit(struct btrfs_work *work) { @@ -1773,7 +1762,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, async_chunk[i].blkcg_css = NULL; } - btrfs_init_work(&async_chunk[i].work, async_cow_start, + btrfs_init_work(&async_chunk[i].work, compress_file_range, async_cow_submit, async_cow_free); nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); -- cgit v1.2.3 From 00d31d17663c55de99c1fc1224d06009c4799fc1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:35 +0200 Subject: btrfs: merge submit_compressed_extents and async_cow_submit The code in submit_compressed_extents just loops over the async_extents, and doesn't need to be conditional on an inode being present, as there won't be any async_extent in the list if we created an inline extent. Merge the two functions to simplify the logic. Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 39 ++++++++++----------------------------- 1 file changed, 10 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 065d29efc7a0..b119d828cc4e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1285,25 +1285,6 @@ out_free: kfree(async_extent); } -/* - * Phase two of compressed writeback. This is the ordered portion of the code, - * which only gets called in the order the work was queued. We walk all the - * async extents created by compress_file_range and send them down to the disk.
- */ -static noinline void submit_compressed_extents(struct async_chunk *async_chunk) -{ - struct async_extent *async_extent; - u64 alloc_hint = 0; - - while (!list_empty(&async_chunk->extents)) { - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); - list_del(&async_extent->list); - - submit_one_async_extent(async_chunk, async_extent, &alloc_hint); - } -} - static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, u64 num_bytes) { @@ -1646,24 +1627,24 @@ out_unlock: * which only gets called in the order the work was queued. We walk all the * async extents created by compress_file_range and send them down to the disk. */ -static noinline void async_cow_submit(struct btrfs_work *work) +static noinline void submit_compressed_extents(struct btrfs_work *work) { struct async_chunk *async_chunk = container_of(work, struct async_chunk, work); struct btrfs_fs_info *fs_info = btrfs_work_owner(work); + struct async_extent *async_extent; unsigned long nr_pages; + u64 alloc_hint = 0; nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; - /* - * ->inode could be NULL if async_chunk_start has failed to compress, - * in which case we don't have anything to submit, yet we need to - * always adjust ->async_delalloc_pages as its paired with the init - * happening in run_delalloc_compressed - */ - if (async_chunk->inode) - submit_compressed_extents(async_chunk); + while (!list_empty(&async_chunk->extents)) { + async_extent = list_entry(async_chunk->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + submit_one_async_extent(async_chunk, async_extent, &alloc_hint); + } /* atomic_sub_return implies a barrier */ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < @@ -1763,7 +1744,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, } btrfs_init_work(&async_chunk[i].work, compress_file_range, - async_cow_submit, async_cow_free); + submit_compressed_extents, async_cow_free); nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); -- cgit v1.2.3 From e94e54e89b4f9cbbdc2dbbf0c44adae0f4722ab6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:36 +0200 Subject: btrfs: streamline compress_file_range Reorder compress_file_range so that the main compression flow happens straight line and not in branches. To do this ensure that pages is always zeroed before a page allocation happens, which allows the cleanup_and_bail_uncompressed label to clean up the page allocations as needed. 
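To make the cleanup invariant concrete, here is a minimal stand-alone C sketch of the pattern (the names process_range and bail are made up for illustration; this is a model of the control flow, not the btrfs code): the resource pointer is initialized to NULL before any allocation, so a single bail-out label can release it unconditionally on every path, including paths that bail before the allocation happened.

#include <stdlib.h>
#include <string.h>

static int process_range(size_t len)
{
	unsigned char *pages = NULL;	/* zeroed before any allocation */
	int ret = 0;

	if (len == 0)
		goto bail;		/* bailing out pre-allocation is safe */

	pages = calloc(len, 1);
	if (!pages) {
		ret = -1;
		goto bail;
	}
	memset(pages, 0xab, len);	/* straight-line main flow */
bail:
	free(pages);			/* free(NULL) is a defined no-op */
	return ret;
}

int main(void)
{
	return process_range(4096);
}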
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 165 +++++++++++++++++++++++++++---------------------------- 1 file changed, 82 insertions(+), 83 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b119d828cc4e..e994582358e0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -839,10 +839,11 @@ static void compress_file_range(struct btrfs_work *work) u64 actual_end; u64 i_size; int ret = 0; - struct page **pages = NULL; + struct page **pages; unsigned long nr_pages; unsigned long total_compressed = 0; unsigned long total_in = 0; + unsigned int poff; int i; int will_compress; int compress_type = fs_info->compress_type; @@ -865,6 +866,7 @@ static void compress_file_range(struct btrfs_work *work) actual_end = min_t(u64, i_size, end + 1); again: will_compress = 0; + pages = NULL; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); @@ -908,66 +910,64 @@ again: ret = 0; /* - * we do compression for mount -o compress and when the - * inode has not been flagged as nocompress. This flag can - * change at any time if we discover bad compression ratios. + * We do compression for mount -o compress and when the inode has not + * been flagged as NOCOMPRESS. This flag can change at any time if we + * discover bad compression ratios. */ - if (inode_need_compress(inode, start, end)) { - WARN_ON(pages); - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) { - /* just bail out to the uncompressed code */ - nr_pages = 0; - goto cont; - } - - if (inode->defrag_compress) - compress_type = inode->defrag_compress; - else if (inode->prop_compress) - compress_type = inode->prop_compress; + if (!inode_need_compress(inode, start, end)) + goto cont; + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) { /* - * we need to call clear_page_dirty_for_io on each - * page in the range. Otherwise applications with the file - * mmap'd can wander in and change the page contents while - * we are compressing them. - * - * If the compression fails for any reason, we set the pages - * dirty again later on. - * - * Note that the remaining part is redirtied, the start pointer - * has moved, the end is the original one. + * Memory allocation failure is not a fatal error, we can fall + * back to uncompressed code. */ - if (!redirty) { - extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); - redirty = 1; - } - - /* Compression level is applied here and only here */ - ret = btrfs_compress_pages( - compress_type | (fs_info->compress_level << 4), - mapping, start, - pages, - &nr_pages, - &total_in, - &total_compressed); + nr_pages = 0; + goto cont; + } - if (!ret) { - unsigned long offset = offset_in_page(total_compressed); - struct page *page = pages[nr_pages - 1]; + if (inode->defrag_compress) + compress_type = inode->defrag_compress; + else if (inode->prop_compress) + compress_type = inode->prop_compress; - /* zero the tail end of the last page, we might be - * sending it down to disk - */ - if (offset) - memzero_page(page, offset, PAGE_SIZE - offset); - will_compress = 1; - } + /* + * We need to call clear_page_dirty_for_io on each page in the range. + * Otherwise applications with the file mmap'd can wander in and change + * the page contents while we are compressing them. + * + * If the compression fails for any reason, we set the pages dirty again + * later on. 
+ * + * Note that the remaining part is redirtied, the start pointer has + * moved, the end is the original one. + */ + if (!redirty) { + extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + redirty = 1; } + + /* Compression level is applied here. */ + ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), + mapping, start, pages, &nr_pages, &total_in, + &total_compressed); + if (ret) + goto cont; + + /* + * Zero the tail end of the last page, as we might be sending it down + * to disk. + */ + poff = offset_in_page(total_compressed); + if (poff) + memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); + will_compress = 1; + cont: /* * Check cow_file_range() for why we don't even try to create inline - * extent for subpage case. + * extent for the subpage case. */ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { /* lets try to make an inline extent */ @@ -1026,39 +1026,38 @@ cont: } } - if (will_compress) { - /* - * we aren't doing an inline extent round the compressed size - * up to a block size boundary so the allocator does sane - * things - */ - total_compressed = ALIGN(total_compressed, blocksize); + if (!will_compress) + goto cleanup_and_bail_uncompressed; - /* - * one last check to make sure the compression is really a - * win, compare the page count read with the blocks on disk, - * compression must free at least one sector size - */ - total_in = round_up(total_in, fs_info->sectorsize); - if (total_compressed + blocksize <= total_in) { - /* - * The async work queues will take care of doing actual - * allocation on disk for these compressed pages, and - * will submit them to the elevator. - */ - add_async_extent(async_chunk, start, total_in, - total_compressed, pages, nr_pages, - compress_type); - - if (start + total_in < end) { - start += total_in; - pages = NULL; - cond_resched(); - goto again; - } - return; - } + /* + * We aren't doing an inline extent. Round the compressed size up to a + * block size boundary so the allocator does sane things. + */ + total_compressed = ALIGN(total_compressed, blocksize); + + /* + * One last check to make sure the compression is really a win, compare + * the page count read with the blocks on disk, compression must free at + * least one sector. + */ + total_in = round_up(total_in, fs_info->sectorsize); + if (total_compressed + blocksize > total_in) + goto cleanup_and_bail_uncompressed; + + /* + * The async work queues will take care of doing actual allocation on + * disk for these compressed pages, and will submit the bios. + */ + add_async_extent(async_chunk, start, total_in, total_compressed, pages, + nr_pages, compress_type); + if (start + total_in < end) { + start += total_in; + cond_resched(); + goto again; } + return; + +cleanup_and_bail_uncompressed: if (pages) { /* * the compression code ran but failed to make things smaller, @@ -1079,7 +1078,7 @@ cont: inode->flags |= BTRFS_INODE_NOCOMPRESS; } } -cleanup_and_bail_uncompressed: + /* * No compression, but we still need to write the pages in the file * we've been given so far. 
redirty the locked page if it corresponds -- cgit v1.2.3 From 6a7167bf9c9258a214bb0526bd3ecbc96db893ef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:37 +0200 Subject: btrfs: further simplify the compress or not logic in compress_file_range Currently the logic whether to compress or not in compress_file_range is a bit convoluted because it tries to share code for creating inline extents for the compressible [1] path and the bail to uncompressed path. But the latter isn't needed at all, because cow_file_range as called by submit_uncompressed_range will already create inline extents as needed, so there is no need to have special handling for it if we can live with the fact that it will be called a bit later in the ->ordered_func of the workqueue instead of right now. [1] there is undocumented logic that creates an uncompressed inline extent outside of the shall not compress logic if total_in is too small. This logic isn't explained in comments or any commit log I could find, so I've preserved it. Documentation explaining it would be appreciated if anyone understands this code. Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 48 ++++++++++++++++-------------------------------- 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e994582358e0..8307d2a3083a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -845,7 +845,6 @@ static void compress_file_range(struct btrfs_work *work) unsigned long total_in = 0; unsigned int poff; int i; - int will_compress; int compress_type = fs_info->compress_type; int redirty = 0; @@ -865,7 +864,6 @@ static void compress_file_range(struct btrfs_work *work) barrier(); actual_end = min_t(u64, i_size, end + 1); again: - will_compress = 0; pages = NULL; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); @@ -915,7 +913,7 @@ again: * discover bad compression ratios. */ if (!inode_need_compress(inode, start, end)) - goto cont; + goto cleanup_and_bail_uncompressed; pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { @@ -923,8 +921,7 @@ again: * Memory allocation failure is not a fatal error, we can fall * back to uncompressed code. */ - nr_pages = 0; - goto cont; + goto cleanup_and_bail_uncompressed; } if (inode->defrag_compress) @@ -953,7 +950,7 @@ again: mapping, start, pages, &nr_pages, &total_in, &total_compressed); if (ret) - goto cont; + goto cleanup_and_bail_uncompressed; /* * Zero the tail end of the last page, as we might be sending it down @@ -962,24 +959,22 @@ again: poff = offset_in_page(total_compressed); if (poff) memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); - will_compress = 1; -cont: /* + * Try to create an inline extent. + * + * If we didn't compress the entire range, try to create an uncompressed + * inline extent, else a compressed one. + * * Check cow_file_range() for why we don't even try to create inline * extent for the subpage case. */ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { - /* lets try to make an inline extent */ - if (ret || total_in < actual_end) { - /* we didn't compress the entire range, try - * to make an uncompressed inline extent. 
- */ - ret = cow_file_range_inline(inode, actual_end, - 0, BTRFS_COMPRESS_NONE, - NULL, false); + if (total_in < actual_end) { + ret = cow_file_range_inline(inode, actual_end, 0, + BTRFS_COMPRESS_NONE, NULL, + false); } else { - /* try making a compressed inline extent */ ret = cow_file_range_inline(inode, actual_end, total_compressed, compress_type, pages, @@ -1009,26 +1004,15 @@ cont: PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - - /* - * Ensure we only free the compressed pages if we have - * them allocated, as we can still reach here with - * inode_need_compress() == false. - */ - if (pages) { - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - put_page(pages[i]); - } - kfree(pages); + for (i = 0; i < nr_pages; i++) { + WARN_ON(pages[i]->mapping); + put_page(pages[i]); } + kfree(pages); return; } } - if (!will_compress) - goto cleanup_and_bail_uncompressed; - /* * We aren't doing an inline extent. Round the compressed size up to a * block size boundary so the allocator does sane things. -- cgit v1.2.3 From 184aa1ffa5fdf77853488c7d7fddb113cf2e33fe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:38 +0200 Subject: btrfs: use a separate label for the incompressible case in compress_file_range compress_file_range can fail to compress either because of resource or alignment constraints or because the data is incompressible. In the latter case the inode is marked so that compression isn't tried again. Currently that check is based on the condition that the pages array has been allocated which is rather cryptic. Use a separate label to clearly distinguish this case. Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8307d2a3083a..db0dc223c4c6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -950,7 +950,7 @@ again: mapping, start, pages, &nr_pages, &total_in, &total_compressed); if (ret) - goto cleanup_and_bail_uncompressed; + goto mark_incompressible; /* * Zero the tail end of the last page, as we might be sending it down @@ -1026,7 +1026,7 @@ again: */ total_in = round_up(total_in, fs_info->sectorsize); if (total_compressed + blocksize > total_in) - goto cleanup_and_bail_uncompressed; + goto mark_incompressible; /* * The async work queues will take care of doing actual allocation on @@ -1041,6 +1041,9 @@ again: } return; +mark_incompressible: + if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) + inode->flags |= BTRFS_INODE_NOCOMPRESS; cleanup_and_bail_uncompressed: if (pages) { /* @@ -1055,12 +1058,6 @@ cleanup_and_bail_uncompressed: pages = NULL; total_compressed = 0; nr_pages = 0; - - /* flag the file so we don't compress in the future */ - if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && - !(inode->prop_compress)) { - inode->flags |= BTRFS_INODE_NOCOMPRESS; - } } /* -- cgit v1.2.3 From f778b6b8e0136c0da9ffcf560c8cd8619b1cabf4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:39 +0200 Subject: btrfs: share the code to free the page array in compress_file_range compress_file_range has two code blocks to free the page array for the compressed data. Share the code using a goto label. 
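As a sketch of that shared-label pattern (hypothetical names, not the kernel code): multiple exit sites converge on one cleanup block, so the array-freeing logic exists exactly once.

#include <stdlib.h>

static int build_and_check(size_t nr)
{
	void **pages;
	int ret = -1;

	pages = calloc(nr, sizeof(*pages));	/* entries start out NULL */
	if (!pages)
		return -1;

	for (size_t i = 0; i < nr; i++) {
		pages[i] = malloc(64);
		if (!pages[i])
			goto free_pages;	/* exit site 1: allocation failed */
	}
	if (nr < 2)
		goto free_pages;		/* exit site 2: bail out */
	ret = 0;				/* success also falls through */

free_pages:					/* the only place the array is freed */
	for (size_t i = 0; i < nr; i++)
		free(pages[i]);			/* free(NULL) is harmless */
	free(pages);
	return ret;
}

int main(void)
{
	return build_and_check(8);
}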
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index db0dc223c4c6..cda511d76084 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1004,12 +1004,7 @@ again: PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - put_page(pages[i]); - } - kfree(pages); - return; + goto free_pages; } } @@ -1045,21 +1040,6 @@ mark_incompressible: if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) inode->flags |= BTRFS_INODE_NOCOMPRESS; cleanup_and_bail_uncompressed: - if (pages) { - /* - * the compression code ran but failed to make things smaller, - * free any pages it allocated and our page pointer array - */ - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - put_page(pages[i]); - } - kfree(pages); - pages = NULL; - total_compressed = 0; - nr_pages = 0; - } - /* * No compression, but we still need to write the pages in the file * we've been given so far. redirty the locked page if it corresponds @@ -1077,6 +1057,14 @@ cleanup_and_bail_uncompressed: extent_range_redirty_for_io(&inode->vfs_inode, start, end); add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); +free_pages: + if (pages) { + for (i = 0; i < nr_pages; i++) { + WARN_ON(pages[i]->mapping); + put_page(pages[i]); + } + kfree(pages); + } } static void free_async_extent_pages(struct async_extent *async_extent) -- cgit v1.2.3 From 44962ca37c8cf882ba72e4cd67f30eaecb920a52 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:40 +0200 Subject: btrfs: don't redirty pages in compress_file_range compress_file_range needs to clear the dirty bit before handing off work to the compression worker threads to prevent processes coming in through mmap and changing the file contents while the compression is accessing the data (see commit 4adaa611020f ("Btrfs: fix race between mmap writes and compression")). But when compress_file_range decides to not compress the data, it falls back to submit_uncompressed_range which uses extent_write_locked_range to write the uncompressed data. extent_write_locked_range currently expects all pages to be marked dirty so that it can clear the dirty bit itself, and thus compress_file_range has to redirty the page range. Redirtying the page range is rather inefficient and also pointless, so instead pass a pages_dirty parameter to extent_write_locked_range and skip the redirty game entirely. Note that compress_file_range was even redirtying the locked_page twice given that extent_range_clear_dirty_for_io already redirties all pages in the range, which must include locked_page if there is one.
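A toy model of the pages_dirty convention described above (toy_page and the helper names are invented; this is not the kernel API): the caller states whether the pages still carry the dirty bit, so a range whose dirty bits were already cleared before compression can be written out without a redirty-then-clear round trip.

#include <stdbool.h>
#include <stdio.h>

struct toy_page {
	int id;
	bool dirty;
};

/* Write out a locked range; only clear the dirty bit if the caller
 * says the pages still carry it. */
static void write_locked_range(struct toy_page *pages, int nr, bool pages_dirty)
{
	for (int i = 0; i < nr; i++) {
		if (pages_dirty)
			pages[i].dirty = false;	/* clear before I/O */
		printf("write page %d\n", pages[i].id);
	}
}

int main(void)
{
	struct toy_page pages[2] = { { 0, false }, { 1, false } };

	/* Compression path: dirty bits were already cleared up front. */
	write_locked_range(pages, 2, false);
	return 0;
}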
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 29 +++++------------------------ fs/btrfs/extent_io.h | 3 +-- fs/btrfs/inode.c | 43 +++++++++---------------------------------- 3 files changed, 15 insertions(+), 60 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 08fb26d58172..bcf8244d8e1a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -181,22 +181,6 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) } } -void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) -{ - struct address_space *mapping = inode->i_mapping; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct folio *folio; - - while (index <= end_index) { - folio = filemap_get_folio(mapping, index); - filemap_dirty_folio(mapping, folio); - folio_account_redirty(folio); - index += folio_nr_pages(folio); - folio_put(folio); - } -} - static void process_one_page(struct btrfs_fs_info *fs_info, struct page *page, struct page *locked_page, unsigned long page_ops, u64 start, u64 end) @@ -2178,7 +2162,7 @@ retry: * locked. */ void extent_write_locked_range(struct inode *inode, u64 start, u64 end, - struct writeback_control *wbc) + struct writeback_control *wbc, bool pages_dirty) { bool found_error = false; int ret = 0; @@ -2204,14 +2188,11 @@ void extent_write_locked_range(struct inode *inode, u64 start, u64 end, int nr = 0; page = find_get_page(mapping, cur >> PAGE_SHIFT); - /* - * All pages in the range are locked since - * btrfs_run_delalloc_range(), thus there is no way to clear - * the page dirty flag. - */ ASSERT(PageLocked(page)); - ASSERT(PageDirty(page)); - clear_page_dirty_for_io(page); + if (pages_dirty) { + ASSERT(PageDirty(page)); + clear_page_dirty_for_io(page); + } ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl, i_size, &nr); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0312022bbf4b..2678906e87c5 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -178,7 +178,7 @@ int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); void extent_write_locked_range(struct inode *inode, u64 start, u64 end, - struct writeback_control *wbc); + struct writeback_control *wbc, bool pages_dirty); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, @@ -265,7 +265,6 @@ void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); -void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 bits_to_clear, unsigned long page_ops); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cda511d76084..7447f3b44cd2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -846,10 +846,16 @@ static void compress_file_range(struct btrfs_work *work) unsigned int poff; int i; int compress_type = fs_info->compress_type; - int redirty = 0; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); + /* + * We need to call clear_page_dirty_for_io on each page in the range. 
+ * Otherwise applications with the file mmap'd can wander in and change + * the page contents while we are compressing them. + */ + extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + /* * We need to save i_size before now because it could change in between * us evaluating the size and assigning it. This is because we lock and @@ -929,22 +935,6 @@ again: else if (inode->prop_compress) compress_type = inode->prop_compress; - /* - * We need to call clear_page_dirty_for_io on each page in the range. - * Otherwise applications with the file mmap'd can wander in and change - * the page contents while we are compressing them. - * - * If the compression fails for any reason, we set the pages dirty again - * later on. - * - * Note that the remaining part is redirtied, the start pointer has - * moved, the end is the original one. - */ - if (!redirty) { - extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); - redirty = 1; - } - /* Compression level is applied here. */ ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), mapping, start, pages, &nr_pages, &total_in, @@ -1040,21 +1030,6 @@ mark_incompressible: if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) inode->flags |= BTRFS_INODE_NOCOMPRESS; cleanup_and_bail_uncompressed: - /* - * No compression, but we still need to write the pages in the file - * we've been given so far. redirty the locked page if it corresponds - * to our extent and set things up for the async work queue to run - * cow_file_range to do the normal delalloc dance. - */ - if (async_chunk->locked_page && - (page_offset(async_chunk->locked_page) >= start && - page_offset(async_chunk->locked_page)) <= end) { - __set_page_dirty_nobuffers(async_chunk->locked_page); - /* unlocked later on in the async handlers */ - } - - if (redirty) - extent_range_redirty_for_io(&inode->vfs_inode, start, end); add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); free_pages: @@ -1130,7 +1105,7 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, /* All pages will be unlocked, including @locked_page */ wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - extent_write_locked_range(&inode->vfs_inode, start, end, &wbc); + extent_write_locked_range(&inode->vfs_inode, start, end, &wbc, false); wbc_detach_inode(&wbc); } @@ -1754,7 +1729,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, } locked_page_done = true; extent_write_locked_range(&inode->vfs_inode, start, done_offset, - wbc); + wbc, true); start = done_offset + 1; } -- cgit v1.2.3 From 6e144bf16ba07dff649e6bd6afd79d2e353f6216 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:41 +0200 Subject: btrfs: refactor the zoned device handling in cow_file_range Handling of the done_offset to cow_file_range is a bit confusing, as it is not updated at all when the function succeeds, and the -EAGAIN status is used both for the case where we need to wait for a zone finish and the one where the allocation was partially successful. Change the calling convention so that done_offset is always updated, and 0 is returned if some allocation was successful (partial allocation can still only happen for zoned devices), and waiting for a zone finish is done internally in cow_file_range instead of the caller. Also write a comment explaining the logic.
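A self-contained toy of the new calling convention (ZONE_CHUNK and alloc_range are invented for illustration, not btrfs API): the allocator returns 0 whenever it made progress and always reports through *done_offset how far it got, so the caller's loop reduces to write-out-and-resume.

#include <stdint.h>
#include <stdio.h>

#define ZONE_CHUNK 4096ULL	/* pretend at most this much can be allocated per call */

static int alloc_range(uint64_t start, uint64_t end, uint64_t *done_offset)
{
	uint64_t len = end - start + 1;

	if (len > ZONE_CHUNK)
		len = ZONE_CHUNK;	/* partial allocation still counts as success */
	*done_offset = start + len - 1;
	return 0;
}

int main(void)
{
	uint64_t start = 0, end = 16383, done;

	while (start <= end) {
		if (alloc_range(start, end, &done))
			return 1;
		printf("write back [%llu, %llu]\n",
		       (unsigned long long)start, (unsigned long long)done);
		start = done + 1;	/* resume right after the written region */
	}
	return 0;
}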
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/inode.c | 58 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7447f3b44cd2..0617e47fb9e3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1363,7 +1363,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * compressed extent. */ unlock_page(locked_page); - return 1; + ret = 1; + goto done; } else if (ret < 0) { goto out_unlock; } @@ -1394,6 +1395,31 @@ static noinline int cow_file_range(struct btrfs_inode *inode, ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, min_alloc_size, 0, alloc_hint, &ins, 1, 1); + if (ret == -EAGAIN) { + /* + * btrfs_reserve_extent only returns -EAGAIN for zoned + * file systems, which is an indication that there are + * no active zones to allocate from at the moment. + * + * If this is the first loop iteration, wait for at + * least one zone to finish before retrying the + * allocation. Otherwise ask the caller to write out + * the already allocated blocks before coming back to + * us, or return -ENOSPC if it can't handle retries. + */ + ASSERT(btrfs_is_zoned(fs_info)); + if (start == orig_start) { + wait_on_bit_io(&inode->root->fs_info->flags, + BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + continue; + } + if (done_offset) { + *done_offset = start - 1; + return 0; + } + ret = -ENOSPC; + } if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; @@ -1477,6 +1503,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } +done: + if (done_offset) + *done_offset = end; return ret; out_drop_extent_cache: @@ -1485,21 +1514,6 @@ out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: - /* - * If done_offset is non-NULL and ret == -EAGAIN, we expect the - * caller to write out the successfully allocated region and retry. - */ - if (done_offset && ret == -EAGAIN) { - if (orig_start < start) - *done_offset = start - 1; - else - *done_offset = start; - return ret; - } else if (ret == -EAGAIN) { - /* Convert to -ENOSPC since the caller cannot retry. */ - ret = -ENOSPC; - } - /* * Now, we have three regions to clean up: * @@ -1710,19 +1724,9 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, while (start <= end) { ret = cow_file_range(inode, locked_page, start, end, &done_offset, true, false); - if (ret && ret != -EAGAIN) + if (ret) return ret; - if (ret == 0) - done_offset = end; - - if (done_offset == start) { - wait_on_bit_io(&inode->root->fs_info->flags, - BTRFS_FS_NEED_ZONE_FINISH, - TASK_UNINTERRUPTIBLE); - continue; - } - if (!locked_page_done) { __set_page_dirty_nobuffers(locked_page); account_page_redirty(locked_page); -- cgit v1.2.3 From 778b878543f05ddf11843cd44ba9b1e775216fc0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:42 +0200 Subject: btrfs: don't redirty locked_page in run_delalloc_zoned extent_write_locked_range currently expects that either all or no pages are dirty when it is called. But run_delalloc_zoned is called directly in the writepages path, and has the dirty bit cleared only for locked_page, on which extent_write_cache_pages currently operates. It currently works around this by redirtying locked_page, but that is a bit inefficient and cumbersome.
Pass a locked_page argument to run_delalloc_zoned so that clearing the dirty bit can be skipped on just that page. Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 7 ++++--- fs/btrfs/extent_io.h | 5 +++-- fs/btrfs/inode.c | 13 ++++--------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bcf8244d8e1a..90a58a38a9f3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2161,8 +2161,9 @@ retry: * already been ran (aka, ordered extent inserted) and all pages are still * locked. */ -void extent_write_locked_range(struct inode *inode, u64 start, u64 end, - struct writeback_control *wbc, bool pages_dirty) +void extent_write_locked_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, struct writeback_control *wbc, + bool pages_dirty) { bool found_error = false; int ret = 0; @@ -2189,7 +2190,7 @@ void extent_write_locked_range(struct inode *inode, u64 start, u64 end, page = find_get_page(mapping, cur >> PAGE_SHIFT); ASSERT(PageLocked(page)); - if (pages_dirty) { + if (pages_dirty && page != locked_page) { ASSERT(PageDirty(page)); clear_page_dirty_for_io(page); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2678906e87c5..c01f9c5ddc13 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -177,8 +177,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); -void extent_write_locked_range(struct inode *inode, u64 start, u64 end, - struct writeback_control *wbc, bool pages_dirty); +void extent_write_locked_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, struct writeback_control *wbc, + bool pages_dirty); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0617e47fb9e3..29a137d55cf2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1105,7 +1105,8 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, /* All pages will be unlocked, including @locked_page */ wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - extent_write_locked_range(&inode->vfs_inode, start, end, &wbc, false); + extent_write_locked_range(&inode->vfs_inode, NULL, start, end, &wbc, + false); wbc_detach_inode(&wbc); } @@ -1719,7 +1720,6 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, { u64 done_offset = end; int ret; - bool locked_page_done = false; while (start <= end) { ret = cow_file_range(inode, locked_page, start, end, &done_offset, @@ -1727,13 +1727,8 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, if (ret) return ret; - if (!locked_page_done) { - __set_page_dirty_nobuffers(locked_page); - account_page_redirty(locked_page); - } - locked_page_done = true; - extent_write_locked_range(&inode->vfs_inode, start, done_offset, - wbc, true); + extent_write_locked_range(&inode->vfs_inode, locked_page, start, + done_offset, wbc, true); start = done_offset + 1; } -- cgit v1.2.3 From 256b0cf90d2a052f546bc118784574298c5818f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:43 +0200 Subject: btrfs: fix zoned handling in submit_uncompressed_range For zoned file systems we need to use run_delalloc_zoned to submit writeback, as we need to write out 
partial allocations when running into zone active limits. submit_uncompressed_range currently always calls cow_file_range to allocate blocks and thus misses the active zone limits handling. Fix this by passing the pages_dirty argument to run_delalloc_zoned and always using it from submit_uncompressed_range as it does the right thing for zoned and non-zoned file systems. To account for the fact that run_delalloc_zoned is now also used for non-zoned file systems rename it to run_delalloc_cow, and add comment describing it. Fixes: 42c011000963 ("btrfs: zoned: introduce dedicated data write path for zoned filesystems") Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/inode.c | 47 ++++++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 29a137d55cf2..19dc0b8ff427 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -125,10 +125,10 @@ static struct kmem_cache *btrfs_inode_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); -static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, - u64 start, u64 end, u64 *done_offset, - bool keep_locked, bool no_inline); +static noinline int run_delalloc_cow(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc, + bool pages_dirty); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, @@ -1072,18 +1072,9 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, .no_cgroup_owner = 1, }; - /* - * Call cow_file_range() to run the delalloc range directly, since we - * won't go to NOCOW or async path again. - * - * Also we call cow_file_range() with @unlock_page == 0, so that we - * can directly submit them without interruption. - */ - ret = cow_file_range(inode, locked_page, start, end, NULL, true, false); - /* Inline extent inserted, page gets unlocked and everything is done */ - if (ret == 1) - return; - + wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); + ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); + wbc_detach_inode(&wbc); if (ret < 0) { btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); if (locked_page) { @@ -1100,14 +1091,7 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, mapping_set_error(locked_page->mapping, ret); unlock_page(locked_page); } - return; } - - /* All pages will be unlocked, including @locked_page */ - wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - extent_write_locked_range(&inode->vfs_inode, NULL, start, end, &wbc, - false); - wbc_detach_inode(&wbc); } static void submit_one_async_extent(struct async_chunk *async_chunk, @@ -1714,9 +1698,14 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, return true; } -static noinline int run_delalloc_zoned(struct btrfs_inode *inode, - struct page *locked_page, u64 start, - u64 end, struct writeback_control *wbc) +/* + * Run the delalloc range from start to end, and write back any dirty pages + * covered by the range. 
+ */ +static noinline int run_delalloc_cow(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc, + bool pages_dirty) { u64 done_offset = end; int ret; @@ -1726,9 +1715,8 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, true, false); if (ret) return ret; - extent_write_locked_range(&inode->vfs_inode, locked_page, start, - done_offset, wbc, true); + done_offset, wbc, pages_dirty); start = done_offset + 1; } @@ -2293,7 +2281,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page return 1; if (zoned) - ret = run_delalloc_zoned(inode, locked_page, start, end, wbc); + ret = run_delalloc_cow(inode, locked_page, start, end, wbc, + true); else ret = cow_file_range(inode, locked_page, start, end, NULL, false, false); -- cgit v1.2.3 From ed2da9246f324ae88a2dcae629fc2008632ff151 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2023 17:31:44 +0200 Subject: mm: remove folio_account_redirty Fold folio_account_redirty into folio_redirty_for_writepage now that all other users except for the also unused account_page_redirty wrapper are gone. Reviewed-by: Josef Bacik Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/linux/writeback.h | 5 ----- mm/page-writeback.c | 49 ++++++++++++++--------------------------------- 2 files changed, 14 insertions(+), 40 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fba937999fbf..083387c00f0c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -375,11 +375,6 @@ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio); -void folio_account_redirty(struct folio *folio); -static inline void account_page_redirty(struct page *page) -{ - folio_account_redirty(page_folio(page)); -} bool folio_redirty_for_writepage(struct writeback_control *, struct folio *); bool redirty_page_for_writepage(struct writeback_control *, struct page *); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d3f42009bb70..b8d3d7040a50 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1193,7 +1193,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb, * write_bandwidth = --------------------------------------------------- * period * - * @written may have decreased due to folio_account_redirty(). + * @written may have decreased due to folio_redirty_for_writepage(). * Avoid underflowing @bw calculation. */ bw = written - min(written, wb->written_stamp); @@ -2711,37 +2711,6 @@ bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) } EXPORT_SYMBOL(filemap_dirty_folio); -/** - * folio_account_redirty - Manually account for redirtying a page. - * @folio: The folio which is being redirtied. - * - * Most filesystems should call folio_redirty_for_writepage() instead - * of this fuction. If your filesystem is doing writeback outside the - * context of a writeback_control(), it can call this when redirtying - * a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED, - * tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN, - * WB_WRITTEN) in long term. The mismatches will lead to systematic errors - * in balanced_dirty_ratelimit and the dirty pages position control. 
- */ -void folio_account_redirty(struct folio *folio) -{ - struct address_space *mapping = folio->mapping; - - if (mapping && mapping_can_writeback(mapping)) { - struct inode *inode = mapping->host; - struct bdi_writeback *wb; - struct wb_lock_cookie cookie = {}; - long nr = folio_nr_pages(folio); - - wb = unlocked_inode_to_wb_begin(inode, &cookie); - current->nr_dirtied -= nr; - node_stat_mod_folio(folio, NR_DIRTIED, -nr); - wb_stat_mod(wb, WB_DIRTIED, -nr); - unlocked_inode_to_wb_end(inode, &cookie); - } -} -EXPORT_SYMBOL(folio_account_redirty); - /** * folio_redirty_for_writepage - Decline to write a dirty folio. * @wbc: The writeback control. @@ -2757,13 +2726,23 @@ EXPORT_SYMBOL(folio_account_redirty); bool folio_redirty_for_writepage(struct writeback_control *wbc, struct folio *folio) { - bool ret; + struct address_space *mapping = folio->mapping; long nr = folio_nr_pages(folio); + bool ret; wbc->pages_skipped += nr; - ret = filemap_dirty_folio(folio->mapping, folio); - folio_account_redirty(folio); + ret = filemap_dirty_folio(mapping, folio); + if (mapping && mapping_can_writeback(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct wb_lock_cookie cookie = {}; + wb = unlocked_inode_to_wb_begin(inode, &cookie); + current->nr_dirtied -= nr; + node_stat_mod_folio(folio, NR_DIRTIED, -nr); + wb_stat_mod(wb, WB_DIRTIED, -nr); + unlocked_inode_to_wb_end(inode, &cookie); + } return ret; } EXPORT_SYMBOL(folio_redirty_for_writepage); -- cgit v1.2.3 From 7f9879eb6062754a126a874f784f8fa571e767ee Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 27 Jul 2023 21:53:03 +0800 Subject: btrfs: print name and pid when device scanning processes race There is a race between systemd and mount, as both of them try to register the device in the kernel. When systemd loses the race, it prints the following message: BTRFS error: device /dev/sdb7 belongs to fsid 1b3bacbf-14db-49c9-a3ef-547998aacc4e, and the fs is already mounted. The 'btrfs dev scan' registers one device at a time, so there is no way for the mount thread to wait in the kernel for all the devices to have registered as it won't know if all the devices are discovered. For now, improve the error log by printing the command name and process ID along with the error message. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8af0e54099c2..89410a480293 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -851,8 +851,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (fs_devices->opened) { btrfs_err(NULL, - "device %s belongs to fsid %pU, and the fs is already mounted", - path, fs_devices->fsid); +"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", + path, fs_devices->fsid, current->comm, + task_pid_nr(current)); mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } -- cgit v1.2.3 From b9d97cff256f0efafd00f76b8d10d668f64436e9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Jul 2023 16:12:15 -0400 Subject: btrfs: move comments to btrfs_loop_type definition Some of these loop types aren't described, and they should be with the definitions to make it easier to tell what each of them does.
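The phases form an escalation order, which a toy allocator loop makes explicit (this sketch only models the retry structure, not the real find_free_extent logic; try_alloc is invented):

#include <stdio.h>

enum loop_type {
	CACHING_NOWAIT,		/* don't wait for block group caching */
	CACHING_WAIT,		/* wait for caching progress */
	UNSET_SIZE_CLASS,	/* allow unclassified block groups */
	ALLOC_CHUNK,		/* allocate a new chunk, then retry */
	WRONG_SIZE_CLASS,	/* ignore size class restrictions */
	NO_EMPTY_SIZE,		/* last resort: drop the empty-size slack */
};

/* Toy: pretend the allocation succeeds once constraints are relaxed enough. */
static int try_alloc(int phase)
{
	return phase >= ALLOC_CHUNK ? 0 : -1;
}

int main(void)
{
	for (int phase = CACHING_NOWAIT; phase <= NO_EMPTY_SIZE; phase++) {
		if (try_alloc(phase) == 0) {
			printf("allocated in phase %d\n", phase);
			return 0;
		}
	}
	return 1;	/* the real allocator would return -ENOSPC here */
}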
Reviewed-by: Boris Burkov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d674e421ea75..6dfe086523f9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3329,11 +3329,38 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) } enum btrfs_loop_type { + /* + * Start caching block groups but do not wait for progress or for them + * to be done. + */ LOOP_CACHING_NOWAIT, + + /* + * Wait for the block group free_space >= the space we're waiting for if + * the block group isn't cached. + */ LOOP_CACHING_WAIT, + + /* + * Allow allocations to happen from block groups that do not yet have a + * size classification. + */ LOOP_UNSET_SIZE_CLASS, + + /* + * Allocate a chunk and then retry the allocation. + */ LOOP_ALLOC_CHUNK, + + /* + * Ignore the size class restrictions for this allocation. + */ LOOP_WRONG_SIZE_CLASS, + + /* + * Ignore the empty size, only try to allocate the number of bytes + * needed for this allocation. + */ LOOP_NO_EMPTY_SIZE, }; @@ -3927,15 +3954,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, if (ffe_ctl->index < BTRFS_NR_RAID_TYPES) return 1; - /* - * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking - * caching kthreads as we move along - * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching - * LOOP_UNSET_SIZE_CLASS, allow unset size class - * LOOP_ALLOC_CHUNK, force a chunk allocation and try again - * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try - * again - */ + /* See the comments for btrfs_loop_type for an explanation of the phases. */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; /* -- cgit v1.2.3 From 257deed2a955fd73cc1ef53a96583edeab3ece1a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 15 Jul 2023 19:08:27 +0800 Subject: btrfs: tests: enhance extent buffer bitmap tests Enhance extent bitmap tests for the following aspects: - Remove unnecessary @len from __test_eb_bitmaps() We can fetch the length from the extent buffer. - Explicitly distinguish bit and byte length Now every start/len inside the bitmap tests has either a "byte_" or "bit_" prefix to make it more explicit. - Better error reporting If we have mismatched bits, the error report would dump the following contents: * start bytenr * bit number * the full byte from bitmap * the full byte from the extent This is to save developers time so obvious problems can be found immediately. - Extract the bitmap set/clear and check operations into two helpers This is to save some code lines, as we will have more tests to do. - Add new tests The following tests are added, mostly for the incoming extent bitmap accessor refactoring: * Set bits inside the same byte * Clear bits inside the same byte * Cross byte boundary set * Cross byte boundary clear * Cross multi-byte boundary set * Cross multi-byte boundary clear Those new tests have already saved my backend for the incoming extent buffer bitmap refactoring.
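A stand-alone illustration of the (byte_start, bit_start, bit_len) addressing the new helpers use (toy code, not the extent buffer API): a run starting at byte 2, bit 4 with length 8 crosses into byte 3, exactly the class of boundary the new tests exercise.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Set bit_len bits starting at (byte_start, bit_start), LSB-first per byte. */
static void set_bits(uint8_t *map, unsigned long byte_start,
		     unsigned long bit_start, unsigned long bit_len)
{
	unsigned long first = byte_start * 8 + bit_start;

	for (unsigned long i = first; i < first + bit_len; i++)
		map[i / 8] |= 1u << (i % 8);
}

int main(void)
{
	uint8_t a[8] = {0}, b[8] = {0};

	set_bits(a, 2, 4, 8);		/* cross-byte run */
	set_bits(b, 0, 2 * 8 + 4, 8);	/* the same run via a flat bit index */
	assert(memcmp(a, b, sizeof(a)) == 0);
	assert(a[2] == 0xf0 && a[3] == 0x0f);	/* bits 20..27 are set */
	return 0;
}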
Reviewed-by: Sweet Tea Dorminy Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tests/extent-io-tests.c | 158 ++++++++++++++++++++++++++------------- 1 file changed, 107 insertions(+), 51 deletions(-) diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index f6bc6d738555..3e625c558b0b 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -319,86 +319,139 @@ out: return ret; } -static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb, - unsigned long len) +static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb) { unsigned long i; - for (i = 0; i < len * BITS_PER_BYTE; i++) { + for (i = 0; i < eb->len * BITS_PER_BYTE; i++) { int bit, bit1; bit = !!test_bit(i, bitmap); bit1 = !!extent_buffer_test_bit(eb, 0, i); if (bit1 != bit) { - test_err("bits do not match"); + u8 has; + u8 expect; + + read_extent_buffer(eb, &has, i / BITS_PER_BYTE, 1); + expect = bitmap_get_value8(bitmap, ALIGN(i, BITS_PER_BYTE)); + + test_err( + "bits do not match, start byte 0 bit %lu, byte %lu has 0x%02x expect 0x%02x", + i, i / BITS_PER_BYTE, has, expect); return -EINVAL; } bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, i % BITS_PER_BYTE); if (bit1 != bit) { - test_err("offset bits do not match"); + u8 has; + u8 expect; + + read_extent_buffer(eb, &has, i / BITS_PER_BYTE, 1); + expect = bitmap_get_value8(bitmap, ALIGN(i, BITS_PER_BYTE)); + + test_err( + "bits do not match, start byte %lu bit %lu, byte %lu has 0x%02x expect 0x%02x", + i / BITS_PER_BYTE, i % BITS_PER_BYTE, + i / BITS_PER_BYTE, has, expect); return -EINVAL; } } return 0; } -static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, - unsigned long len) +static int test_bitmap_set(const char *name, unsigned long *bitmap, + struct extent_buffer *eb, + unsigned long byte_start, unsigned long bit_start, + unsigned long bit_len) +{ + int ret; + + bitmap_set(bitmap, byte_start * BITS_PER_BYTE + bit_start, bit_len); + extent_buffer_bitmap_set(eb, byte_start, bit_start, bit_len); + ret = check_eb_bitmap(bitmap, eb); + if (ret < 0) + test_err("%s test failed", name); + return ret; +} + +static int test_bitmap_clear(const char *name, unsigned long *bitmap, + struct extent_buffer *eb, + unsigned long byte_start, unsigned long bit_start, + unsigned long bit_len) +{ + int ret; + + bitmap_clear(bitmap, byte_start * BITS_PER_BYTE + bit_start, bit_len); + extent_buffer_bitmap_clear(eb, byte_start, bit_start, bit_len); + ret = check_eb_bitmap(bitmap, eb); + if (ret < 0) + test_err("%s test failed", name); + return ret; +} +static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb) { unsigned long i, j; + unsigned long byte_len = eb->len; u32 x; int ret; - memset(bitmap, 0, len); - memzero_extent_buffer(eb, 0, len); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_err("bitmap was not zeroed"); - return -EINVAL; - } + ret = test_bitmap_clear("clear all run 1", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; - bitmap_set(bitmap, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("setting all bits failed"); + ret = test_bitmap_set("set all", bitmap, eb, 0, 0, byte_len * BITS_PER_BYTE); + if (ret < 0) return ret; - } - bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); - ret = 
check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("clearing all bits failed"); + ret = test_bitmap_clear("clear all run 2", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; + + ret = test_bitmap_set("same byte set", bitmap, eb, 0, 2, 4); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("same byte partial clear", bitmap, eb, 0, 4, 1); + if (ret < 0) + return ret; + + ret = test_bitmap_set("cross byte set", bitmap, eb, 2, 4, 8); + if (ret < 0) + return ret; + + ret = test_bitmap_set("cross multi byte set", bitmap, eb, 4, 4, 24); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("cross byte clear", bitmap, eb, 2, 6, 4); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("cross multi byte clear", bitmap, eb, 4, 6, 20); + if (ret < 0) return ret; - } /* Straddling pages test */ - if (len > PAGE_SIZE) { - bitmap_set(bitmap, - (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, - sizeof(long) * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("setting straddling pages failed"); + if (byte_len > PAGE_SIZE) { + ret = test_bitmap_set("cross page set", bitmap, eb, + PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (ret < 0) return ret; - } - bitmap_set(bitmap, 0, len * BITS_PER_BYTE); - bitmap_clear(bitmap, - (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, + ret = test_bitmap_set("cross page set all", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("cross page clear", bitmap, eb, + PAGE_SIZE - sizeof(long) / 2, 0, sizeof(long) * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("clearing straddling pages failed"); + if (ret < 0) return ret; - } } /* @@ -406,9 +459,12 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, * something repetitive that could miss some hypothetical off-by-n bug. 
*/ x = 0; - bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); - for (i = 0; i < len * BITS_PER_BYTE / 32; i++) { + ret = test_bitmap_clear("clear all run 3", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; + + for (i = 0; i < byte_len * BITS_PER_BYTE / 32; i++) { x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffU; for (j = 0; j < 32; j++) { if (x & (1U << j)) { @@ -418,7 +474,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, } } - ret = check_eb_bitmap(bitmap, eb, len); + ret = check_eb_bitmap(bitmap, eb); if (ret) { test_err("random bit pattern failed"); return ret; @@ -456,7 +512,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) goto out; } - ret = __test_eb_bitmaps(bitmap, eb, nodesize); + ret = __test_eb_bitmaps(bitmap, eb); if (ret) goto out; @@ -473,7 +529,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) goto out; } - ret = __test_eb_bitmaps(bitmap, eb, nodesize); + ret = __test_eb_bitmaps(bitmap, eb); out: free_extent_buffer(eb); kfree(bitmap); -- cgit v1.2.3 From 5864f1da6b165fc4735f02140b04fbfcb6ff27bf Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 15 Jul 2023 19:08:28 +0800 Subject: btrfs: tests: add self tests for extent buffer memory operations The new self tests populate a memory range with random bytes, then copy it to the extent buffer, so that we can verify that the extent buffer memory operations and memmove()/memcpy() produce the same contents. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/tests/extent-io-tests.c | 144 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 3e625c558b0b..1cc86af97dc6 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -648,6 +648,146 @@ out: return ret; } +static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory, + const char *test_name) +{ + for (int i = 0; i < eb->len; i++) { + struct page *page = eb->pages[i >> PAGE_SHIFT]; + void *addr = page_address(page) + offset_in_page(i); + + if (memcmp(addr, memory + i, 1) != 0) { + test_err("%s failed", test_name); + test_err("eb and memory diffs at byte %u, eb has 0x%02x memory has 0x%02x", + i, *(u8 *)addr, *(u8 *)(memory + i)); + return; + } + } +} + +static int verify_eb_and_memory(struct extent_buffer *eb, void *memory, + const char *test_name) +{ + for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) { + void *eb_addr = page_address(eb->pages[i]); + + if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) { + dump_eb_and_memory_contents(eb, memory, test_name); + return -EUCLEAN; + } + } + return 0; +} + +/* + * Init both memory and extent buffer contents to the same randomly generated + * contents. 
+ */ +static void init_eb_and_memory(struct extent_buffer *eb, void *memory) +{ + get_random_bytes(memory, eb->len); + write_extent_buffer(eb, memory, 0, eb->len); +} + +static int test_eb_mem_ops(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info; + struct extent_buffer *eb = NULL; + void *memory = NULL; + int ret; + + test_msg("running extent buffer memory operation tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + memory = kvzalloc(nodesize, GFP_KERNEL); + if (!memory) { + test_err("failed to allocate memory"); + ret = -ENOMEM; + goto out; + } + + eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize); + if (!eb) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = -ENOMEM; + goto out; + } + + init_eb_and_memory(eb, memory); + ret = verify_eb_and_memory(eb, memory, "full eb write"); + if (ret < 0) + goto out; + + memcpy(memory, memory + 16, 16); + memcpy_extent_buffer(eb, 0, 16, 16); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 1"); + if (ret < 0) + goto out; + + memcpy(memory, memory + 2048, 16); + memcpy_extent_buffer(eb, 0, 2048, 16); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 2"); + if (ret < 0) + goto out; + memcpy(memory, memory + 2048, 2048); + memcpy_extent_buffer(eb, 0, 2048, 2048); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 3"); + if (ret < 0) + goto out; + + memmove(memory + 512, memory + 256, 512); + memmove_extent_buffer(eb, 512, 256, 512); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 1"); + if (ret < 0) + goto out; + + memmove(memory + 2048, memory + 512, 2048); + memmove_extent_buffer(eb, 2048, 512, 2048); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 2"); + if (ret < 0) + goto out; + memmove(memory + 512, memory + 2048, 2048); + memmove_extent_buffer(eb, 512, 2048, 2048); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 3"); + if (ret < 0) + goto out; + + if (nodesize > PAGE_SIZE) { + memcpy(memory, memory + 4096 - 128, 256); + memcpy_extent_buffer(eb, 0, 4096 - 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page non-overlapping memcpy 1"); + if (ret < 0) + goto out; + + memcpy(memory + 4096 - 128, memory + 4096 + 128, 256); + memcpy_extent_buffer(eb, 4096 - 128, 4096 + 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page non-overlapping memcpy 2"); + if (ret < 0) + goto out; + + memmove(memory + 4096 - 128, memory + 4096 - 64, 256); + memmove_extent_buffer(eb, 4096 - 128, 4096 - 64, 256); + ret = verify_eb_and_memory(eb, memory, "cross page overlapping memcpy 1"); + if (ret < 0) + goto out; + + memmove(memory + 4096 - 64, memory + 4096 - 128, 256); + memmove_extent_buffer(eb, 4096 - 64, 4096 - 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page overlapping memcpy 2"); + if (ret < 0) + goto out; + } +out: + free_extent_buffer(eb); + kvfree(memory); + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { int ret; @@ -663,6 +803,10 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) goto out; ret = test_eb_bitmaps(sectorsize, nodesize); + if (ret) + goto out; + + ret = test_eb_mem_ops(sectorsize, nodesize); out: return ret; } -- cgit v1.2.3 From cb22964f1dad228fc2a1e5aba50c16b4b5680cd5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 15 Jul 2023 19:08:29 +0800 Subject: btrfs: refactor extent 
buffer bitmaps operations [BACKGROUND] Currently we handle extent bitmaps manually in extent_buffer_bitmap_set() and extent_buffer_bitmap_clear(). Even with various helpers like eb_bitmap_offset(), it's still a little messy to read. The code seems to be a copy of bitmap_set(), but with all the cross-page handling embedded into the code. [ENHANCEMENT] This patch enhances readability by introducing two helpers: - memset_extent_buffer() To handle the byte aligned range, thus all the cross-page handling is done there. - extent_buffer_get_byte() This is for the first and last byte operations, which only need to grab one byte, thus no need for any cross-page handling. So we can split both extent_buffer_bitmap_set() and extent_buffer_bitmap_clear() into 3 parts: - Handle the first byte If the range fits inside the first byte, we can exit early. - Handle the byte aligned part This is the part which can have cross-page operations, and it would be handled by memset_extent_buffer(). - Handle the last byte This refactoring not only makes the code a little easier to read, but also makes the later folio/page switch much easier, as the switch only needs to be done inside memset_extent_buffer() and extent_buffer_get_byte(). Reviewed-by: Sweet Tea Dorminy Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 137 ++++++++++++++++++++++++--------------------- 1 file changed, 65 insertions(+), 72 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 90a58a38a9f3..3e9f39bda9f2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4183,34 +4183,32 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, } } -void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, - unsigned long len) +static void memset_extent_buffer(const struct extent_buffer *eb, int c, + unsigned long start, unsigned long len) { - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - unsigned long i = get_eb_page_index(start); + unsigned long cur = start; - if (check_eb_range(eb, start, len)) - return; - - offset = get_eb_offset_in_page(eb, start); + while (cur < start + len) { + unsigned long index = get_eb_page_index(cur); + unsigned int offset = get_eb_offset_in_page(eb, cur); + unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset); + struct page *page = eb->pages[index]; - while (len > 0) { - page = eb->pages[i]; assert_eb_page_uptodate(eb, page); + memset(page_address(page) + offset, c, cur_len); - cur = min(len, PAGE_SIZE - offset); - kaddr = page_address(page); - memset(kaddr + offset, 0, cur); - - len -= cur; - offset = 0; - i++; + cur += cur_len; } } +void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, + unsigned long len) +{ + if (check_eb_range(eb, start, len)) + return; + return memset_extent_buffer(eb, 0, start, len); +} + void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { @@ -4325,6 +4323,15 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } +static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr) +{ + unsigned long index = get_eb_page_index(bytenr); + + if (check_eb_range(eb, bytenr, 1)) + return NULL; + return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr); +} + /* * Set an area of a bitmap to 1. 
* @@ -4336,35 +4343,28 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { + unsigned int first_byte = start + BIT_BYTE(pos); + unsigned int last_byte = start + BIT_BYTE(pos + len - 1); + const bool same_byte = (first_byte == last_byte); + u8 mask = BITMAP_FIRST_BYTE_MASK(pos); u8 *kaddr; - struct page *page; - unsigned long i; - size_t offset; - const unsigned int size = pos + len; - int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); - eb_bitmap_offset(eb, start, pos, &i, &offset); - page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); + if (same_byte) + mask &= BITMAP_LAST_BYTE_MASK(pos + len); - while (len >= bits_to_set) { - kaddr[offset] |= mask_to_set; - len -= bits_to_set; - bits_to_set = BITS_PER_BYTE; - mask_to_set = ~0; - if (++offset >= PAGE_SIZE && len > 0) { - offset = 0; - page = eb->pages[++i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); - } - } - if (len) { - mask_to_set &= BITMAP_LAST_BYTE_MASK(size); - kaddr[offset] |= mask_to_set; - } + /* Handle the first byte. */ + kaddr = extent_buffer_get_byte(eb, first_byte); + *kaddr |= mask; + if (same_byte) + return; + + /* Handle the byte aligned part. */ + ASSERT(first_byte + 1 <= last_byte); + memset_extent_buffer(eb, 0xff, first_byte + 1, last_byte - first_byte - 1); + + /* Handle the last byte. */ + kaddr = extent_buffer_get_byte(eb, last_byte); + *kaddr |= BITMAP_LAST_BYTE_MASK(pos + len); } @@ -4380,35 +4380,28 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { + unsigned int first_byte = start + BIT_BYTE(pos); + unsigned int last_byte = start + BIT_BYTE(pos + len - 1); + const bool same_byte = (first_byte == last_byte); + u8 mask = BITMAP_FIRST_BYTE_MASK(pos); u8 *kaddr; - struct page *page; - unsigned long i; - size_t offset; - const unsigned int size = pos + len; - int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); - eb_bitmap_offset(eb, start, pos, &i, &offset); - page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); + if (same_byte) + mask &= BITMAP_LAST_BYTE_MASK(pos + len); - while (len >= bits_to_clear) { - kaddr[offset] &= ~mask_to_clear; - len -= bits_to_clear; - bits_to_clear = BITS_PER_BYTE; - mask_to_clear = ~0; - if (++offset >= PAGE_SIZE && len > 0) { - offset = 0; - page = eb->pages[++i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); - } - } - if (len) { - mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); - kaddr[offset] &= ~mask_to_clear; - } + /* Handle the first byte. */ + kaddr = extent_buffer_get_byte(eb, first_byte); + *kaddr &= ~mask; + if (same_byte) + return; + + /* Handle the byte aligned part. */ + ASSERT(first_byte + 1 <= last_byte); + memset_extent_buffer(eb, 0, first_byte + 1, last_byte - first_byte - 1); + + /* Handle the last byte. 
*/ + kaddr = extent_buffer_get_byte(eb, last_byte); + *kaddr &= ~BITMAP_LAST_BYTE_MASK(pos + len); } static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) -- cgit v1.2.3 From 730c374e5b2cec061e3af7616b9d1265e60b6c0a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 15 Jul 2023 19:08:30 +0800 Subject: btrfs: use write_extent_buffer() to implement write_extent_buffer_*id() The helpers write_extent_buffer_chunk_tree_uuid() and write_extent_buffer_fsid() can be implemented with write_extent_buffer(). These two helpers are not that frequently used; they only get called during initialization of a new tree block. There is not much need for those slightly optimized versions. And since they can be easily converted to one write_extent_buffer() call, define them as inline helpers. This makes the later page/folio switch much easier, as all changes only need to happen in write_extent_buffer(). Reviewed-by: Sweet Tea Dorminy Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 22 ---------------------- fs/btrfs/extent_io.h | 19 ++++++++++++++++--- 2 files changed, 16 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3e9f39bda9f2..dc15eaa75c65 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4129,28 +4129,6 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, } } -void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, - const void *srcv) -{ - char *kaddr; - - assert_eb_page_uptodate(eb, eb->pages[0]); - kaddr = page_address(eb->pages[0]) + - get_eb_offset_in_page(eb, offsetof(struct btrfs_header, - chunk_tree_uuid)); - memcpy(kaddr, srcv, BTRFS_FSID_SIZE); -} - -void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) -{ - char *kaddr; - - assert_eb_page_uptodate(eb, eb->pages[0]); - kaddr = page_address(eb->pages[0]) + - get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid)); - memcpy(kaddr, srcv, BTRFS_FSID_SIZE); -} - void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c01f9c5ddc13..adda14c1b763 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -236,11 +236,24 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dst, int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, void __user *dst, unsigned long start, unsigned long len); -void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src); -void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, - const void *src); void write_extent_buffer(const struct extent_buffer *eb, const void *src, unsigned long start, unsigned long len); + +static inline void write_extent_buffer_chunk_tree_uuid( + const struct extent_buffer *eb, const void *chunk_tree_uuid) +{ + write_extent_buffer(eb, chunk_tree_uuid, + offsetof(struct btrfs_header, chunk_tree_uuid), + BTRFS_FSID_SIZE); +} + +static inline void write_extent_buffer_fsid(const struct extent_buffer *eb, + const void *fsid) +{ + write_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE); +} + void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src); void copy_extent_buffer(const struct extent_buffer *dst, -- cgit v1.2.3 From 54948681c21171a67c2fe03adcce54901e39dfe3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 15 Jul 2023 
19:08:31 +0800 Subject: btrfs: refactor main loop in copy_extent_buffer_full() [BACKGROUND] copy_extent_buffer_full() currently does different handling for regular and subpage cases: for regular cases it does a page by page copy, for subpage cases it just copies the content. This is fine for the page based extent buffer code, but for the incoming folio conversion, it can be a burden to add a new branch just to handle all the different combinations (subpage vs regular, one single folio vs multi pages). [ENHANCEMENT] Instead of handling the different combinations, just use one single handling for all cases, utilizing write_extent_buffer() to do the copying. Reviewed-by: Sweet Tea Dorminy Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index dc15eaa75c65..1ba8857a8dbe 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4190,24 +4190,19 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { - int i; - int num_pages; + unsigned long cur = 0; ASSERT(dst->len == src->len); - if (dst->fs_info->nodesize >= PAGE_SIZE) { - num_pages = num_extent_pages(dst); - for (i = 0; i < num_pages; i++) - copy_page(page_address(dst->pages[i]), - page_address(src->pages[i])); - } else { - size_t src_offset = get_eb_offset_in_page(src, 0); - size_t dst_offset = get_eb_offset_in_page(dst, 0); + while (cur < src->len) { + unsigned long index = get_eb_page_index(cur); + unsigned long offset = get_eb_offset_in_page(src, cur); + unsigned long cur_len = min(src->len, PAGE_SIZE - offset); + void *addr = page_address(src->pages[index]) + offset; + + write_extent_buffer(dst, addr, cur, cur_len); - ASSERT(src->fs_info->nodesize < PAGE_SIZE); - memcpy(page_address(dst->pages[0]) + dst_offset, - page_address(src->pages[0]) + src_offset, - src->len); + cur += cur_len; } } -- cgit v1.2.3 From 682a0bc5573f3c10a435fb9650c856bb325d2733 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 15 Jul 2023 19:08:32 +0800 Subject: btrfs: copy all pages at once at the end of btrfs_clone_extent_buffer() btrfs_clone_extent_buffer() calls copy_page() at each iteration but we can copy all pages at the end in one go if there were no errors. This would make later conversion to folios easier. Reviewed-by: Sweet Tea Dorminy Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1ba8857a8dbe..3e30333a4c6b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3239,8 +3239,8 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) return NULL; } WARN_ON(PageDirty(p)); - copy_page(page_address(p), page_address(src->pages[i])); } + copy_extent_buffer_full(new, src); set_extent_buffer_uptodate(new); return new; -- cgit v1.2.3 From 13840f3f2837b9a00f06d2d8d93dbdc6b2b6532e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 15 Jul 2023 19:08:33 +0800 Subject: btrfs: refactor main loop in memcpy_extent_buffer() [BACKGROUND] Currently memcpy_extent_buffer() does a loop where it would stop at any page boundary inside [dst_offset, dst_offset + len) or [src_offset, src_offset + len). 
This is mostly allowing us to do copy_pages(), but if we're going to use folios we will need to handle multi-page (the old behavior) or single folio (the new optimization). The current code would be a burden for future changes. [ENHANCEMENT] There is a hidden pitfall in the naming of memcpy_extent_buffer(): unlike regular memcpy(), this function can handle overlapping ranges. So here we extract write_extent_buffer() into a new internal helper, __write_extent_buffer(), and add a new parameter @use_memmove, to indicate whether we should use memmove() or regular memcpy(). Now we can use __write_extent_buffer() to handle writing into the dst range, with proper overlap detection. This slightly changes the chance of calling memmove(): as the split only happens at the source range page boundaries, the memcpy()/memmove() range would be slightly larger than in the old code, thus slightly increasing the chance we call memmove() rather than memcpy(). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 58 ++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3e30333a4c6b..0c4a305aa5cb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4129,8 +4129,9 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, } } -void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, - unsigned long start, unsigned long len) +static void __write_extent_buffer(const struct extent_buffer *eb, + const void *srcv, unsigned long start, + unsigned long len, bool use_memmove) { size_t cur; size_t offset; @@ -4138,6 +4139,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, char *kaddr; char *src = (char *)srcv; unsigned long i = get_eb_page_index(start); + /* For unmapped (dummy) ebs, no need to check their uptodate status. 
*/ + const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); @@ -4148,11 +4151,15 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, while (len > 0) { page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); + if (check_uptodate) + assert_eb_page_uptodate(eb, page); cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); - memcpy(kaddr + offset, src, cur); + if (use_memmove) + memmove(kaddr + offset, src, cur); + else + memcpy(kaddr + offset, src, cur); src += cur; len -= cur; @@ -4161,6 +4168,12 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, } } +void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) +{ + return __write_extent_buffer(eb, srcv, start, len, false); +} + static void memset_extent_buffer(const struct extent_buffer *eb, int c, unsigned long start, unsigned long len) { @@ -4409,34 +4422,25 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; - unsigned long dst_i; - unsigned long src_i; + unsigned long cur_off = 0; if (check_eb_range(dst, dst_offset, len) || check_eb_range(dst, src_offset, len)) return; - while (len > 0) { - dst_off_in_page = get_eb_offset_in_page(dst, dst_offset); - src_off_in_page = get_eb_offset_in_page(dst, src_offset); - - dst_i = get_eb_page_index(dst_offset); - src_i = get_eb_page_index(src_offset); - - cur = min(len, (unsigned long)(PAGE_SIZE - - src_off_in_page)); - cur = min_t(unsigned long, cur, - (unsigned long)(PAGE_SIZE - dst_off_in_page)); - - copy_pages(dst->pages[dst_i], dst->pages[src_i], - dst_off_in_page, src_off_in_page, cur); - - src_offset += cur; - dst_offset += cur; - len -= cur; + while (cur_off < len) { + unsigned long cur_src = cur_off + src_offset; + unsigned long pg_index = get_eb_page_index(cur_src); + unsigned long pg_off = get_eb_offset_in_page(dst, cur_src); + unsigned long cur_len = min(src_offset + len - cur_src, + PAGE_SIZE - pg_off); + void *src_addr = page_address(dst->pages[pg_index]) + pg_off; + const bool use_memmove = areas_overlap(src_offset + cur_off, + dst_offset + cur_off, cur_len); + + __write_extent_buffer(dst, src_addr, dst_offset + cur_off, cur_len, + use_memmove); + cur_off += cur_len; } } -- cgit v1.2.3 From 096d230165439a341639e85a5507c6b74c7dd410 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 15 Jul 2023 19:08:34 +0800 Subject: btrfs: refactor main loop in memmove_extent_buffer() [BACKGROUND] Currently memmove_extent_buffer() does a loop where it stops at any page boundary inside [dst_offset, dst_offset + len) or [src_offset, src_offset + len). This is mostly allowing us to do copy_pages(), but if we're going to use folios we will need to handle multi-page (the old behavior) or single folio (the new optimization). The current code would be a burden for future changes. [ENHANCEMENT] Instead of sticking with copy_pages(), here we utilize the new __write_extent_buffer() helper to handle the writes. Unlike the refactoring in memcpy_extent_buffer(), we cannot just rely on write_extent_buffer() and only handle page boundaries inside the src range. 
The function write_extent_buffer() itself is still doing forward writing, thus it cannot handle the following case: (already in the extent buffer memory operation tests, cross page overlapping run 2) Src Page boundary |///////| |///|////| Dst In the above case, if we just follow the page boundaries in the src range, there is no need to do any split, just one __write_extent_buffer() with use_memmove = true. But __write_extent_buffer() would split the dst range into two, so it first copies the beginning part of the src range into the first half of the dst range. After this operation, the beginning of the dst range is already updated, causing corruption. So we have to follow the old behavior of handling both page boundaries. And since we're the last caller of copy_pages(), we can remove it completely. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 48 +++++++++++++++++------------------------------- 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0c4a305aa5cb..36d80cb8cfa3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4396,28 +4396,6 @@ static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned return distance < len; } -static void copy_pages(struct page *dst_page, struct page *src_page, - unsigned long dst_off, unsigned long src_off, - unsigned long len) -{ - char *dst_kaddr = page_address(dst_page); - char *src_kaddr; - int must_memmove = 0; - - if (dst_page != src_page) { - src_kaddr = page_address(src_page); - } else { - src_kaddr = dst_kaddr; - if (areas_overlap(src_off, dst_off, len)) - must_memmove = 1; - } - - if (must_memmove) - memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); - else - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); -} - void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) @@ -4448,23 +4426,26 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; unsigned long dst_end = dst_offset + len - 1; unsigned long src_end = src_offset + len - 1; - unsigned long dst_i; - unsigned long src_i; if (check_eb_range(dst, dst_offset, len) || check_eb_range(dst, src_offset, len)) return; + if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } + while (len > 0) { - dst_i = get_eb_page_index(dst_end); + unsigned long src_i; + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + void *src_addr; + bool use_memmove; + src_i = get_eb_page_index(src_end); dst_off_in_page = get_eb_offset_in_page(dst, dst_end); @@ -4472,9 +4453,14 @@ void memmove_extent_buffer(const struct extent_buffer *dst, cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); - copy_pages(dst->pages[dst_i], dst->pages[src_i], - dst_off_in_page - cur + 1, - src_off_in_page - cur + 1, cur); + + src_addr = page_address(dst->pages[src_i]) + src_off_in_page - + cur + 1; + use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1, + cur); + + __write_extent_buffer(dst, src_addr, dst_end - cur + 1, cur, + use_memmove); dst_end -= cur; src_end -= cur; -- cgit v1.2.3 From 4490e803e1fe9fab8db5025e44e23b55df54078b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:56:57 +0100 Subject: btrfs: don't start transaction when joining with TRANS_JOIN_NOSTART 
When joining a transaction with TRANS_JOIN_NOSTART, if we don't find a running transaction we end up creating one. This goes against the purpose of TRANS_JOIN_NOSTART which is to join a running transaction if its state is at or below the state TRANS_STATE_COMMIT_START, otherwise return an -ENOENT error and don't start a new transaction. So fix this to not create a new transaction if there's no running transaction at or below that state. CC: stable@vger.kernel.org # 4.14+ Fixes: a6d155d2e363 ("Btrfs: fix deadlock between fiemap and transaction commits") Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 815f61d6b506..6a2a12593183 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -292,10 +292,11 @@ loop: spin_unlock(&fs_info->trans_lock); /* - * If we are ATTACH, we just want to catch the current transaction, - * and commit it. If there is no transaction, just return ENOENT. + * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the + * current transaction, and commit it. If there is no transaction, just + * return ENOENT. */ - if (type == TRANS_ATTACH) + if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART) return -ENOENT; /* -- cgit v1.2.3 From 19288951ffa8c67c9f40d2eed72ebc53071227c0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:56:58 +0100 Subject: btrfs: update comment for btrfs_join_transaction_nostart() Update the comment for btrfs_join_transaction_nostart() to be more clear about how it works and how it's different from btrfs_attach_transaction(). Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6a2a12593183..ab09542f2170 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -799,7 +799,10 @@ struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root * /* * Similar to regular join but it never starts a transaction when none is - * running or after waiting for the current one to finish. + * running or when there's a running one at a state >= TRANS_STATE_UNBLOCKED. + * This is similar to btrfs_attach_transaction() but it allows the join to + * happen if the transaction commit already started but it's not yet in the + * "doing" phase (the state is < TRANS_STATE_COMMIT_DOING). */ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root) { -- cgit v1.2.3 From 4d2024e90d0e06a1745d22a9d58fff2f6726481a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:56:59 +0100 Subject: btrfs: print target number of bytes when dumping free space When dumping free space, with btrfs_dump_free_space(), we pass a bytes argument in order to count how many free space entries in the block group have a size greater than or equal to that number of bytes. We then print how many suitable entries we found, but we don't print the target number of bytes, we just say "bytes". Change the message to actually print the number of bytes, which makes debugging -ENOSPC issues a bit easier. Also slightly change the odd grammar and terminology: the sentence ends with 'is', which doesn't make sense, and the term 'blocks' is confusing as we are referring to free space entries within the block group's free space cache. 
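For context, the count printed by this message comes from walking the block group's free space cache; a minimal illustrative sketch of that walk, simplified from the existing btrfs_dump_free_space() loop (locking and the per-entry debug output omitted):

	struct btrfs_free_space *info;
	struct rb_node *n;
	int count = 0;

	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		/* Count only extent entries at least @bytes long. */
		if (info->bytes >= bytes && !info->bitmap)
			count++;
	}
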
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bfc01352351c..cd5bfda2c259 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2943,7 +2943,8 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, btrfs_info(fs_info, "block group has cluster?: %s", list_empty(&block_group->cluster_list) ? "no" : "yes"); btrfs_info(fs_info, - "%d blocks of free space at or bigger than bytes is", count); + "%d free space entries at or bigger than %llu bytes", + count, bytes); } void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, -- cgit v1.2.3 From b92e8f5472a28e311983f9f47e281e0adf56f10a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:57:00 +0100 Subject: btrfs: print block group super and delalloc bytes when dumping space info When dumping a space info's block groups, also print the number of bytes used for super blocks and delalloc. This is often useful for debugging -ENOSPC problems. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 75e7fa337e66..ae12a8a9cd12 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -525,10 +525,11 @@ again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); btrfs_info(fs_info, - "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s", - cache->start, cache->length, cache->used, cache->pinned, - cache->reserved, cache->zone_unusable, - cache->ro ? "[readonly]" : ""); +"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable %s", + cache->start, cache->length, cache->used, cache->pinned, + cache->reserved, cache->delalloc_bytes, + cache->bytes_super, cache->zone_unusable, + cache->ro ? "[readonly]" : ""); spin_unlock(&cache->lock); btrfs_dump_free_space(cache, bytes); } -- cgit v1.2.3 From e50b122b832bed9f36d459f19fd73d00658f5c1e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:57:01 +0100 Subject: btrfs: print available space for a block group when dumping a space info When dumping a space info, we iterate over all its block groups and then print their size and the amounts of bytes used, reserved, pinned, etc. When debugging -ENOSPC problems it's also useful to know how much space is available (free), so calculate that and print it as well. 
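As a worked example of the new calculation (the numbers are hypothetical, only the formula comes from the patch): a block group with length 1073741824, used 805306368, pinned 1048576, reserved 4194304, delalloc 2097152, super 1572864 and zone_unusable 0 would report 1073741824 - 805306368 - 1048576 - 4194304 - 2097152 - 1572864 - 0 = 259522560 bytes available.
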
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index ae12a8a9cd12..54d78e839c01 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -523,13 +523,18 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, down_read(&info->groups_sem); again: list_for_each_entry(cache, &info->block_groups[index], list) { + u64 avail; + spin_lock(&cache->lock); + avail = cache->length - cache->used - cache->pinned - + cache->reserved - cache->delalloc_bytes - + cache->bytes_super - cache->zone_unusable; btrfs_info(fs_info, -"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable %s", +"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", cache->start, cache->length, cache->used, cache->pinned, cache->reserved, cache->delalloc_bytes, cache->bytes_super, cache->zone_unusable, - cache->ro ? "[readonly]" : ""); + avail, cache->ro ? "[readonly]" : ""); spin_unlock(&cache->lock); btrfs_dump_free_space(cache, bytes); } -- cgit v1.2.3 From 1ff9fee3bd2e865c81d29684dd40a91698f7999b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:57:02 +0100 Subject: btrfs: print available space across all block groups when dumping space info When dumping a space info also sum the available space for all block groups and then print it. This is often useful for debugging -ENOSPC related problems. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 54d78e839c01..de0044283e29 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -510,6 +510,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, int dump_block_groups) { struct btrfs_block_group *cache; + u64 total_avail = 0; int index = 0; spin_lock(&info->lock); @@ -537,10 +538,13 @@ again: avail, cache->ro ? "[readonly]" : ""); spin_unlock(&cache->lock); btrfs_dump_free_space(cache, bytes); + total_avail += avail; } if (++index < BTRFS_NR_RAID_TYPES) goto again; up_read(&info->groups_sem); + + btrfs_info(fs_info, "%llu bytes available across all block groups", total_avail); } static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, -- cgit v1.2.3 From 1b6948acb854b88162f273a26f9324d167051700 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:57:03 +0100 Subject: btrfs: don't steal space from global rsv after a transaction abort When doing a priority metadata space reclaim, while we are going through the flush states and running their respective operations, it's possible that a transaction abort happened, for example when running delayed refs we hit -ENOSPC or in the critical section of transaction commit we failed with -ENOSPC or some other error. In these cases a transaction was aborted and the fs turned into error state. If that happened, then it makes no sense to steal from the global block reserve and return success to the caller if the stealing was successful - the caller will later get an error when attempting to modify the fs. Instead make the ticket fail if we have the fs in error state and don't attempt to steal from the global rsv, as it's not only pointless, it also simplifies debugging some -ENOSPC problems. 
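For context, the error state checked here is normally entered through a transaction abort in one of the flushing operations; a minimal illustrative sketch of such a path (simplified, the exact call site varies across the flush states):

	ret = btrfs_run_delayed_items_nr(trans, nr);
	if (ret) {
		/*
		 * Aborting puts the whole fs into error state, which the
		 * reclaim code can then observe via BTRFS_FS_ERROR().
		 */
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}
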
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index de0044283e29..5b1b71e029ad 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1418,8 +1418,18 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, } } - /* Attempt to steal from the global rsv if we can. */ - if (!steal_from_global_rsv(fs_info, space_info, ticket)) { + /* + * Attempt to steal from the global rsv if we can, except if the fs was + * turned into error mode due to a transaction abort when flushing space + * above, in that case fail with -EROFS instead of returning success to + * the caller if we can steal from the global rsv - this is just to have + * caller fail immeditelly instead of later when trying to modify the + * fs, making it easier to debug -ENOSPC problems. + */ + if (BTRFS_FS_ERROR(fs_info)) { + ticket->error = -EROFS; + remove_ticket(space_info, ticket); + } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) { ticket->error = -ENOSPC; remove_ticket(space_info, ticket); } -- cgit v1.2.3 From ae3364e5215bed9ce89db6b0c2d21eae4b66f4ae Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:57:04 +0100 Subject: btrfs: store the error that turned the fs into error state Currently when we turn the fs into an error state, typically after a transaction abort, we don't store the error anywhere, we just set a bit (BTRFS_FS_STATE_ERROR) at struct btrfs_fs_info::fs_state to signal the error state. There are cases where it would be useful to have access to the specific error in order to provide a more meaningful error to users/applications. This change adds a member to struct btrfs_fs_info to store the error and removes the BTRFS_FS_STATE_ERROR bit. When there's no error, the new member (fs_error) has a value of 0, otherwise its value is a negative errno value. Followup changes will make use of this new member. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/fs.h | 12 ++++++++---- fs/btrfs/messages.c | 10 +++++++--- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e7a096a8c0f6..ccc34c67b040 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3222,7 +3222,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device /* check FS state, whether FS is broken. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + WRITE_ONCE(fs_info->fs_error, -EUCLEAN); /* * In the long term, we'll store the compression type in the super diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 203d2a267828..ef07c6c252d8 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -46,8 +46,6 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); * Runtime (in-memory) states of filesystem */ enum { - /* Global indicator of serious filesystem errors */ - BTRFS_FS_STATE_ERROR, /* * Filesystem is being remounted, allow to skip some operations, like * defrag @@ -686,6 +684,12 @@ struct btrfs_fs_info { bool qgroup_rescan_running; u8 qgroup_drop_subtree_thres; + /* + * If this is not 0, then it indicates a serious filesystem error has + * happened and it contains that error (negative errno value). 
+ */ + int fs_error; + /* Filesystem state */ unsigned long fs_state; @@ -962,8 +966,8 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); } -#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ - &(fs_info)->fs_state))) +#define BTRFS_FS_ERROR(fs_info) (READ_ONCE((fs_info)->fs_error)) + #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 23fc11af498a..e3c9d2706341 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -10,14 +10,13 @@ #ifdef CONFIG_PRINTK #define STATE_STRING_PREFACE ": state " -#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) +#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT + 1) /* * Characters to print to indicate error conditions or uncommon filesystem state. * RO is not an error. */ static const char fs_state_chars[] = { - [BTRFS_FS_STATE_ERROR] = 'E', [BTRFS_FS_STATE_REMOUNTING] = 'M', [BTRFS_FS_STATE_RO] = 0, [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', @@ -37,6 +36,11 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); curr += sizeof(STATE_STRING_PREFACE) - 1; + if (BTRFS_FS_ERROR(info)) { + *curr++ = 'E'; + states_printed = true; + } + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { @@ -155,7 +159,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * Today we only save the error info to memory. Long term we'll also * send it down to the disk. */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + WRITE_ONCE(fs_info->fs_error, errno); /* Don't go through full error handling during mount. */ if (!(sb->s_flags & SB_BORN)) -- cgit v1.2.3 From a7f8de500e28bb227e02a7bd35988cf37b816c86 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:57:05 +0100 Subject: btrfs: return real error when orphan cleanup fails due to a transaction abort During mount we will call btrfs_orphan_cleanup() to remove any inodes that were previously deleted (have a link count of 0) but for which we were not able before to remove their items from the subvolume tree. The removal of the items will happen by triggering eviction, when we do the final iput() on them at btrfs_orphan_cleanup(), which will end in the loop at btrfs_evict_inode() that truncates inode items. In a dire situation we may have a transaction abort due to -ENOSPC when attempting to truncate the inode items, and in that case the orphan item (key type BTRFS_ORPHAN_ITEM_KEY) will remain in the subvolume tree and when we hit the next iteration of the while loop at btrfs_orphan_cleanup() we will find the same orphan item as before, and then we will return -EINVAL from btrfs_orphan_cleanup() through the following if statement: if (found_key.offset == last_objectid) { btrfs_err(fs_info, "Error removing orphan entry, stopping orphan cleanup"); ret = -EINVAL; goto out; } This makes the mount operation fail with -EINVAL, when it should have been -ENOSPC. This is confusing because -EINVAL might lead a user into thinking it provided invalid mount options for example. 
An example where this happens: $ mount test.img /mnt mount: /mnt: wrong fs type, bad option, bad superblock on /dev/loop0, missing codepage or helper program, or other error. $ dmesg [ 2542.356934] BTRFS: device fsid 977fff75-1181-4d2b-a739-384fa710d16e devid 1 transid 47409973 /dev/loop0 scanned by mount (4459) [ 2542.357451] BTRFS info (device loop0): using crc32c (crc32c-intel) checksum algorithm [ 2542.357461] BTRFS info (device loop0): disk space caching is enabled [ 2542.742287] BTRFS info (device loop0): auto enabling async discard [ 2542.764554] BTRFS info (device loop0): checking UUID tree [ 2551.743065] ------------[ cut here ]------------ [ 2551.743068] BTRFS: Transaction aborted (error -28) [ 2551.743149] WARNING: CPU: 7 PID: 215 at fs/btrfs/block-group.c:3494 btrfs_write_dirty_block_groups+0x397/0x3d0 [btrfs] [ 2551.743311] Modules linked in: btrfs blake2b_generic (...) [ 2551.743353] CPU: 7 PID: 215 Comm: kworker/u24:5 Not tainted 6.4.0-rc6-btrfs-next-134+ #1 [ 2551.743356] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 [ 2551.743357] Workqueue: events_unbound btrfs_async_reclaim_metadata_space [btrfs] [ 2551.743405] RIP: 0010:btrfs_write_dirty_block_groups+0x397/0x3d0 [btrfs] [ 2551.743449] Code: 8b 43 0c (...) [ 2551.743451] RSP: 0018:ffff982c005a7c40 EFLAGS: 00010286 [ 2551.743452] RAX: 0000000000000000 RBX: ffff88fc6e44b400 RCX: 0000000000000000 [ 2551.743453] RDX: 0000000000000002 RSI: ffffffff8dff0878 RDI: 00000000ffffffff [ 2551.743454] RBP: ffff88fc51817208 R08: 0000000000000000 R09: ffff982c005a7ae0 [ 2551.743455] R10: 0000000000000001 R11: 0000000000000001 R12: ffff88fc43d2e570 [ 2551.743456] R13: ffff88fc43d2e400 R14: ffff88fc8fb08ee0 R15: ffff88fc6e44b530 [ 2551.743457] FS: 0000000000000000(0000) GS:ffff89035fbc0000(0000) knlGS:0000000000000000 [ 2551.743458] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2551.743459] CR2: 00007fa8cdf2f6f4 CR3: 0000000124850003 CR4: 0000000000370ee0 [ 2551.743462] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 2551.743463] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 2551.743464] Call Trace: [ 2551.743472] [ 2551.743474] ? __warn+0x80/0x130 [ 2551.743478] ? btrfs_write_dirty_block_groups+0x397/0x3d0 [btrfs] [ 2551.743520] ? report_bug+0x1f4/0x200 [ 2551.743523] ? handle_bug+0x42/0x70 [ 2551.743526] ? exc_invalid_op+0x14/0x70 [ 2551.743528] ? asm_exc_invalid_op+0x16/0x20 [ 2551.743532] ? btrfs_write_dirty_block_groups+0x397/0x3d0 [btrfs] [ 2551.743574] ? _raw_spin_unlock+0x15/0x30 [ 2551.743576] ? btrfs_run_delayed_refs+0x1bd/0x200 [btrfs] [ 2551.743609] commit_cowonly_roots+0x1e9/0x260 [btrfs] [ 2551.743652] btrfs_commit_transaction+0x42e/0xfa0 [btrfs] [ 2551.743693] ? __pfx_autoremove_wake_function+0x10/0x10 [ 2551.743697] flush_space+0xf1/0x5d0 [btrfs] [ 2551.743743] ? _raw_spin_unlock+0x15/0x30 [ 2551.743745] ? finish_task_switch+0x91/0x2a0 [ 2551.743748] ? _raw_spin_unlock+0x15/0x30 [ 2551.743750] ? btrfs_get_alloc_profile+0xc9/0x1f0 [btrfs] [ 2551.743793] btrfs_async_reclaim_metadata_space+0xe1/0x230 [btrfs] [ 2551.743837] process_one_work+0x1d9/0x3e0 [ 2551.743844] worker_thread+0x4a/0x3b0 [ 2551.743847] ? __pfx_worker_thread+0x10/0x10 [ 2551.743849] kthread+0xee/0x120 [ 2551.743852] ? 
__pfx_kthread+0x10/0x10 [ 2551.743854] ret_from_fork+0x29/0x50 [ 2551.743860] [ 2551.743861] ---[ end trace 0000000000000000 ]--- [ 2551.743863] BTRFS info (device loop0: state A): dumping space info: [ 2551.743866] BTRFS info (device loop0: state A): space_info DATA has 126976 free, is full [ 2551.743868] BTRFS info (device loop0: state A): space_info total=13458472960, used=13458137088, pinned=143360, reserved=0, may_use=0, readonly=65536 zone_unusable=0 [ 2551.743870] BTRFS info (device loop0: state A): space_info METADATA has -51625984 free, is full [ 2551.743872] BTRFS info (device loop0: state A): space_info total=771751936, used=770146304, pinned=1605632, reserved=0, may_use=51625984, readonly=0 zone_unusable=0 [ 2551.743874] BTRFS info (device loop0: state A): space_info SYSTEM has 14663680 free, is not full [ 2551.743875] BTRFS info (device loop0: state A): space_info total=14680064, used=16384, pinned=0, reserved=0, may_use=0, readonly=0 zone_unusable=0 [ 2551.743877] BTRFS info (device loop0: state A): global_block_rsv: size 53231616 reserved 51544064 [ 2551.743878] BTRFS info (device loop0: state A): trans_block_rsv: size 0 reserved 0 [ 2551.743879] BTRFS info (device loop0: state A): chunk_block_rsv: size 0 reserved 0 [ 2551.743880] BTRFS info (device loop0: state A): delayed_block_rsv: size 0 reserved 0 [ 2551.743881] BTRFS info (device loop0: state A): delayed_refs_rsv: size 786432 reserved 0 [ 2551.743886] BTRFS: error (device loop0: state A) in btrfs_write_dirty_block_groups:3494: errno=-28 No space left [ 2551.743911] BTRFS info (device loop0: state EA): forced readonly [ 2551.743951] BTRFS warning (device loop0: state EA): could not allocate space for delete; will truncate on mount [ 2551.743962] BTRFS error (device loop0: state EA): Error removing orphan entry, stopping orphan cleanup [ 2551.743973] BTRFS warning (device loop0: state EA): Skipping commit of aborted transaction. [ 2551.743989] BTRFS error (device loop0: state EA): could not do orphan cleanup -22 So make the btrfs_orphan_cleanup() return the value of BTRFS_FS_ERROR(), if it's set, and -EINVAL otherwise. For that same example, after this change, the mount operation fails with -ENOSPC: $ mount test.img /mnt mount: /mnt: mount(2) system call failed: No space left on device. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 19dc0b8ff427..431007e443a8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3509,9 +3509,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) */ if (found_key.offset == last_objectid) { + /* + * We found the same inode as before. This means we were + * not able to remove its items via eviction triggered + * by an iput(). A transaction abort may have happened, + * due to -ENOSPC for example, so try to grab the error + * that lead to a transaction abort, if any. 
+ */ btrfs_err(fs_info, "Error removing orphan entry, stopping orphan cleanup"); - ret = -EINVAL; + ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL; goto out; } -- cgit v1.2.3 From 7e3bfd146e3ed1e3125a1c5e53c12dee787b6b4b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:57:06 +0100 Subject: btrfs: fail priority metadata ticket with real fs error At priority_reclaim_metadata_space(), if we were not able to satisfy the ticket after going through the various flushing states and we notice the fs went into an error state, likely due to a transaction abort during the flushing, set the ticket's error to the error that caused the transaction abort instead of an unconditional -EROFS. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 5b1b71e029ad..be5ce209b918 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1421,13 +1421,13 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, /* * Attempt to steal from the global rsv if we can, except if the fs was * turned into error mode due to a transaction abort when flushing space - * above, in that case fail with -EROFS instead of returning success to - * the caller if we can steal from the global rsv - this is just to have - * caller fail immeditelly instead of later when trying to modify the - * fs, making it easier to debug -ENOSPC problems. + * above, in that case fail with the abort error instead of returning + * success to the caller if we can steal from the global rsv - this is + * just to have caller fail immeditelly instead of later when trying to + * modify the fs, making it easier to debug -ENOSPC problems. */ if (BTRFS_FS_ERROR(fs_info)) { - ticket->error = -EROFS; + ticket->error = BTRFS_FS_ERROR(fs_info); remove_ticket(space_info, ticket); } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) { ticket->error = -ENOSPC; -- cgit v1.2.3 From 504b1596bd0554c481108e1e27ed5237ed58e60c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:57:07 +0100 Subject: btrfs: make btrfs_cleanup_fs_roots() static btrfs_cleanup_fs_roots() is not used outside disk-io.c, so make it static, remove its prototype from disk-io.h and move its definition above where it's used in disk-io.c. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 100 ++++++++++++++++++++++++--------------------- fs/btrfs/disk-io.h | 1 - 2 files changed, 50 insertions(+), 51 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ccc34c67b040..a38d3f9a1a03 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2869,6 +2869,56 @@ static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) return 0; } +static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) +{ + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i = 0; + int err = 0; + unsigned int ret = 0; + + while (1) { + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) { + spin_unlock(&fs_info->fs_roots_radix_lock); + break; + } + root_objectid = gang[ret - 1]->root_key.objectid + 1; + + for (i = 0; i < ret; i++) { + /* Avoid to grab roots in dead_roots. */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* Grab all the search result for later use. 
*/ + gang[i] = btrfs_grab_root(gang[i]); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + + for (i = 0; i < ret; i++) { + if (!gang[i]) + continue; + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); + if (err) + goto out; + btrfs_put_root(gang[i]); + } + root_objectid++; + } +out: + /* Release the uncleaned roots due to error. */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_root(gang[i]); + } + return err; +} + /* * Some options only have meaning at mount time and shouldn't persist across * remounts, or be displayed. Clear these at the end of mount and remount @@ -4136,56 +4186,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, btrfs_put_root(root); } -int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) -{ - u64 root_objectid = 0; - struct btrfs_root *gang[8]; - int i = 0; - int err = 0; - unsigned int ret = 0; - - while (1) { - spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, root_objectid, - ARRAY_SIZE(gang)); - if (!ret) { - spin_unlock(&fs_info->fs_roots_radix_lock); - break; - } - root_objectid = gang[ret - 1]->root_key.objectid + 1; - - for (i = 0; i < ret; i++) { - /* Avoid to grab roots in dead_roots */ - if (btrfs_root_refs(&gang[i]->root_item) == 0) { - gang[i] = NULL; - continue; - } - /* grab all the search result for later use */ - gang[i] = btrfs_grab_root(gang[i]); - } - spin_unlock(&fs_info->fs_roots_radix_lock); - - for (i = 0; i < ret; i++) { - if (!gang[i]) - continue; - root_objectid = gang[i]->root_key.objectid; - err = btrfs_orphan_cleanup(gang[i]); - if (err) - goto out; - btrfs_put_root(gang[i]); - } - root_objectid++; - } -out: - /* release the uncleaned roots due to error */ - for (; i < ret; i++) { - if (gang[i]) - btrfs_put_root(gang[i]); - } - return err; -} - int btrfs_commit_super(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->tree_root; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index b03767f4d7ed..02b645744a82 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -77,7 +77,6 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); -int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, -- cgit v1.2.3 From 883647f4b5ca9a62ce43182f32e37e50286484a5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:57:08 +0100 Subject: btrfs: make find_free_dev_extent() static The function find_free_dev_extent() is only used within volumes.c, so make it static and remove its prototype from volumes.h. 
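For reference, the remaining callers are all internal to volumes.c; a call looks roughly like the following sketch (modeled on the chunk allocator, the local variable names and bytes_wanted are illustrative):

	u64 dev_offset;
	u64 max_avail;
	int ret;

	ret = find_free_dev_extent(device, bytes_wanted, &dev_offset, &max_avail);
	if (ret && ret != -ENOSPC)
		return ret;
	/* On success, dev_offset is the start of a hole of at least bytes_wanted. */
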
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 ++-- fs/btrfs/volumes.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 89410a480293..72d8a3be7775 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1726,8 +1726,8 @@ out: return ret; } -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *len) +static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *len) { /* FIXME use last free of some kind */ return find_free_dev_extent_start(device, num_bytes, 0, start, len); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b8c51f16ba86..a59898d51e9e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -650,8 +650,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_uuid_scan_kthread(void *data); bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *max_avail); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats); -- cgit v1.2.3 From ed8947bc73aa5bf0ddb5473eafadaa5d740f2f3f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:57:09 +0100 Subject: btrfs: merge find_free_dev_extent() and find_free_dev_extent_start() There is no point in having find_free_dev_extent() because it's just a simple wrapper around find_free_dev_extent_start() which always passes a value of 0 for the search_start argument. Since there are no other callers of find_free_dev_extent_start(), remove find_free_dev_extent() and rename find_free_dev_extent_start() to find_free_dev_extent(), removing its search_start argument because it's always 0. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 72d8a3be7775..bfdfecee1afe 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1439,18 +1439,18 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, return false; } -static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) +static u64 dev_extent_search_start(struct btrfs_device *device) { switch (device->fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: - return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); + return BTRFS_DEVICE_RANGE_RESERVED; case BTRFS_CHUNK_ALLOC_ZONED: /* * We don't care about the starting region like regular * allocator, because we anyway use/reserve the first two zones * for superblock logging. */ - return ALIGN(start, device->zone_info->zone_size); + return 0; default: BUG(); } @@ -1582,15 +1582,15 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, * correct usable device space, as device extent freed in current transaction * is not reported as available. 
*/ -static int find_free_dev_extent_start(struct btrfs_device *device, - u64 num_bytes, u64 search_start, u64 *start, - u64 *len) +static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *len) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; + u64 search_start; u64 hole_size; u64 max_hole_start; u64 max_hole_size; @@ -1600,7 +1600,7 @@ static int find_free_dev_extent_start(struct btrfs_device *device, int slot; struct extent_buffer *l; - search_start = dev_extent_search_start(device, search_start); + search_start = dev_extent_search_start(device); WARN_ON(device->zone_info && !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); @@ -1726,13 +1726,6 @@ out: return ret; } -static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *len) -{ - /* FIXME use last free of some kind */ - return find_free_dev_extent_start(device, num_bytes, 0, start, len); -} - static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 start, u64 *dev_extent_len) -- cgit v1.2.3 From 2391245ac2875f784335b9148079c6e73639a5f7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:57:10 +0100 Subject: btrfs: avoid starting new transaction when flushing delayed items and refs When flushing space we join a transaction to flush delayed items and delayed references, in order to try to release space. However btrfs_join_transaction() not only joins an existing transaction, it also starts a new transaction if there is none open. If there is no transaction open, we have neither delayed items nor delayed references, so creating a new transaction is a waste of time, IO and creates an unnecessary rotation of the backup roots without gaining any benefits (including releasing space). So use btrfs_join_transaction_nostart() when attempting to flush delayed items and references. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index be5ce209b918..2db92a201697 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -725,9 +725,11 @@ static void flush_space(struct btrfs_fs_info *fs_info, else nr = -1; - trans = btrfs_join_transaction(root); + trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; break; } ret = btrfs_run_delayed_items_nr(trans, nr); @@ -743,9 +745,11 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELAYED_REFS_NR: case FLUSH_DELAYED_REFS: - trans = btrfs_join_transaction(root); + trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; break; } if (state == FLUSH_DELAYED_REFS_NR) -- cgit v1.2.3 From 2ee70ed19ccd9c95c188816b65dacf3fca1f5565 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:57:11 +0100 Subject: btrfs: avoid starting and committing empty transaction when flushing space When flushing space and we are in the COMMIT_TRANS state, we join a transaction with btrfs_join_transaction() and then commit the returned transaction.
However btrfs_join_transaction() starts a new transaction if there is none currently open, which is pointless since committing a new, empty transaction doesn't achieve anything; it only wastes time, IO and creates an unnecessary rotation of the backup roots. So use btrfs_attach_transaction_barrier() to avoid starting a new transaction. This also waits for any ongoing transaction that is committing (state >= TRANS_STATE_COMMIT_DOING) to fully complete, and therefore waits for all the extents that were pinned during the transaction's lifetime to be unpinned. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2db92a201697..17c86db7b1b1 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -814,9 +814,18 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case COMMIT_TRANS: ASSERT(current->journal_info == NULL); - trans = btrfs_join_transaction(root); + /* + * We don't want to start a new transaction, just attach to the + * current one or wait it fully commits in case its commit is + * happening at the moment. Note: we don't use a nostart join + * because that does not wait for a transaction to fully commit + * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED). + */ + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; break; } ret = btrfs_commit_transaction(trans); -- cgit v1.2.3 From 6705b48a50d7cad55b763ae4c9544875a9727f80 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:57:12 +0100 Subject: btrfs: avoid start and commit empty transaction when starting qgroup rescan When starting a qgroup rescan, we try to join a running transaction, with btrfs_join_transaction(), and then commit the transaction. However using btrfs_join_transaction() will result in creating a new transaction in case there isn't any running or if there's an existing one already committing. This is pointless as we only need to attach to an existing one that is not committing and in case there's an existing one committing, wait for its commit to complete. Creating and committing an empty transaction is wasteful: pointless IO and an unnecessary rotation of the backup roots. So use btrfs_attach_transaction_barrier() instead, to avoid creating and committing empty transactions. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 2637d6b157ff..1ef60ea8f0bc 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3590,15 +3590,16 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) * going to clear all tracking information for a clean start.
*/ - trans = btrfs_join_transaction(fs_info->fs_root); - if (IS_ERR(trans)) { + trans = btrfs_attach_transaction_barrier(fs_info->fs_root); + if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; return PTR_ERR(trans); - } - ret = btrfs_commit_transaction(trans); - if (ret) { - fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - return ret; + } else if (trans != ERR_PTR(-ENOENT)) { + ret = btrfs_commit_transaction(trans); + if (ret) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return ret; + } } qgroup_rescan_zero_tracking(fs_info); -- cgit v1.2.3 From 9c93c238c15ffd8d5622c51a475d5fb35f65ca07 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Jul 2023 16:57:13 +0100 Subject: btrfs: avoid start and commit empty transaction when flushing qgroups When flushing qgroups, we try to join a running transaction, with btrfs_join_transaction(), and then commit the transaction. However using btrfs_join_transaction() will result in creating a new transaction in case there isn't any running or if there's an existing one already committing. This is pointless as we only need to attach to an existing one that is not committing and in case there's an existing one committing, wait for its commit to complete. Creating and committing an empty transaction is wasteful: pointless IO and an unnecessary rotation of the backup roots. So use btrfs_attach_transaction_barrier() instead, to avoid creating and committing empty transactions. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1ef60ea8f0bc..b99230db3c82 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3758,9 +3758,11 @@ static int try_flush_qgroup(struct btrfs_root *root) goto out; btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); - trans = btrfs_join_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; goto out; } -- cgit v1.2.3 From 861093eff4f01319edfc1d1ee276a7f2bf720f1d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 8 Aug 2023 01:12:31 +0900 Subject: btrfs: introduce struct to consolidate extent buffer write context Introduce btrfs_eb_write_context to consolidate writeback_control and the extent buffer context. This will help add a block group context as well. While at it, move the eb context setting before btrfs_check_meta_write_pointer(). We can set it here because we need to skip pages in the same eb anyway if that eb is rejected by btrfs_check_meta_write_pointer(). Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 +++++++------- fs/btrfs/extent_io.h | 5 +++++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 36d80cb8cfa3..3ee83ff7f910 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1805,9 +1805,9 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * previous call. * Return <0 for fatal error.
*/ -static int submit_eb_page(struct page *page, struct writeback_control *wbc, - struct extent_buffer **eb_context) +static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) { + struct writeback_control *wbc = ctx->wbc; struct address_space *mapping = page->mapping; struct btrfs_block_group *cache = NULL; struct extent_buffer *eb; int ret; @@ -1836,7 +1836,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return 0; } - if (eb == *eb_context) { + if (eb == ctx->eb) { spin_unlock(&mapping->private_lock); return 0; } @@ -1845,6 +1845,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!ret) return 0; + ctx->eb = eb; + if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) { /* * If for_sync, this hole will be filled with @@ -1858,8 +1860,6 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return ret; } - *eb_context = eb; - if (!lock_extent_buffer_for_io(eb, wbc)) { btrfs_revert_meta_write_pointer(cache, eb); if (cache) @@ -1882,7 +1882,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_buffer *eb_context = NULL; + struct btrfs_eb_write_context ctx = { .wbc = wbc }; struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; int done = 0; @@ -1924,7 +1924,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, wbc, &eb_context); + ret = submit_eb_page(&folio->page, &ctx); if (ret == 0) continue; if (ret < 0) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index adda14c1b763..e243a8eac910 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -93,6 +93,11 @@ struct extent_buffer { #endif }; +struct btrfs_eb_write_context { + struct writeback_control *wbc; + struct extent_buffer *eb; +}; + /* * Get the correct offset inside the page of extent buffer. * -- cgit v1.2.3 From 7db94301a980c9da4168ac7ce61e7bde297306ba Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 8 Aug 2023 01:12:32 +0900 Subject: btrfs: zoned: introduce block group context to btrfs_eb_write_context For metadata write-out in zoned mode, we call btrfs_check_meta_write_pointer() to check if an extent buffer to be written is aligned to the write pointer. We look up a block group containing the extent buffer for every extent buffer, which takes unnecessary effort as the writing extent buffers are mostly contiguous. Introduce "zoned_bg" to cache the block group we are working on. Also, while at it, rename "cache" to "block_group".
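The caching pattern itself is generic: remember the last block group returned, keep using it while the next extent buffer still falls inside its [start, start + length) range, and only then drop it and look it up again. Below is a minimal user-space sketch of that idea; all types and names here are simplified stand-ins rather than the kernel's, and free() stands in for dropping a block group reference.

#include <stdio.h>
#include <stdlib.h>

typedef unsigned long long u64;

struct block_group {
	u64 start;
	u64 length;
};

/* Stand-in for btrfs_lookup_block_group(): one "expensive" lookup per call. */
static struct block_group *lookup_block_group(u64 addr)
{
	struct block_group *bg = malloc(sizeof(*bg));

	/* Pretend block groups are 1 GiB aligned regions. */
	bg->start = addr & ~((1ULL << 30) - 1);
	bg->length = 1ULL << 30;
	printf("lookup for %llu\n", addr);
	return bg;
}

/* Reuse the cached group while @addr stays inside it. */
static struct block_group *get_cached_bg(struct block_group **cached, u64 addr)
{
	if (*cached && (addr < (*cached)->start ||
			addr >= (*cached)->start + (*cached)->length)) {
		free(*cached);          /* the kernel drops a refcount instead */
		*cached = NULL;
	}
	if (!*cached)
		*cached = lookup_block_group(addr);
	return *cached;
}

int main(void)
{
	struct block_group *cached = NULL;

	/* Contiguous metadata writes: only the first call does a lookup. */
	for (u64 eb = 0; eb < 5 * 16384; eb += 16384)
		get_cached_bg(&cached, eb);
	free(cached);
	return 0;
}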
Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 15 +++++++-------- fs/btrfs/extent_io.h | 2 ++ fs/btrfs/zoned.c | 35 ++++++++++++++++++++--------------- fs/btrfs/zoned.h | 6 ++---- 4 files changed, 31 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3ee83ff7f910..cdd7118e5848 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1809,7 +1809,6 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) { struct writeback_control *wbc = ctx->wbc; struct address_space *mapping = page->mapping; - struct btrfs_block_group *cache = NULL; struct extent_buffer *eb; int ret; @@ -1847,7 +1846,7 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) ctx->eb = eb; - if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) { + if (!btrfs_check_meta_write_pointer(eb->fs_info, ctx)) { /* * If for_sync, this hole will be filled with * trasnsaction commit. @@ -1861,18 +1860,15 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) } if (!lock_extent_buffer_for_io(eb, wbc)) { - btrfs_revert_meta_write_pointer(cache, eb); - if (cache) - btrfs_put_block_group(cache); + btrfs_revert_meta_write_pointer(ctx->zoned_bg, eb); free_extent_buffer(eb); return 0; } - if (cache) { + if (ctx->zoned_bg) { /* * Implies write in zoned mode. Mark the last eb in a block group. */ - btrfs_schedule_zone_finish_bg(cache, eb); - btrfs_put_block_group(cache); + btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb); } write_one_eb(eb, wbc); free_extent_buffer(eb); @@ -1985,6 +1981,9 @@ retry: ret = 0; if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; + + if (ctx.zoned_bg) + btrfs_put_block_group(ctx.zoned_bg); btrfs_zoned_meta_io_unlock(fs_info); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index e243a8eac910..68368ba99321 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -96,6 +96,8 @@ struct extent_buffer { struct btrfs_eb_write_context { struct writeback_control *wbc; struct extent_buffer *eb; + /* Block group @eb resides in. Only used for zoned mode. 
*/ + struct btrfs_block_group *zoned_bg; }; /* diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b43c4536e685..da6bdbe68c7b 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1748,30 +1748,35 @@ out: } bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_block_group **cache_ret) + struct btrfs_eb_write_context *ctx) { - struct btrfs_block_group *cache; - bool ret = true; + const struct extent_buffer *eb = ctx->eb; + struct btrfs_block_group *block_group = ctx->zoned_bg; if (!btrfs_is_zoned(fs_info)) return true; - cache = btrfs_lookup_block_group(fs_info, eb->start); - if (!cache) - return true; + if (block_group) { + if (block_group->start > eb->start || + block_group->start + block_group->length <= eb->start) { + btrfs_put_block_group(block_group); + block_group = NULL; + ctx->zoned_bg = NULL; + } + } - if (cache->meta_write_pointer != eb->start) { - btrfs_put_block_group(cache); - cache = NULL; - ret = false; - } else { - cache->meta_write_pointer = eb->start + eb->len; + if (!block_group) { + block_group = btrfs_lookup_block_group(fs_info, eb->start); + if (!block_group) + return true; + ctx->zoned_bg = block_group; } - *cache_ret = cache; + if (block_group->meta_write_pointer != eb->start) + return false; + block_group->meta_write_pointer = eb->start + eb->len; - return ret; + return true; } void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 27322b926038..49d5bd87245c 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -59,8 +59,7 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_block_group **cache_ret); + struct btrfs_eb_write_context *ctx); void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, struct extent_buffer *eb); int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); @@ -190,8 +189,7 @@ static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) } static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_block_group **cache_ret) + struct btrfs_eb_write_context *ctx) { return true; } -- cgit v1.2.3 From 2ad8c0510a965113404cfe670b41ddc34fb66100 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 8 Aug 2023 01:12:33 +0900 Subject: btrfs: zoned: return int from btrfs_check_meta_write_pointer Now that we have writeback_control passed to btrfs_check_meta_write_pointer(), we can move the wbc condition in submit_eb_page() to btrfs_check_meta_write_pointer() and return int. 
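The int return turns a boolean-plus-side-channel into ordinary errno-style control flow: 0 means the buffer sits exactly at the write pointer, -EAGAIN means a hole the writeback loop must retry later, and -EBUSY means a hole the caller may silently skip. A user-space sketch of the caller-side mapping follows; the check function here is a stub that never reports -EAGAIN, whereas the real one picks -EAGAIN or -EBUSY based on the wbc sync mode.

#include <errno.h>
#include <stdio.h>

/* Stub for the write pointer check: 0, -EAGAIN or -EBUSY in the real code. */
static int check_meta_write_pointer(unsigned long long eb_start,
				    unsigned long long write_pointer)
{
	if (eb_start == write_pointer)
		return 0;
	/* A hole: the caller decides whether it is fatal via the code. */
	return -EBUSY;
}

static int submit_one(unsigned long long eb_start, unsigned long long wp)
{
	int ret = check_meta_write_pointer(eb_start, wp);

	if (ret) {
		/* -EBUSY means "skip this buffer", not an error. */
		if (ret == -EBUSY)
			ret = 0;
		return ret;   /* -EAGAIN would propagate to the writeback loop */
	}
	printf("submitting eb at %llu\n", eb_start);
	return 1;             /* submitted */
}

int main(void)
{
	printf("%d\n", submit_one(16384, 16384)); /* 1: submitted */
	printf("%d\n", submit_one(32768, 16384)); /* 0: skipped */
	return 0;
}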
Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 +++-------- fs/btrfs/zoned.c | 30 ++++++++++++++++++++++-------- fs/btrfs/zoned.h | 10 +++++----- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cdd7118e5848..84192a63511c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1846,14 +1846,9 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) ctx->eb = eb; - if (!btrfs_check_meta_write_pointer(eb->fs_info, ctx)) { - /* - * If for_sync, this hole will be filled with - * trasnsaction commit. - */ - if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) - ret = -EAGAIN; - else + ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx); + if (ret) { + if (ret == -EBUSY) ret = 0; free_extent_buffer(eb); return ret; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index da6bdbe68c7b..c8ff94176381 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1747,14 +1747,23 @@ out: } } -bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct btrfs_eb_write_context *ctx) +/* + * Check if @ctx->eb is aligned to the write pointer. + * + * Return: + * 0: @ctx->eb is at the write pointer. You can write it. + * -EAGAIN: There is a hole. The caller should handle the case. + * -EBUSY: There is a hole, but the caller can just bail out. + */ +int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx) { + const struct writeback_control *wbc = ctx->wbc; const struct extent_buffer *eb = ctx->eb; struct btrfs_block_group *block_group = ctx->zoned_bg; if (!btrfs_is_zoned(fs_info)) - return true; + return 0; if (block_group) { if (block_group->start > eb->start || @@ -1768,15 +1777,20 @@ bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, if (!block_group) { block_group = btrfs_lookup_block_group(fs_info, eb->start); if (!block_group) - return true; + return 0; ctx->zoned_bg = block_group; } - if (block_group->meta_write_pointer != eb->start) - return false; - block_group->meta_write_pointer = eb->start + eb->len; + if (block_group->meta_write_pointer == eb->start) { + block_group->meta_write_pointer = eb->start + eb->len; - return true; + return 0; + } + + /* If for_sync, this hole will be filled with trasnsaction commit. 
*/ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) + return -EAGAIN; + return -EBUSY; } void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 49d5bd87245c..c0859d8be152 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -58,8 +58,8 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); -bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct btrfs_eb_write_context *ctx); +int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx); void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, struct extent_buffer *eb); int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); @@ -188,10 +188,10 @@ static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { } -static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct btrfs_eb_write_context *ctx) +static inline int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx) { - return true; + return 0; } static inline void btrfs_revert_meta_write_pointer( -- cgit v1.2.3 From 0356ad41e0ddb8cf0ea4d68820c92598413e445b Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 8 Aug 2023 01:12:34 +0900 Subject: btrfs: zoned: defer advancing meta write pointer We currently advance the meta_write_pointer in btrfs_check_meta_write_pointer(). That makes it necessary to revert it when locking the buffer failed. Instead, we can advance it just before sending the buffer. Also, this is necessary for the following commit. In the commit, it needs to release the zoned_meta_io_lock to allow IOs to come in and wait for them to fill the currently active block group. If we advance the meta_write_pointer before locking the extent buffer, the following extent buffer can pass the meta_write_pointer check, resulting in an unaligned write failure. Advancing the pointer is still thread-safe as the extent buffer is locked. Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 7 +++---- fs/btrfs/zoned.c | 15 +-------------- fs/btrfs/zoned.h | 8 -------- 3 files changed, 4 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 84192a63511c..ac3fca5a5e41 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1855,15 +1855,14 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) } if (!lock_extent_buffer_for_io(eb, wbc)) { - btrfs_revert_meta_write_pointer(ctx->zoned_bg, eb); free_extent_buffer(eb); return 0; } + /* Implies write in zoned mode. */ if (ctx->zoned_bg) { - /* - * Implies write in zoned mode. Mark the last eb in a block group. - */ + /* Mark the last eb in the block group. 
*/ btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb); + ctx->zoned_bg->meta_write_pointer += eb->len; } write_one_eb(eb, wbc); free_extent_buffer(eb); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c8ff94176381..9af3cd90ed26 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1781,11 +1781,8 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, ctx->zoned_bg = block_group; } - if (block_group->meta_write_pointer == eb->start) { - block_group->meta_write_pointer = eb->start + eb->len; - + if (block_group->meta_write_pointer == eb->start) return 0; - } /* If for_sync, this hole will be filled with trasnsaction commit. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) @@ -1793,16 +1790,6 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, return -EBUSY; } -void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, - struct extent_buffer *eb) -{ - if (!btrfs_is_zoned(eb->fs_info) || !cache) - return; - - ASSERT(cache->meta_write_pointer == eb->start + eb->len); - cache->meta_write_pointer = eb->start; -} - int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) { if (!btrfs_dev_is_sequential(device, physical)) diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index c0859d8be152..74ec37a25808 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -60,8 +60,6 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct btrfs_eb_write_context *ctx); -void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, - struct extent_buffer *eb); int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); @@ -194,12 +192,6 @@ static inline int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, return 0; } -static inline void btrfs_revert_meta_write_pointer( - struct btrfs_block_group *cache, - struct extent_buffer *eb) -{ -} - static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) { -- cgit v1.2.3 From c1c3c2bc2917d09bbb19cb9894ba5950254b9502 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 8 Aug 2023 01:12:35 +0900 Subject: btrfs: zoned: update meta write pointer on zone finish On finishing a zone, the meta_write_pointer should be set to the end of the zone to reflect the actual write pointer position.
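In other words, once a zone is finished nothing more can be written to it, so the metadata write pointer must land at the end of the usable space, start + zone_capacity, or later write-pointer checks would wrongly report a hole. A hedged arithmetic sketch of that update, where the field names follow the patch but the struct is only a stand-in:

#include <assert.h>

typedef unsigned long long u64;

struct bg {
	u64 start;
	u64 zone_capacity;
	u64 meta_write_pointer;
};

/* Model of the zone-finish update for metadata/system block groups. */
static void zone_finish(struct bg *bg)
{
	bg->meta_write_pointer = bg->start + bg->zone_capacity;
}

int main(void)
{
	struct bg bg = { .start = 1ULL << 30, .zone_capacity = 256 << 20 };

	zone_finish(&bg);
	/* No eb->start can match the pointer inside this zone anymore. */
	assert(bg.meta_write_pointer == bg.start + bg.zone_capacity);
	return 0;
}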
Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 9af3cd90ed26..d7188680c8c6 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2049,6 +2049,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); block_group->alloc_offset = block_group->zone_capacity; + if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) + block_group->meta_write_pointer = block_group->start + + block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; btrfs_clear_treelog_bg(block_group); btrfs_clear_data_reloc_bg(block_group); -- cgit v1.2.3 From a7e1ac7bdc5af91af2d52e6269fdbd92fe9ee353 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 8 Aug 2023 01:12:36 +0900 Subject: btrfs: zoned: reserve zones for an active metadata/system block group Ensure a metadata and system block group can be activated on write time, by leaving a certain number of active zones when trying to activate a data block group. Zones for two metadata block groups (normal and tree-log) and one system block group are reserved, according to the profile type: two zones per block group on the DUP profile and one zone per block group otherwise. The reservation must be freed once a non-data block group is allocated. If not, we over-reserve the active zones and data block group activation will suffer. For the dynamic reservation count, we need to manage the reservation count per device. The reservation count variable is protected by fs_info->zone_active_bgs_lock. Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 ++ fs/btrfs/zoned.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/zoned.h | 8 +++++ 3 files changed, 98 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a38d3f9a1a03..5022a3fd778d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3467,6 +3467,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_free_zone_cache(fs_info); + btrfs_check_active_zone_reservation(fs_info); + if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index d7188680c8c6..fc69041bb6b4 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1889,6 +1889,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) struct map_lookup *map; struct btrfs_device *device; u64 physical; + const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA); bool ret; int i; @@ -1910,19 +1911,40 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } + spin_lock(&fs_info->zone_active_bgs_lock); for (i = 0; i < map->num_stripes; i++) { + struct btrfs_zoned_device_info *zinfo; + int reserved = 0; + device = map->stripes[i].dev; physical = map->stripes[i].physical; + zinfo = device->zone_info; - if (device->zone_info->max_active_zones == 0) + if (zinfo->max_active_zones == 0) continue; + if (is_data) + reserved = zinfo->reserved_active_zones; + /* + * For the data block group, leave active zones for one + * metadata block group and one system block group. 
+ */ + if (atomic_read(&zinfo->active_zones_left) <= reserved) { + ret = false; + spin_unlock(&fs_info->zone_active_bgs_lock); + goto out_unlock; + } + if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; + spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } + if (!is_data) + zinfo->reserved_active_zones--; } + spin_unlock(&fs_info->zone_active_bgs_lock); /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); @@ -2061,18 +2083,21 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; const u64 physical = map->stripes[i].physical; + struct btrfs_zoned_device_info *zinfo = device->zone_info; - if (device->zone_info->max_active_zones == 0) + if (zinfo->max_active_zones == 0) continue; ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, physical >> SECTOR_SHIFT, - device->zone_info->zone_size >> SECTOR_SHIFT, + zinfo->zone_size >> SECTOR_SHIFT, GFP_NOFS); if (ret) return ret; + if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + zinfo->reserved_active_zones++; btrfs_dev_clear_active_zone(device, physical); } @@ -2111,8 +2136,10 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) /* Check if there is a device with active zones left */ mutex_lock(&fs_info->chunk_mutex); + spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { struct btrfs_zoned_device_info *zinfo = device->zone_info; + int reserved = 0; if (!device->bdev) continue; @@ -2122,17 +2149,21 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) break; } + if (flags & BTRFS_BLOCK_GROUP_DATA) + reserved = zinfo->reserved_active_zones; + switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case 0: /* single */ - ret = (atomic_read(&zinfo->active_zones_left) >= 1); + ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved)); break; case BTRFS_BLOCK_GROUP_DUP: - ret = (atomic_read(&zinfo->active_zones_left) >= 2); + ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved)); break; } if (ret) break; } + spin_unlock(&fs_info->zone_active_bgs_lock); mutex_unlock(&fs_info->chunk_mutex); if (!ret) @@ -2374,3 +2405,55 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, return 0; } + +/* + * Reserve zones for one metadata block group, one tree-log block group, and one + * system block group. + */ +void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_block_group *block_group; + struct btrfs_device *device; + /* Reserve zones for normal SINGLE metadata and tree-log block group. */ + unsigned int metadata_reserve = 2; + /* Reserve a zone for SINGLE system block group. */ + unsigned int system_reserve = 1; + + if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) + return; + + /* + * This function is called from the mount context. So, there is no + * parallel process touching the bits. No need for read_seqretry(). + */ + if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP) + metadata_reserve = 4; + if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP) + system_reserve = 2; + + /* Apply the reservation on all the devices. 
*/ + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + + device->zone_info->reserved_active_zones = + metadata_reserve + system_reserve; + } + mutex_unlock(&fs_devices->device_list_mutex); + + /* Release reservation for currently active block groups. */ + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + struct map_lookup *map = block_group->physical_map; + + if (!(block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) + continue; + + for (int i = 0; i < map->num_stripes; i++) + map->stripes[i].dev->zone_info->reserved_active_zones--; + } + spin_unlock(&fs_info->zone_active_bgs_lock); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 74ec37a25808..b9cec523b778 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -22,6 +22,11 @@ struct btrfs_zoned_device_info { u8 zone_size_shift; u32 nr_zones; unsigned int max_active_zones; + /* + * Reserved active zones for one metadata and one system block group. + * It can vary per-device depending on the allocation status. + */ + int reserved_active_zones; atomic_t active_zones_left; unsigned long *seq_zones; unsigned long *empty_zones; @@ -78,6 +83,7 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, bool do_finish); +void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -252,6 +258,8 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, return 0; } +static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { } + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) -- cgit v1.2.3 From 13bb483d32abb6f8ebd40141d87eb68f11cc2dd2 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 8 Aug 2023 01:12:37 +0900 Subject: btrfs: zoned: activate metadata block group on write time In the current implementation, block groups are activated at reservation time to ensure that all reserved bytes can be written to an active metadata block group. However, this approach has proven to be less efficient, as it activates block groups more frequently than necessary, putting pressure on the active zone resource and leading to potential issues such as early ENOSPC or hung_task. Another drawback of the current method is that it hampers metadata over-commit, and necessitates additional flush operations and block group allocations, resulting in decreased overall performance. To address these issues, this commit introduces a write-time activation of metadata and system block group. This involves reserving at least one active block group specifically for a metadata and system block group. Since metadata write-out is always allocated sequentially, when we need to write to a non-active block group, we can wait for the ongoing IOs to complete, activate a new block group, and then proceed with writing to the new block group. 
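The scheme only works because metadata allocation is sequential: when an extent buffer targets a block group that is not yet active, every buffer before it in the same group has already been issued, so the writer can drain them, finish the old group and activate the new one. A rough user-space model of that pivot is sketched below; all names and types are hypothetical stand-ins, and the real logic lives in btrfs_check_meta_write_pointer() and the check_bg_is_active() helper added by this patch.

#include <stdbool.h>
#include <stdio.h>

struct group {
	bool active;
	int inflight_ios;
};

static bool try_activate(struct group *g)
{
	g->active = true;   /* may fail in reality if no active zones are left */
	return true;
}

/* Pivot from @old to @fresh before writing to @fresh. */
static bool make_writable(struct group *old, struct group *fresh)
{
	if (fresh->active)
		return true;
	while (old->inflight_ios)         /* wait for ongoing IO ...          */
		old->inflight_ios--;      /* (stands in for real waiting)     */
	old->active = false;              /* zone-finish the old group        */
	return try_activate(fresh);       /* then activate the new one        */
}

int main(void)
{
	struct group meta_a = { .active = true, .inflight_ios = 3 };
	struct group meta_b = { 0 };

	printf("%d\n", make_writable(&meta_a, &meta_b)); /* 1 */
	return 0;
}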
Fixes: b09315139136 ("btrfs: zoned: activate metadata block group on flush_space") CC: stable@vger.kernel.org # 6.1+ Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 11 +++++++ fs/btrfs/fs.h | 3 ++ fs/btrfs/zoned.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 93 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a127865f49f9..b0e432c30e1d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -4287,6 +4287,17 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; + if (btrfs_is_zoned(info)) { + if (info->active_meta_bg) { + btrfs_put_block_group(info->active_meta_bg); + info->active_meta_bg = NULL; + } + if (info->active_system_bg) { + btrfs_put_block_group(info->active_system_bg); + info->active_system_bg = NULL; + } + } + write_lock(&info->block_group_cache_lock); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index ef07c6c252d8..a523d64d5491 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -770,6 +770,9 @@ struct btrfs_fs_info { u64 data_reloc_bg; struct mutex zoned_data_reloc_io_lock; + struct btrfs_block_group *active_meta_bg; + struct btrfs_block_group *active_system_bg; + u64 nr_global_roots; spinlock_t zone_active_bgs_lock; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index fc69041bb6b4..099cb6a6d3b3 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -65,6 +65,9 @@ #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) +static void wait_eb_writebacks(struct btrfs_block_group *block_group); +static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written); + static inline bool sb_zone_is_full(const struct blk_zone *zone) { return (zone->cond == BLK_ZONE_COND_FULL) || @@ -1747,6 +1750,62 @@ out: } } +static bool check_bg_is_active(struct btrfs_eb_write_context *ctx, + struct btrfs_block_group **active_bg) +{ + const struct writeback_control *wbc = ctx->wbc; + struct btrfs_block_group *block_group = ctx->zoned_bg; + struct btrfs_fs_info *fs_info = block_group->fs_info; + + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) + return true; + + if (fs_info->treelog_bg == block_group->start) { + if (!btrfs_zone_activate(block_group)) { + int ret_fin = btrfs_zone_finish_one_bg(fs_info); + + if (ret_fin != 1 || !btrfs_zone_activate(block_group)) + return false; + } + } else if (*active_bg != block_group) { + struct btrfs_block_group *tgt = *active_bg; + + /* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */ + lockdep_assert_held(&fs_info->zoned_meta_io_lock); + + if (tgt) { + /* + * If there is an unsent IO left in the allocated area, + * we cannot wait for them as it may cause a deadlock. + */ + if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) { + if (wbc->sync_mode == WB_SYNC_NONE || + (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)) + return false; + } + + /* Pivot active metadata/system block group. 
*/ + btrfs_zoned_meta_io_unlock(fs_info); + wait_eb_writebacks(tgt); + do_zone_finish(tgt, true); + btrfs_zoned_meta_io_lock(fs_info); + if (*active_bg == tgt) { + btrfs_put_block_group(tgt); + *active_bg = NULL; + } + } + if (!btrfs_zone_activate(block_group)) + return false; + if (*active_bg != block_group) { + ASSERT(*active_bg == NULL); + *active_bg = block_group; + btrfs_get_block_group(block_group); + } + } + + return true; +} + /* * Check if @ctx->eb is aligned to the write pointer. * @@ -1781,8 +1840,26 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, ctx->zoned_bg = block_group; } - if (block_group->meta_write_pointer == eb->start) - return 0; + if (block_group->meta_write_pointer == eb->start) { + struct btrfs_block_group **tgt; + + if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) + return 0; + + if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) + tgt = &fs_info->active_system_bg; + else + tgt = &fs_info->active_meta_bg; + if (check_bg_is_active(ctx, tgt)) + return 0; + } + + /* + * Since we may release fs_info->zoned_meta_io_lock, someone can already + * start writing this eb. In that case, we can just bail out. + */ + if (block_group->meta_write_pointer > eb->start) + return -EBUSY; /* If for_sync, this hole will be filled with trasnsaction commit. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) -- cgit v1.2.3 From 6a8ebc773ef64c8f12d6d60fd6e53d5ccc81314b Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 8 Aug 2023 01:12:38 +0900 Subject: btrfs: zoned: no longer count fresh BG region as zone unusable Now that we switched to write time activation, we no longer need to (and must not) count the fresh region as zone unusable. This commit is similar to revert of commit fa2068d7e922b434eb ("btrfs: zoned: count fresh BG region as zone unusable"). Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 8 +------- fs/btrfs/zoned.c | 26 +++----------------------- 2 files changed, 4 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cd5bfda2c259..27fad70451aa 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2704,13 +2704,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); spin_lock(&ctl->tree_lock); - /* Count initial region as zone_unusable until it gets activated. */ if (!used) to_free = size; - else if (initial && - test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) && - (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) - to_free = 0; else if (initial) to_free = block_group->zone_capacity; else if (offset >= block_group->alloc_offset) @@ -2738,8 +2733,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, reclaimable_unusable = block_group->zone_unusable - (block_group->length - block_group->zone_capacity); /* All the region is now unusable. 
Mark it as unused and reclaim */ - if (block_group->zone_unusable == block_group->length && - block_group->alloc_offset) { + if (block_group->zone_unusable == block_group->length) { btrfs_mark_bg_unused(block_group); } else if (bg_reclaim_threshold && reclaimable_unusable >= diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 099cb6a6d3b3..52f49b0d2dee 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1586,19 +1586,9 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) return; WARN_ON(cache->bytes_super != 0); - - /* Check for block groups never get activated */ - if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) && - cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) && - !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) && - cache->alloc_offset == 0) { - unusable = cache->length; - free = 0; - } else { - unusable = (cache->alloc_offset - cache->used) + - (cache->length - cache->zone_capacity); - free = cache->zone_capacity - cache->alloc_offset; - } + unusable = (cache->alloc_offset - cache->used) + + (cache->length - cache->zone_capacity); + free = cache->zone_capacity - cache->alloc_offset; /* We only need ->free_space in ALLOC_SEQ block groups */ cache->cached = BTRFS_CACHE_FINISHED; @@ -1962,7 +1952,6 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_space_info *space_info = block_group->space_info; struct map_lookup *map; struct btrfs_device *device; u64 physical; @@ -1975,7 +1964,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; - spin_lock(&space_info->lock); spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; @@ -2025,14 +2013,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); - WARN_ON(block_group->alloc_offset != 0); - if (block_group->zone_unusable == block_group->length) { - block_group->zone_unusable = block_group->length - block_group->zone_capacity; - space_info->bytes_zone_unusable -= block_group->zone_capacity; - } spin_unlock(&block_group->lock); - btrfs_try_granting_tickets(fs_info, space_info); - spin_unlock(&space_info->lock); /* For the active block group list */ btrfs_get_block_group(block_group); @@ -2045,7 +2026,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); return ret; } -- cgit v1.2.3 From 5a7d107e5ef9b452cf0aa96ac931b29ea980d997 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 8 Aug 2023 01:12:39 +0900 Subject: btrfs: zoned: don't activate non-DATA BG on allocation Now that a non-DATA block group is activated at write time, don't activate it on allocation time. 
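The resulting policy, shown in the diff that follows, can be summarized in code form; the flag values below are placeholders for illustration, not the kernel's real bit definitions:

#include <stdbool.h>
#include <stdio.h>

#define BG_DATA     (1u << 0)   /* placeholder flag bits */
#define BG_METADATA (1u << 1)
#define BG_SYSTEM   (1u << 2)

/* Should a freshly allocated block group be activated right away? */
static bool activate_on_allocation(unsigned int flags)
{
	return flags & BG_DATA;   /* non-DATA waits for write time */
}

int main(void)
{
	printf("data: %d\n", activate_on_allocation(BG_DATA));     /* 1 */
	printf("meta: %d\n", activate_on_allocation(BG_METADATA)); /* 0 */
	printf("sys:  %d\n", activate_on_allocation(BG_SYSTEM));   /* 0 */
	return 0;
}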
Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/extent-tree.c | 8 +++++++- fs/btrfs/space-info.c | 28 ---------------------------- 3 files changed, 8 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index b0e432c30e1d..0cb1dee965a0 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -4089,7 +4089,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, if (IS_ERR(ret_bg)) { ret = PTR_ERR(ret_bg); - } else if (from_extent_allocation) { + } else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) { /* * New block group is likely to be used soon. Try to activate * it now. Failure is OK for now. diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6dfe086523f9..bc9ff192196f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3690,7 +3690,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, } spin_unlock(&block_group->lock); - if (!ret && !btrfs_zone_activate(block_group)) { + /* Metadata block group is activated at write time. */ + if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && + !btrfs_zone_activate(block_group)) { ret = 1; /* * May need to clear fs_info->{treelog,data_reloc}_bg. @@ -3866,6 +3868,10 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { + /* Block group's activeness is not a requirement for METADATA block groups. */ + if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + /* If we can activate new zone, just allocate a chunk and use it */ if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) return 0; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 17c86db7b1b1..356638f54fef 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -761,18 +761,6 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case ALLOC_CHUNK: case ALLOC_CHUNK_FORCE: - /* - * For metadata space on zoned filesystem, reaching here means we - * don't have enough space left in active_total_bytes. Try to - * activate a block group first, because we may have inactive - * block group already allocated. - */ - ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false); - if (ret < 0) - break; - else if (ret == 1) - break; - trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -784,22 +772,6 @@ static void flush_space(struct btrfs_fs_info *fs_info, CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); - /* - * For metadata space on zoned filesystem, allocating a new chunk - * is not enough. We still need to activate the block * group. - * Active the newly allocated block group by (maybe) finishing - * a block group. - */ - if (ret == 1) { - ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); - /* - * Revert to the original ret regardless we could finish - * one block group or not. - */ - if (ret >= 0) - ret = 1; - } - if (ret > 0 || ret == -ENOSPC) ret = 0; break; -- cgit v1.2.3 From 5b135b382a360f4c87cf8896d1465b0b07f10cb0 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 8 Aug 2023 01:12:40 +0900 Subject: btrfs: zoned: re-enable metadata over-commit for zoned mode Now we can re-enable metadata over-commit. As we moved the activation from reservation time to write time, we no longer need to ensure all the reserved bytes are properly activated.
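Over-commit is ultimately the small check visible in the hunk below: a reservation is allowed when used + bytes < total_bytes + avail, where avail is only an estimate of space that could still be allocated as chunks. A toy model of that comparison, with the numbers invented for illustration:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long u64;

/* Simplified btrfs_can_overcommit(): avail is estimated, not guaranteed. */
static bool can_overcommit(u64 used, u64 total, u64 avail, u64 bytes)
{
	return used + bytes < total + avail;
}

int main(void)
{
	u64 mib = 1ULL << 20;

	/* 900 MiB used of 1 GiB, ~2 GiB unallocated: 512 MiB still fits. */
	printf("%d\n", can_overcommit(900 * mib, 1024 * mib,
				      2048 * mib, 512 * mib)); /* 1 */
	/* Same request with nothing left to allocate: refused. */
	printf("%d\n", can_overcommit(900 * mib, 1024 * mib,
				      0, 512 * mib));          /* 0 */
	return 0;
}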
Without the metadata over-commit, the filesystem suffers from lower performance because it needs to flush the delalloc items more often and allocate more block groups. Re-enabling metadata over-commit will solve the issue. Fixes: 79417d040f4f ("btrfs: zoned: disable metadata overcommit for zoned") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 356638f54fef..d7e8cd4f140c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -389,11 +389,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags) && - (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) - avail = 0; - else - avail = calc_available_free_space(fs_info, space_info, flush); + avail = calc_available_free_space(fs_info, space_info, flush); if (used + bytes < space_info->total_bytes + avail) return 1; -- cgit v1.2.3 From 257614301a5db9f7b0548584ca207ad7785c8b89 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 9 Aug 2023 15:08:21 +0800 Subject: btrfs: handle errors properly in update_inline_extent_backref() [PROBLEM] Inside function update_inline_extent_backref(), we have several BUG_ON()s along with some ASSERT()s which can be triggered by a corrupted filesystem. [ANALYSIS] Most of those BUG_ON()s and ASSERT()s are just a way of handling unexpected on-disk data. Although we have the tree-checker to rule out obviously incorrect extent tree blocks, it's not enough for these cases. Thus we need proper error handling for them. [FIX] Thankfully all the callers of update_inline_extent_backref() would eventually handle the error by aborting the current transaction. So this patch would do the proper error handling by: - Make update_inline_extent_backref() return int The return value would be either 0 or -EUCLEAN. - Replace BUG_ON()s and ASSERT()s with proper error handling This includes: * Dump the bad extent tree leaf * Output an error message for the cause This would include the extent bytenr, num_bytes (if needed), the bad values and expected good values. * Return -EUCLEAN Note here we remove all the WARN_ON()s, as eventually the transaction would be aborted, thus a backtrace would be triggered anyway.
- Better comments on why we expect refs == 1 and refs_to_mod == -1 for tree blocks Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 73 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bc9ff192196f..3c62a7eaf24b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -381,11 +381,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, } } + WARN_ON(1); btrfs_print_leaf(eb); btrfs_err(eb->fs_info, "eb %llu iref 0x%lx invalid extent inline ref type %d", eb->start, (unsigned long)iref, type); - WARN_ON(1); return BTRFS_REF_TYPE_INVALID; } @@ -1058,13 +1058,13 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, /* * helper to update/remove inline back ref */ -static noinline_for_stack -void update_inline_extent_backref(struct btrfs_path *path, +static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; struct btrfs_extent_data_ref *dref = NULL; struct btrfs_shared_data_ref *sref = NULL; @@ -1077,18 +1077,33 @@ void update_inline_extent_backref(struct btrfs_path *path, ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); - WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); + if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) { + struct btrfs_key key; + u32 extent_size; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type == BTRFS_METADATA_ITEM_KEY) + extent_size = fs_info->nodesize; + else + extent_size = key.offset; + btrfs_print_leaf(leaf); + btrfs_err(fs_info, + "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu", + key.objectid, extent_size, refs_to_mod, refs); + return -EUCLEAN; + } refs += refs_to_mod; btrfs_set_extent_refs(leaf, ei, refs); if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); /* - * If type is invalid, we should have bailed out after - * lookup_inline_extent_backref(). + * Function btrfs_get_extent_inline_ref_type() has already printed + * error messages. */ - type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); - ASSERT(type != BTRFS_REF_TYPE_INVALID); + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) + return -EUCLEAN; if (type == BTRFS_EXTENT_DATA_REF_KEY) { dref = (struct btrfs_extent_data_ref *)(&iref->offset); @@ -1098,10 +1113,43 @@ void update_inline_extent_backref(struct btrfs_path *path, refs = btrfs_shared_data_ref_count(leaf, sref); } else { refs = 1; + /* + * For tree blocks we can only drop one ref for it, and tree + * blocks should not have refs > 1. + * + * Furthermore if we're inserting a new inline backref, we + * won't reach this path either. That would be + * setup_inline_extent_backref().
+ */ + if (unlikely(refs_to_mod != -1)) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + btrfs_print_leaf(leaf); + btrfs_err(fs_info, + "invalid refs_to_mod for tree block %llu, has %d expect -1", + key.objectid, refs_to_mod); + return -EUCLEAN; + } } - BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); + if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) { + struct btrfs_key key; + u32 extent_size; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type == BTRFS_METADATA_ITEM_KEY) + extent_size = fs_info->nodesize; + else + extent_size = key.offset; + btrfs_print_leaf(leaf); + btrfs_err(fs_info, +"invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu", + (unsigned long)iref, key.objectid, extent_size, + refs_to_mod, refs); + return -EUCLEAN; + } refs += refs_to_mod; if (refs > 0) { @@ -1121,6 +1169,7 @@ void update_inline_extent_backref(struct btrfs_path *path, btrfs_truncate_item(path, item_size, 1); } btrfs_mark_buffer_dirty(leaf); + return 0; } static noinline_for_stack @@ -1149,7 +1198,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, bytenr, num_bytes, root_objectid, path->slots[0]); return -EUCLEAN; } - update_inline_extent_backref(path, iref, refs_to_add, extent_op); + ret = update_inline_extent_backref(path, iref, refs_to_add, extent_op); } else if (ret == -ENOENT) { setup_inline_extent_backref(trans->fs_info, path, iref, parent, root_objectid, owner, offset, @@ -1169,7 +1218,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, BUG_ON(!is_data && refs_to_drop != 1); if (iref) - update_inline_extent_backref(path, iref, -refs_to_drop, NULL); + ret = update_inline_extent_backref(path, iref, -refs_to_drop, NULL); else if (is_data) ret = remove_extent_data_ref(trans, root, path, refs_to_drop); else -- cgit v1.2.3 From 84af994b85b89ae405b8ea930653543bc5b864d3 Mon Sep 17 00:00:00 2001 From: Ruan Jinjie Date: Thu, 10 Aug 2023 11:00:22 +0800 Subject: btrfs: use LIST_HEAD() to initialize the list_head Use LIST_HEAD() to initialize the list_head instead of open-coding it. 
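For reference, the two forms side by side (a minimal hypothetical example, not taken from the patch; LIST_HEAD(name) expands to a definition initialized with LIST_HEAD_INIT(name)):

	#include <linux/bug.h>
	#include <linux/list.h>

	/* Illustration only: open-coded initialization vs. the LIST_HEAD() macro. */
	static void list_head_example(void)
	{
		struct list_head open_coded;	/* old style: declare first ... */
		LIST_HEAD(one_liner);		/* new style: declare and initialize */

		INIT_LIST_HEAD(&open_coded);	/* ... then initialize at run time */

		/* Both are now valid, empty list heads. */
		WARN_ON(!list_empty(&open_coded) || !list_empty(&one_liner));
	}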
Signed-off-by: Ruan Jinjie Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 +++--------- fs/btrfs/file.c | 3 +-- fs/btrfs/inode.c | 17 +++++------------ fs/btrfs/ordered-data.c | 4 +--- fs/btrfs/send.c | 6 ++---- fs/btrfs/tree-log.c | 4 +--- 6 files changed, 13 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5022a3fd778d..48faf9c968d0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4554,9 +4554,7 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root) static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; - struct list_head splice; - - INIT_LIST_HEAD(&splice); + LIST_HEAD(splice); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); @@ -4662,9 +4660,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; - struct list_head splice; - - INIT_LIST_HEAD(&splice); + LIST_HEAD(splice); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); @@ -4697,9 +4693,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; - struct list_head splice; - - INIT_LIST_HEAD(&splice); + LIST_HEAD(splice); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fd03e689a6be..6edad7b9a5d3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3018,7 +3018,7 @@ static long btrfs_fallocate(struct file *file, int mode, struct extent_changeset *data_reserved = NULL; struct falloc_range *range; struct falloc_range *tmp; - struct list_head reserve_list; + LIST_HEAD(reserve_list); u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -3110,7 +3110,6 @@ static long btrfs_fallocate(struct file *file, int mode, btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); /* First, check if we exceed the qgroup limit */ - INIT_LIST_HEAD(&reserve_list); while (cur_offset < alloc_end) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, alloc_end - cur_offset); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 431007e443a8..f2ab27084cc4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5861,8 +5861,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct btrfs_key found_key; struct btrfs_path *path; void *addr; - struct list_head ins_list; - struct list_head del_list; + LIST_HEAD(ins_list); + LIST_HEAD(del_list); int ret; char *name_ptr; int name_len; @@ -5881,8 +5881,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) addr = private->filldir_buf; path->reada = READA_FORWARD; - INIT_LIST_HEAD(&ins_list); - INIT_LIST_HEAD(&del_list); put = btrfs_readdir_get_delayed_items(inode, private->last_index, &ins_list, &del_list); @@ -9244,14 +9242,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, struct btrfs_inode *binode; struct inode *inode; struct btrfs_delalloc_work *work, *next; - struct list_head works; - struct list_head splice; + LIST_HEAD(works); + LIST_HEAD(splice); int ret = 0; bool full_flush = wbc->nr_to_write == LONG_MAX; - INIT_LIST_HEAD(&works); - INIT_LIST_HEAD(&splice); - mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, 
&splice); @@ -9339,14 +9334,12 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, .range_end = LLONG_MAX, }; struct btrfs_root *root; - struct list_head splice; + LIST_HEAD(splice); int ret; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; - INIT_LIST_HEAD(&splice); - mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 109e80ed25b6..b46ab348e8e5 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -740,11 +740,9 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len) { struct btrfs_root *root; - struct list_head splice; + LIST_HEAD(splice); u64 done; - INIT_LIST_HEAD(&splice); - mutex_lock(&fs_info->ordered_operations_mutex); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8bfd44750efe..3a566150c531 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3685,7 +3685,7 @@ static void tail_append_pending_moves(struct send_ctx *sctx, static int apply_children_dir_moves(struct send_ctx *sctx) { struct pending_dir_move *pm; - struct list_head stack; + LIST_HEAD(stack); u64 parent_ino = sctx->cur_ino; int ret = 0; @@ -3693,7 +3693,6 @@ static int apply_children_dir_moves(struct send_ctx *sctx) if (!pm) return 0; - INIT_LIST_HEAD(&stack); tail_append_pending_moves(sctx, pm, &stack); while (!list_empty(&stack)) { @@ -4165,7 +4164,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) int ret = 0; struct recorded_ref *cur; struct recorded_ref *cur2; - struct list_head check_dirs; + LIST_HEAD(check_dirs); struct fs_path *valid_path = NULL; u64 ow_inode = 0; u64 ow_gen; @@ -4184,7 +4183,6 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * which is always '..' */ BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); - INIT_LIST_HEAD(&check_dirs); valid_path = fs_path_alloc(); if (!valid_path) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8ad7e7e38d18..b9229c08164f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4841,13 +4841,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *tmp; struct extent_map *em, *n; - struct list_head extents; + LIST_HEAD(extents); struct extent_map_tree *tree = &inode->extent_tree; int ret = 0; int num = 0; - INIT_LIST_HEAD(&extents); - write_lock(&tree->lock); list_for_each_entry_safe(em, n, &tree->modified_extents, list) { -- cgit v1.2.3 From cd361199ff23776481c37023a55d855d5ad5c0f5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 31 Jul 2023 16:28:43 -0400 Subject: btrfs: wait on uncached block groups on every allocation loop My initial fix for the generic/475 hangs was related to metadata, but our CI testing uncovered another case where we hang for similar reasons. We again have a task with a plug that is holding an outstanding request that is keeping the dm device from finishing its suspend, and that task is stuck in the allocator. This time it is stuck trying to allocate data, but we do not have a block group that matches the size class.
The larger loop in the allocator looks like this (simplified, of course):

  find_free_extent
    for_each_block_group {
      ffe_ctl->cached == btrfs_block_group_cache_done(bg)
      if (!ffe_ctl->cached)
        ffe_ctl->have_caching_bg = true;
      do_allocation()
        btrfs_wait_block_group_cache_progress();
    }
    if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
      go search again;

In my earlier fix we were trying to allocate from the block group, but we weren't waiting for the progress because we were only waiting for the free space to be >= the amount of free space we wanted. My fix made it so we waited for forward progress to be made as well, so we would be sure to wait. This time however we did not have a block group that matched our size class, so what was happening was this:

  find_free_extent
    for_each_block_group {
      ffe_ctl->cached == btrfs_block_group_cache_done(bg)
      if (!ffe_ctl->cached)
        ffe_ctl->have_caching_bg = true;
      if (size_class_doesn't_match())
        goto loop;
      do_allocation()
        btrfs_wait_block_group_cache_progress();
loop:
      release_block_group(block_group);
    }
    if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
      go search again;

The size_class_doesn't_match() part was true, so we'd just skip this block group and never wait for caching, and then because we found a caching block group we'd just go back and do the loop again. We never sleep and thus never flush the plug and we have the same deadlock. Fix the logic for waiting on the block group caching to instead do it unconditionally when we goto loop. This takes the logic out of the allocation step, so now the loop looks more like this:

  find_free_extent
    for_each_block_group {
      ffe_ctl->cached == btrfs_block_group_cache_done(bg)
      if (!ffe_ctl->cached)
        ffe_ctl->have_caching_bg = true;
      if (size_class_doesn't_match())
        goto loop;
      do_allocation()
        btrfs_wait_block_group_cache_progress();
loop:
      if (loop > LOOP_CACHING_NOWAIT && !ffe_ctl->retry_uncached &&
          !ffe_ctl->cached) {
        ffe_ctl->retry_uncached = true;
        btrfs_wait_block_group_cache_progress();
      }
      release_block_group(block_group);
    }
    if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
      go search again;

This simplifies the logic a lot, and makes sure that if we're hitting uncached block groups we're always waiting on them at some point. I ran this through 100 iterations of generic/475, as this particular case was harder to hit than the previous one. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 61 +++++++++++++++----------------------------- fs/btrfs/extent-tree.h | 13 ++++------- 2 files changed, 22 insertions(+), 52 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3c62a7eaf24b..ba22790f95f7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3481,7 +3481,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, * Helper function for find_free_extent(). * * Return -ENOENT to inform caller that we need fallback to unclustered mode. - * Return -EAGAIN to inform caller that we need to re-search this block group * Return >0 to inform caller that we find nothing * Return 0 means we have found a location and set ffe_ctl->found_offset.
*/ @@ -3562,14 +3561,6 @@ refill_cluster: trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); return 0; } - } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && - !ffe_ctl->retry_clustered) { - spin_unlock(&last_ptr->refill_lock); - - ffe_ctl->retry_clustered = true; - btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + - ffe_ctl->empty_cluster + ffe_ctl->empty_size); - return -EAGAIN; } /* * At this point we either didn't find a cluster or we weren't able to @@ -3584,7 +3575,6 @@ refill_cluster: /* * Return >0 to inform caller that we find nothing * Return 0 when we found an free extent and set ffe_ctrl->found_offset - * Return -EAGAIN to inform caller that we need to re-search this block group */ static int find_free_extent_unclustered(struct btrfs_block_group *bg, struct find_free_extent_ctl *ffe_ctl) @@ -3622,25 +3612,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg, offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, ffe_ctl->num_bytes, ffe_ctl->empty_size, &ffe_ctl->max_extent_size); - - /* - * If we didn't find a chunk, and we haven't failed on this block group - * before, and this block group is in the middle of caching and we are - * ok with waiting, then go ahead and wait for progress to be made, and - * set @retry_unclustered to true. - * - * If @retry_unclustered is true then we've already waited on this - * block group once and should move on to the next block group. - */ - if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && - ffe_ctl->loop > LOOP_CACHING_NOWAIT) { - btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + - ffe_ctl->empty_size); - ffe_ctl->retry_unclustered = true; - return -EAGAIN; - } else if (!offset) { + if (!offset) return 1; - } ffe_ctl->found_offset = offset; return 0; } @@ -3654,7 +3627,7 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, /* We want to try and use the cluster allocator, so lets look there */ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); - if (ret >= 0 || ret == -EAGAIN) + if (ret >= 0) return ret; /* ret == -ENOENT case falls through */ } @@ -3872,8 +3845,7 @@ static void release_block_group(struct btrfs_block_group *block_group, { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: - ffe_ctl->retry_clustered = false; - ffe_ctl->retry_unclustered = false; + ffe_ctl->retry_uncached = false; break; case BTRFS_EXTENT_ALLOC_ZONED: /* Nothing to do */ @@ -4220,9 +4192,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl->orig_have_caching_bg = false; ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); ffe_ctl->loop = 0; - /* For clustered allocation */ - ffe_ctl->retry_clustered = false; - ffe_ctl->retry_unclustered = false; + ffe_ctl->retry_uncached = false; ffe_ctl->cached = 0; ffe_ctl->max_extent_size = 0; ffe_ctl->total_free_space = 0; @@ -4373,16 +4343,12 @@ have_block_group: bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); - if (ret == 0) { - if (bg_ret && bg_ret != block_group) { - btrfs_release_block_group(block_group, - ffe_ctl->delalloc); - block_group = bg_ret; - } - } else if (ret == -EAGAIN) { - goto have_block_group; - } else if (ret > 0) { + if (ret > 0) goto loop; + + if (bg_ret && bg_ret != block_group) { + btrfs_release_block_group(block_group, ffe_ctl->delalloc); + block_group = bg_ret; } /* Checks */ @@ -4423,6 +4389,15 @@ have_block_group: btrfs_release_block_group(block_group, 
ffe_ctl->delalloc); break; loop: + if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && + !ffe_ctl->retry_uncached) { + ffe_ctl->retry_uncached = true; + btrfs_wait_block_group_cache_progress(block_group, + ffe_ctl->num_bytes + + ffe_ctl->empty_cluster + + ffe_ctl->empty_size); + goto have_block_group; + } release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); cond_resched(); } diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index b9e148adcd28..88c249c37516 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -48,16 +48,11 @@ struct find_free_extent_ctl { int loop; /* - * Whether we're refilling a cluster, if true we need to re-search - * current block group but don't try to refill the cluster again. + * Set to true if we're retrying the allocation on this block group + * after waiting for caching progress, this is so that we retry only + * once before moving on to another block group. */ - bool retry_clustered; - - /* - * Whether we're updating free space cache, if true we need to re-search - * current block group but don't try updating free space cache again. - */ - bool retry_unclustered; + bool retry_uncached; /* If current block group is cached */ int cached; -- cgit v1.2.3 From e7f1326cc24e22b38afc3acd328480a1183f9e79 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 31 Jul 2023 11:13:00 -0400 Subject: btrfs: set page extent mapped after read_folio in relocate_one_page One of the CI runs triggered the following panic assertion failed: PagePrivate(page) && page->private, in fs/btrfs/subpage.c:229 ------------[ cut here ]------------ kernel BUG at fs/btrfs/subpage.c:229! Internal error: Oops - BUG: 00000000f2000800 [#1] SMP CPU: 0 PID: 923660 Comm: btrfs Not tainted 6.5.0-rc3+ #1 pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) pc : btrfs_subpage_assert+0xbc/0xf0 lr : btrfs_subpage_assert+0xbc/0xf0 sp : ffff800093213720 x29: ffff800093213720 x28: ffff8000932138b4 x27: 000000000c280000 x26: 00000001b5d00000 x25: 000000000c281000 x24: 000000000c281fff x23: 0000000000001000 x22: 0000000000000000 x21: ffffff42b95bf880 x20: ffff42b9528e0000 x19: 0000000000001000 x18: ffffffffffffffff x17: 667274622f736620 x16: 6e69202c65746176 x15: 0000000000000028 x14: 0000000000000003 x13: 00000000002672d7 x12: 0000000000000000 x11: ffffcd3f0ccd9204 x10: ffffcd3f0554ae50 x9 : ffffcd3f0379528c x8 : ffff800093213428 x7 : 0000000000000000 x6 : ffffcd3f091771e8 x5 : ffff42b97f333948 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 0000000000000000 x1 : ffff42b9556cde80 x0 : 000000000000004f Call trace: btrfs_subpage_assert+0xbc/0xf0 btrfs_subpage_set_dirty+0x38/0xa0 btrfs_page_set_dirty+0x58/0x88 relocate_one_page+0x204/0x5f0 relocate_file_extent_cluster+0x11c/0x180 relocate_data_extent+0xd0/0xf8 relocate_block_group+0x3d0/0x4e8 btrfs_relocate_block_group+0x2d8/0x490 btrfs_relocate_chunk+0x54/0x1a8 btrfs_balance+0x7f4/0x1150 btrfs_ioctl+0x10f0/0x20b8 __arm64_sys_ioctl+0x120/0x11d8 invoke_syscall.constprop.0+0x80/0xd8 do_el0_svc+0x6c/0x158 el0_svc+0x50/0x1b0 el0t_64_sync_handler+0x120/0x130 el0t_64_sync+0x194/0x198 Code: 91098021 b0007fa0 91346000 97e9c6d2 (d4210000) This is the same problem outlined in 17b17fcd6d44 ("btrfs: set_page_extent_mapped after read_folio in btrfs_cont_expand") , and the fix is the same. 
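Condensed, the corrected ordering looks roughly like this (a sketch of relocate_one_page() with error checks elided; the complete change is in the diff below):

	if (!PageUptodate(page)) {
		btrfs_read_folio(NULL, page_folio(page));
		lock_page(page);	/* the read path unlocked the page */
		/* ... re-check PageUptodate() and bail out on failure ... */
	}

	/*
	 * Attach the extent-mapped private data only after the read, so the
	 * subpage bookkeeping cannot be lost while the page was unlocked.
	 */
	ret = set_page_extent_mapped(page);
	if (ret < 0)
		goto release_page;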
I originally looked for the same pattern elsewhere in our code, but mistakenly skipped over this code because I saw the page cache readahead before we set_page_extent_mapped, not realizing that this was only in the !page case and that we can still end up with a !uptodate page and then do the btrfs_read_folio further down. The fix here is the same as in the above mentioned patch: move the set_page_extent_mapped() call to after the btrfs_read_folio() block to make sure that we have the subpage blocksize stuff set up properly before using the page. CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8ff0e9dcf85c..7408e48d45bc 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3006,9 +3006,6 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, if (!page) return -ENOMEM; } - ret = set_page_extent_mapped(page); - if (ret < 0) - goto release_page; if (PageReadahead(page)) page_cache_async_readahead(inode->i_mapping, ra, NULL, @@ -3024,6 +3021,15 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, } } + /* + * We could have lost page private when we dropped the lock to read the + * page above, make sure we set_page_extent_mapped here so we have any + * of the subpage blocksize stuff we need in place. + */ + ret = set_page_extent_mapped(page); + if (ret < 0) + goto release_page; + page_start = page_offset(page); page_end = page_start + PAGE_SIZE - 1; -- cgit v1.2.3 From 332581bde2a419d5f12a93a1cdc2856af649a3cc Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 21 Jul 2023 16:42:14 +0900 Subject: btrfs: zoned: do not zone finish data relocation block group When multiple writes happen at once, we may need to sacrifice a currently active block group to be zone finished for a new allocation. We choose a block group with the least free space left, and zone finish it. To do the finishing, we need to send IOs for the already allocated region and wait for them and for any on-going IOs. Otherwise, these IOs fail because the zone is already finished at the time the IO reaches the device. However, if a block group dedicated to the data relocation is zone finished, there is a chance that we finish it before an ongoing write IO reaches the device. That is because there is a timing gap between when an allocation is done (block_group->reservations == 0, as the pre-allocation is done) and when an ordered extent is created as the relocation IO starts. Thus, if we finish the zone between them, we can fail the IOs. We cannot simply use "fs_info->data_reloc_bg == block_group->start" to avoid the zone finishing, because the data_reloc_bg may have already switched to a new block group while there are still ongoing write IOs to the old data_reloc_bg. So, this patch reworks the BLOCK_GROUP_FLAG_ZONED_DATA_RELOC bit to indicate there is a data relocation allocation and/or ongoing write to the block group. The bit is set on allocation and cleared in the end_io function of the last IO for the currently allocated region. Changing the timing of the bit setting also solves the issue of the bit being left set even after there is no IO going on. With the current code, if the data_reloc_bg switches after the last IO to the current data_reloc_bg, the bit is set at that point and nothing ever clears it. As a result, that block group is kept unallocatable for anything.
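The reworked lifecycle of the bit is roughly the following (a simplified sketch with locking elided; the real code is in do_allocation_zoned() and btrfs_zoned_release_data_reloc_bg() in the diff below):

	/* Allocation for data relocation marks the block group. */
	set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);

	/* The end_io of the last IO for the allocated region clears it. */
	if (block_group->start + block_group->alloc_offset == logical + length)
		clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
			  &block_group->runtime_flags);

	/* Zone finishing backs off as long as the bit is set. */
	if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
		return -EAGAIN;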
Fixes: 343d8a30851c ("btrfs: zoned: prevent allocation from previous data relocation BG") Fixes: 74e91b12b115 ("btrfs: zoned: zone finish unused block group") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 43 +++++++++++++++++++++++-------------------- fs/btrfs/zoned.c | 16 +++++++++++++--- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ba22790f95f7..e0508114f0bd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3738,7 +3738,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, fs_info->data_reloc_bg == 0); if (block_group->ro || - test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { + (!ffe_ctl->for_data_reloc && + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) { ret = 1; goto out; } @@ -3781,8 +3782,26 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, if (ffe_ctl->for_treelog && !fs_info->treelog_bg) fs_info->treelog_bg = block_group->start; - if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg) - fs_info->data_reloc_bg = block_group->start; + if (ffe_ctl->for_data_reloc) { + if (!fs_info->data_reloc_bg) + fs_info->data_reloc_bg = block_group->start; + /* + * Do not allow allocations from this block group, unless it is + * for data relocation. Compared to increasing the ->ro, setting + * the ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable an allocation to avoid an allocation of + * regular (non-relocation data) extent. With mix of relocation + * extents and regular extents, we can dispatch WRITE commands + * (for relocation extents) and ZONE APPEND commands (for + * regular extents) at the same time to the same zone, which + * easily break the write pointer. + * + * Also, this flag avoids this block group to be zone finished. + */ + set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); + } ffe_ctl->found_offset = start + block_group->alloc_offset; block_group->alloc_offset += num_bytes; @@ -3800,24 +3819,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; - if (ret && ffe_ctl->for_data_reloc && - fs_info->data_reloc_bg == block_group->start) { - /* - * Do not allow further allocations from this block group. - * Compared to increasing the ->ro, setting the - * ->zoned_data_reloc_ongoing flag still allows nocow - * writers to come in. See btrfs_inc_nocow_writers(). - * - * We need to disable an allocation to avoid an allocation of - * regular (non-relocation data) extent. With mix of relocation - * extents and regular extents, we can dispatch WRITE commands - * (for relocation extents) and ZONE APPEND commands (for - * regular extents) at the same time to the same zone, which - * easily break the write pointer. 
- */ - set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); + if (ret && ffe_ctl->for_data_reloc) fs_info->data_reloc_bg = 0; - } spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 52f49b0d2dee..a2f8e7440f8c 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2091,6 +2091,10 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ * and block_group->meta_write_pointer for metadata. */ if (!fully_written) { + if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + return -EAGAIN; + } spin_unlock(&block_group->lock); ret = btrfs_inc_block_group_ro(block_group, false); @@ -2119,7 +2123,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ return 0; } - if (block_group->reserved) { + if (block_group->reserved || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + &block_group->runtime_flags)) { spin_unlock(&block_group->lock); btrfs_dec_block_group_ro(block_group); return -EAGAIN; } @@ -2362,7 +2368,10 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica /* All relocation extents are written. */ if (block_group->start + block_group->alloc_offset == logical + length) { - /* Now, release this block group for further allocations. */ + /* + * Now, release this block group for further allocations and + * zone finish. + */ clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); } @@ -2386,7 +2395,8 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) spin_lock(&block_group->lock); if (block_group->reserved || block_group->alloc_offset == 0 || - (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { + (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); continue; } -- cgit v1.2.3 From 953fa5ced510c4937075002bbfc6405cd500efef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 07:22:38 -0700 Subject: btrfs: fix error handling when in a COW window in run_delalloc_nocow When run_delalloc_nocow has cow_start set to a value other than (u64)-1, it has delayed COW writeback pending behind cur_offset. When an error occurs in such a window, the range going back to cow_start and not just cur_offset needs to be unlocked, but only two error cases handle this correctly. Move the code that unlocks the COW range to the common error handling label and document the logic. To make things even more complicated, cow_file_range as called by fallback_to_cow will unlock the range it is operating on when it fails as well, so we need to reset cow_start right after calling fallback_to_cow instead of only when it succeeds.
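Condensed, the fixed flow is roughly the following (a sketch; clear_bits and page_ops stand in for the real flag sets, and the main NOCOW loop is elided, see the diff below):

	ret = fallback_to_cow(inode, locked_page, cow_start, found_key.offset - 1);
	/*
	 * fallback_to_cow() unlocked its range whether it succeeded or not,
	 * so close the COW window before checking the return value.
	 */
	cow_start = (u64)-1;
	if (ret)
		goto error;

	/* ... the rest of the NOCOW loop ... */

error:
	/*
	 * If a COW window is still outstanding, rewind cur_offset to its
	 * start so the whole window is unlocked, not just the tail.
	 */
	if (cow_start != (u64)-1)
		cur_offset = cow_start;
	if (ret && cur_offset < end)
		extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page,
					     clear_bits, page_ops);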
Reviewed-by: Boris Burkov Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f2ab27084cc4..0d973a959559 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2029,11 +2029,8 @@ next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; + if (ret < 0) goto error; - } if (ret > 0) break; leaf = path->nodes[0]; @@ -2096,13 +2093,10 @@ next_slot: nocow_args.start = cur_offset; ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; + if (ret < 0) goto error; - } else if (ret == 0) { + if (ret == 0) goto out_check; - } ret = 0; bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); @@ -2133,9 +2127,9 @@ out_check: if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_page, cow_start, found_key.offset - 1); + cow_start = (u64)-1; if (ret) goto error; - cow_start = (u64)-1; } nocow_end = cur_offset + nocow_args.num_bytes - 1; @@ -2214,6 +2208,7 @@ out_check: if (cow_start != (u64)-1) { cur_offset = end; ret = fallback_to_cow(inode, locked_page, cow_start, end); + cow_start = (u64)-1; if (ret) goto error; } @@ -2222,6 +2217,13 @@ error: if (nocow) btrfs_dec_nocow_writers(bg); + /* + * If an error happened while a COW region is outstanding, cur_offset + * needs to be reset to cow_start to ensure the COW region is unlocked + * as well. + */ + if (cow_start != (u64)-1) + cur_offset = cow_start; if (ret && cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, EXTENT_LOCKED | -- cgit v1.2.3 From 18f62b86c4ea943256f8962ecd0cb013c97b4e0f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 07:22:39 -0700 Subject: btrfs: cleanup the COW fallback logic in run_delalloc_nocow Use the block group pointer used to track the outstanding NOCOW writes as a boolean to remove the duplicate nocow variable, and keep it contained in the main loop to simplify the logic. 
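Reduced to its core, the pattern after this change is (sketch only; the must_cow label and the ordered extent creation are in the diff below):

	struct btrfs_block_group *nocow_bg = NULL;

	/* The non-NULL pointer now doubles as the old "nocow" boolean. */
	nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
	if (!nocow_bg) {
		/* NOCOW is not possible, record the start of the COW range. */
		if (cow_start == (u64)-1)
			cow_start = cur_offset;
		cur_offset = extent_end;
		continue;
	}

	/* ... create the NOCOW/PREALLOC ordered extent ... */

	btrfs_dec_nocow_writers(nocow_bg);	/* pointer is known valid here */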
Reviewed-by: Boris Burkov Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/inode.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0d973a959559..1fb1e4ac4a8c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1974,8 +1974,6 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, int ret; bool check_prev = true; u64 ino = btrfs_ino(inode); - struct btrfs_block_group *bg; - bool nocow = false; struct can_nocow_file_extent_args nocow_args = { 0 }; path = btrfs_alloc_path(); @@ -1993,6 +1991,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, nocow_args.writeback_path = true; while (1) { + struct btrfs_block_group *nocow_bg = NULL; struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; @@ -2003,8 +2002,6 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, int extent_type; bool is_prealloc; - nocow = false; - ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); if (ret < 0) @@ -2063,7 +2060,7 @@ next_slot: if (found_key.offset > cur_offset) { extent_end = found_key.offset; extent_type = 0; - goto out_check; + goto must_cow; } /* @@ -2096,18 +2093,19 @@ next_slot: if (ret < 0) goto error; if (ret == 0) - goto out_check; + goto must_cow; ret = 0; - bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); - if (bg) - nocow = true; -out_check: - /* - * If nocow is false then record the beginning of the range - * that needs to be COWed - */ - if (!nocow) { + nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + if (!nocow_bg) { +must_cow: + /* + * If we can't perform NOCOW writeback for the range, + * then record the beginning of the range that needs to + * be COWed. It will be written out before the next + * NOCOW range if we find one, or when exiting this + * loop. + */ if (cow_start == (u64)-1) cow_start = cur_offset; cur_offset = extent_end; @@ -2128,8 +2126,10 @@ out_check: ret = fallback_to_cow(inode, locked_page, cow_start, found_key.offset - 1); cow_start = (u64)-1; - if (ret) + if (ret) { + btrfs_dec_nocow_writers(nocow_bg); goto error; + } } nocow_end = cur_offset + nocow_args.num_bytes - 1; @@ -2146,6 +2146,7 @@ out_check: ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { + btrfs_dec_nocow_writers(nocow_bg); ret = PTR_ERR(em); goto error; } @@ -2159,6 +2160,7 @@ out_check: ? 
(1 << BTRFS_ORDERED_PREALLOC) : (1 << BTRFS_ORDERED_NOCOW), BTRFS_COMPRESS_NONE); + btrfs_dec_nocow_writers(nocow_bg); if (IS_ERR(ordered)) { if (is_prealloc) { btrfs_drop_extent_map_range(inode, cur_offset, @@ -2168,11 +2170,6 @@ out_check: goto error; } - if (nocow) { - btrfs_dec_nocow_writers(bg); - nocow = false; - } - if (btrfs_is_data_reloc_root(root)) /* * Error handled later, as we must prevent @@ -2213,10 +2210,10 @@ out_check: goto error; } -error: - if (nocow) - btrfs_dec_nocow_writers(bg); + btrfs_free_path(path); + return 0; +error: /* * If an error happened while a COW region is outstanding, cur_offset * needs to be reset to cow_start to ensure the COW region is unlocked @@ -2224,7 +2221,7 @@ error: */ if (cow_start != (u64)-1) cur_offset = cow_start; - if (ret && cur_offset < end) + if (cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | -- cgit v1.2.3 From 38dc88890de6cae3db5a33ece287e56fc9cc4ce6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 07:22:40 -0700 Subject: btrfs: consolidate the error handling in run_delalloc_nocow Share the calls to extent_clear_unlock_delalloc for btrfs_path allocation failure handling and the normal exit path. This relies on btrfs_free_path ignoring a NULL pointer, and the initialization of cur_offset to start at the beginning of the function. Reviewed-by: Boris Burkov Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1fb1e4ac4a8c..8d0dd266dd1b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1978,13 +1978,8 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, path = btrfs_alloc_path(); if (!path) { - extent_clear_unlock_delalloc(inode, start, end, locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, PAGE_UNLOCK | - PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK); - return -ENOMEM; + ret = -ENOMEM; + goto error; } nocow_args.end = end; -- cgit v1.2.3 From 76c5126e76696e61ced5d6e34441f857733741bf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 07:22:41 -0700 Subject: btrfs: move the !zoned assert into run_delalloc_cow Having the assert in the actual helper documents the pre-conditions much better than having it in the caller, so move it. Reviewed-by: Boris Burkov Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8d0dd266dd1b..ca0f4781b0e5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1976,6 +1976,13 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, u64 ino = btrfs_ino(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; + /* + * Normally on a zoned device we're only doing COW writes, but in case + * of relocation on a zoned filesystem serializes I/O so that we're only + * writing sequentially and can end up here as well. 
+ */ + ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto error; } @@ -2257,14 +2264,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page start >= page_offset(locked_page) + PAGE_SIZE)); if (should_nocow(inode, start, end)) { - /* - * Normally on a zoned device we're only doing COW writes, but - * in case of relocation on a zoned filesystem we have taken - * precaution, that we're only writing sequentially. It's safe - * to use run_delalloc_nocow() here, like for regular - * preallocated inodes. - */ - ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); ret = run_delalloc_nocow(inode, locked_page, start, end); goto out; } -- cgit v1.2.3 From 7f72f50547b7af4ddf985b07fc56600a4deba281 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Aug 2023 19:02:28 +0800 Subject: btrfs: output extra debug info if we failed to find an inline backref [BUG] Syzbot reported several warnings triggered inside lookup_inline_extent_backref(). [CAUSE] As usual, the reproducer doesn't reliably trigger locally here, but at least we know the WARN_ON() is triggered when an inline backref cannot be found, and it can only be triggered when @insert is true. (I.e. inserting a new inline backref, which means the backref should already exist) [ENHANCEMENT] After the WARN_ON(), dump all the parameters and the extent tree leaf to help debug. Link: https://syzkaller.appspot.com/bug?extid=d6f9ff86c1d804ba2bc6 Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e0508114f0bd..105f528382e7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -848,6 +848,11 @@ again: err = -ENOENT; goto out; } else if (WARN_ON(ret)) { + btrfs_print_leaf(path->nodes[0]); + btrfs_err(fs_info, +"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu", + bytenr, num_bytes, parent, root_objectid, owner, + offset); err = -EIO; goto out; } -- cgit v1.2.3 From 182741d287fb1ea870ee6ef45aa1915a0b031233 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 11 Aug 2023 19:02:11 +0800 Subject: btrfs: remove v0 extent handling The v0 extent item has been deprecated for a long time, and we don't have any reports from the community either. So it's time to remove the v0 extent specific error handling, and just treat them as regular extent tree corruption. This patch removes the btrfs_print_v0_err() helper, and enhances the involved error handling to treat them just as any extent tree corruption. No reports regarding v0 extents have been seen since the graceful handling was added in 2018. This involves: - btrfs_backref_add_tree_node() This change is a little tricky: the new code only handles BTRFS_TREE_BLOCK_REF_KEY and BTRFS_SHARED_BLOCK_REF_KEY. But this is safe, as we have rejected any unknown inline refs through btrfs_get_extent_inline_ref_type(). For keyed backrefs, we're safe to skip anything we don't know (that's if it can pass tree-checker in the first place). - btrfs_lookup_extent_info() - lookup_inline_extent_backref() - run_delayed_extent_op() - __btrfs_free_extent() - add_tree_block() Regular error handling of unexpected extent tree item, and abort transaction (if we have a trans handle). - remove_extent_data_ref() It's pretty much the same as the regular rejection of unknown backref key.
But for this particular case, we can also remove a BUG_ON(). - extent_data_ref_count() We can remove the BTRFS_EXTENT_REF_V0_KEY BUG_ON(), as it would be rejected by the only caller. - btrfs_print_leaf() Remove the handling for BTRFS_EXTENT_REF_V0_KEY. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 29 ++++++++++++----------------- fs/btrfs/extent-tree.c | 35 +++++++++++++++++++++-------------- fs/btrfs/messages.c | 6 ------ fs/btrfs/messages.h | 2 -- fs/btrfs/print-tree.c | 10 ++++------ fs/btrfs/relocation.c | 11 ++++++----- include/trace/events/btrfs.h | 1 - include/uapi/linux/btrfs_tree.h | 6 +++++- 8 files changed, 48 insertions(+), 52 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 79336fa853db..b7d54efb4728 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -3373,7 +3373,6 @@ int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, struct btrfs_key *node_key, struct btrfs_backref_node *cur) { - struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_backref_edge *edge; struct btrfs_backref_node *exist; int ret; @@ -3462,25 +3461,21 @@ int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, ret = handle_direct_tree_backref(cache, &key, cur); if (ret < 0) goto out; - continue; - } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { - ret = -EINVAL; - btrfs_print_v0_err(fs_info); - btrfs_handle_fs_error(fs_info, ret, NULL); - goto out; - } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { - continue; + } else if (key.type == BTRFS_TREE_BLOCK_REF_KEY) { + /* + * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref + * offset means the root objectid. We need to search + * the tree to get its parent bytenr. + */ + ret = handle_indirect_tree_backref(cache, path, &key, node_key, + cur); + if (ret < 0) + goto out; } - /* - * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset - * means the root objectid. We need to search the tree to get - * its parent bytenr. + * Unrecognized tree backref items (if it can pass tree-checker) + * would be ignored. 
*/ - ret = handle_indirect_tree_backref(cache, path, &key, node_key, - cur); - if (ret < 0) - goto out; } ret = 0; cur->checked = 1; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 105f528382e7..f356f08b55cb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -166,8 +166,10 @@ search_again: num_refs = btrfs_extent_refs(leaf, ei); extent_flags = btrfs_extent_flags(leaf, ei); } else { - ret = -EINVAL; - btrfs_print_v0_err(fs_info); + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); if (trans) btrfs_abort_transaction(trans, ret); else @@ -603,12 +605,12 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, ref2 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); num_refs = btrfs_shared_data_ref_count(leaf, ref2); - } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { - btrfs_print_v0_err(trans->fs_info); - btrfs_abort_transaction(trans, -EINVAL); - return -EINVAL; } else { - BUG(); + btrfs_err(trans->fs_info, + "unrecognized backref key (%llu %u %llu)", + key.objectid, key.type, key.offset); + btrfs_abort_transaction(trans, -EUCLEAN); + return -EUCLEAN; } BUG_ON(num_refs < refs_to_drop); @@ -639,7 +641,6 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); if (iref) { /* * If type is invalid, we should have bailed out earlier than @@ -860,8 +861,10 @@ again: leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { - err = -EINVAL; - btrfs_print_v0_err(fs_info); + err = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %llu expect >= %zu", + item_size, sizeof(*ei)); btrfs_abort_transaction(trans, err); goto out; } @@ -1662,8 +1665,10 @@ again: item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { - err = -EINVAL; - btrfs_print_v0_err(fs_info); + err = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); btrfs_abort_transaction(trans, err); goto out; } @@ -3091,8 +3096,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, extent_slot); if (unlikely(item_size < sizeof(*ei))) { - ret = -EINVAL; - btrfs_print_v0_err(info); + ret = -EUCLEAN; + btrfs_err(trans->fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index e3c9d2706341..7695decc7243 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -256,12 +256,6 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, } #endif -void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) -{ - btrfs_err(fs_info, -"Unsupported V0 extent filesystem detected. Aborting. 
Please re-create your filesystem with a newer kernel"); -} - #if BITS_PER_LONG == 32 void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) { diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index deedc1a168e2..1ae6f8e23e07 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -181,8 +181,6 @@ do { \ #define ASSERT(expr) (void)(expr) #endif -void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); - __printf(5, 6) __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index aa06d9ca911d..0c93439e929f 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -95,8 +95,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type int ref_index = 0; if (unlikely(item_size < sizeof(*ei))) { - btrfs_print_v0_err(eb->fs_info); - btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); + btrfs_err(eb->fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); + btrfs_handle_fs_error(eb->fs_info, -EUCLEAN, NULL); } ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); @@ -291,10 +293,6 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_file_extent_num_bytes(l, fi), btrfs_file_extent_ram_bytes(l, fi)); break; - case BTRFS_EXTENT_REF_V0_KEY: - btrfs_print_v0_err(fs_info); - btrfs_handle_fs_error(fs_info, -EINVAL, NULL); - break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7408e48d45bc..9951a0caf5bb 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3256,12 +3256,13 @@ static int add_tree_block(struct reloc_control *rc, if (type == BTRFS_TREE_BLOCK_REF_KEY) owner = btrfs_extent_inline_ref_offset(eb, iref); } - } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { - btrfs_print_v0_err(eb->fs_info); - btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); - return -EINVAL; } else { - BUG(); + btrfs_print_leaf(eb); + btrfs_err(rc->block_group->fs_info, + "unrecognized tree backref at tree block %llu slot %u", + eb->start, path->slots[0]); + btrfs_release_path(path); + return -EUCLEAN; } btrfs_release_path(path); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index a76a279c5f0f..b2db2c2f1c57 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -38,7 +38,6 @@ struct find_free_extent_ctl; __print_symbolic(type, \ { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \ { BTRFS_EXTENT_DATA_REF_KEY, "EXTENT_DATA_REF" }, \ - { BTRFS_EXTENT_REF_V0_KEY, "EXTENT_REF_V0" }, \ { BTRFS_SHARED_BLOCK_REF_KEY, "SHARED_BLOCK_REF" }, \ { BTRFS_SHARED_DATA_REF_KEY, "SHARED_DATA_REF" }) diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index ab38d0f411fa..fc3c32186d7e 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -220,7 +220,11 @@ #define BTRFS_EXTENT_DATA_REF_KEY 178 -#define BTRFS_EXTENT_REF_V0_KEY 180 +/* + * Obsolete key. Defintion removed in 6.6, value may be reused in the future. 
+ * + * #define BTRFS_EXTENT_REF_V0_KEY 180 + */ #define BTRFS_SHARED_BLOCK_REF_KEY 182 -- cgit v1.2.3 From 4844c3664a72d36cc79752cb651c78860b14c240 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 31 Jul 2023 19:16:32 +0800 Subject: btrfs: add a helper to read the superblock metadata_uuid In some cases, we need to read the FSID from the superblock when the metadata_uuid is not set, and otherwise, read the metadata_uuid. So, add a helper. Reviewed-by: Johannes Thumshirn Tested-by: Guilherme G. Piccoli Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 8 ++++++++ fs/btrfs/volumes.h | 1 + 2 files changed, 9 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bfdfecee1afe..2669206e6f15 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -681,6 +681,14 @@ error_free_page: return -EINVAL; } +u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb) +{ + bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); + + return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; +} + /* * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices * being created with a disk that has already completed its fsid change. Such diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index a59898d51e9e..2128a032c3b7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -747,5 +747,6 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); +u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb); #endif -- cgit v1.2.3 From 319baafcef2e075f515fce6a89e833f7aeae8484 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 31 Jul 2023 19:16:33 +0800 Subject: btrfs: simplify memcpy either of metadata_uuid or fsid There is a helper which provides either metadata_uuid or fsid as per METADATA_UUID flag. So use it. Reviewed-by: Johannes Thumshirn Tested-by: Guilherme G. Piccoli Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2669206e6f15..733842136163 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -841,15 +841,8 @@ static noinline struct btrfs_device *device_list_add(const char *path, found_transid > fs_devices->latest_generation) { memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); - - if (has_metadata_uuid) - memcpy(fs_devices->metadata_uuid, - disk_super->metadata_uuid, - BTRFS_FSID_SIZE); - else - memcpy(fs_devices->metadata_uuid, - disk_super->fsid, BTRFS_FSID_SIZE); - + memcpy(fs_devices->metadata_uuid, + btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE); fs_devices->fsid_change = false; } } -- cgit v1.2.3 From d167aa76dc0683828588c25767da07fb549e4f48 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 31 Jul 2023 19:16:34 +0800 Subject: btrfs: use the correct superblock to compare fsid in btrfs_validate_super The function btrfs_validate_super() should verify the fsid in the provided superblock argument. Because, all its callers expect it to do that. Such as in the following stack: write_all_supers() sb = fs_info->super_for_commit; btrfs_validate_write_super(.., sb) btrfs_validate_super(.., sb, ..) scrub_one_super() btrfs_validate_super(.., sb, ..) And check_dev_super() btrfs_validate_super(.., sb, ..) 
However, it currently verifies the fs_info::super_copy::fsid instead, which is not correct. Fix this using the correct fsid in the superblock argument. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Johannes Thumshirn Tested-by: Guilherme G. Piccoli Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 48faf9c968d0..49f405495e34 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2384,11 +2384,10 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, - BTRFS_FSID_SIZE)) { + if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", - fs_info->super_copy->fsid, fs_info->fs_devices->fsid); + sb->fsid, fs_info->fs_devices->fsid); ret = -EINVAL; } -- cgit v1.2.3 From 6bfe3959b0e7a526f5c64747801a8613f002f05a Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 31 Jul 2023 19:16:35 +0800 Subject: btrfs: compare the correct fsid/metadata_uuid in btrfs_validate_super The function btrfs_validate_super() should verify the metadata_uuid in the provided superblock argument. Because, all its callers expect it to do that. Such as in the following stacks: write_all_supers() sb = fs_info->super_for_commit; btrfs_validate_write_super(.., sb) btrfs_validate_super(.., sb, ..) scrub_one_super() btrfs_validate_super(.., sb, ..) And check_dev_super() btrfs_validate_super(.., sb, ..) However, it currently verifies the fs_info::super_copy::metadata_uuid instead. Fix this using the correct metadata_uuid in the superblock argument. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Johannes Thumshirn Tested-by: Guilherme G. Piccoli Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 49f405495e34..32ec651c570f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2391,13 +2391,11 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (btrfs_fs_incompat(fs_info, METADATA_UUID) && - memcmp(fs_info->fs_devices->metadata_uuid, - fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) { + if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb), + BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU", - fs_info->super_copy->metadata_uuid, - fs_info->fs_devices->metadata_uuid); + btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid); ret = -EINVAL; } -- cgit v1.2.3 From 67bc5ad04bea699371a2d52e89690e54963e1397 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 31 Jul 2023 19:16:36 +0800 Subject: btrfs: drop redundant check to use fs_devices::metadata_uuid fs_devices::metadata_uuid value is already updated based on the super_block::METADATA_UUID flag for either fsid or metadata_uuid as appropriate. So, fs_devices::metadata_uuid can be used directly. Reviewed-by: Johannes Thumshirn Tested-by: Guilherme G. 
Piccoli Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 32ec651c570f..0a96ea8c1d3a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -313,21 +313,16 @@ static bool check_tree_block_fsid(struct extent_buffer *eb) struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; u8 fsid[BTRFS_FSID_SIZE]; - u8 *metadata_uuid; read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE); + /* - * Checking the incompat flag is only valid for the current fs. For - * seed devices it's forbidden to have their uuid changed so reading - * ->fsid in this case is fine + * alloc_fs_devices() copies the fsid into metadata_uuid if the + * metadata_uuid is unset in the superblock, including for a seed device. + * So, we can use fs_devices->metadata_uuid. */ - if (btrfs_fs_incompat(fs_info, METADATA_UUID)) - metadata_uuid = fs_devices->metadata_uuid; - else - metadata_uuid = fs_devices->fsid; - - if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) + if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0) return false; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) -- cgit v1.2.3 From 6b604c9a0cf1af7ed5c7b4012cb2fcd9ee988833 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 13 Aug 2023 16:03:28 +0100 Subject: btrfs: remove pointless empty list check when reading delayed dir indexes At btrfs_readdir_delayed_dir_index(), called when reading a directory, we have this check for an empty list to return immediately, but it's not needed since list_for_each_entry_safe(), called immediately after, is prepared to deal with an empty list: it simply does nothing. So remove the empty list check. Besides shorter source code, it also slightly reduces the binary text size:

Before this change:

  $ size fs/btrfs/btrfs.ko
     text     data     bss      dec     hex  filename
  1609408   167269   16864  1793541  1b5e05  fs/btrfs/btrfs.ko

After this change:

  $ size fs/btrfs/btrfs.ko
     text     data     bss      dec     hex  filename
  1609392   167269   16864  1793525  1b5df5  fs/btrfs/btrfs.ko

Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6d51db066503..85dcf0024137 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1736,9 +1736,6 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, int over = 0; unsigned char d_type; - if (list_empty(ins_list)) - return 0; - /* * Changing the data of the delayed item is impossible. So * we needn't lock them. And we have held i_mutex of the -- cgit v1.2.3 From 94628ad9440800ca0bd2436203a27c7576f769b7 Mon Sep 17 00:00:00 2001 From: Lee Trager Date: Thu, 10 Aug 2023 18:44:35 -0700 Subject: btrfs: copy dir permission and time when creating a stub subvolume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit btrfs supports creating nested subvolumes; however, snapshots are not recursive. When a snapshot is taken of a volume which contains a subvolume, the subvolume is replaced with a stub subvolume which has the same name and uses inode number 2 [1]. The stub subvolume kept the directory name but did not set its time or permissions.
This resulted in all time information being the current time and ownership defaulting to root. When subvolumes and snapshots are created using unshare, this results in a snapshot directory that the user created but has no permissions for. Test case: [vmuser@archvm ~]$ sudo -i [root@archvm ~]# mkdir -p /mnt/btrfs/test [root@archvm ~]# chown vmuser:users /mnt/btrfs/test/ [root@archvm ~]# exit logout [vmuser@archvm ~]$ cd /mnt/btrfs/test [vmuser@archvm test]$ unshare --user --keep-caps --map-auto --map-root-user [root@archvm test]# btrfs subvolume create subvolume Create subvolume './subvolume' [root@archvm test]# btrfs subvolume create subvolume/subsubvolume Create subvolume 'subvolume/subsubvolume' [root@archvm test]# btrfs subvolume snapshot subvolume snapshot Create a snapshot of 'subvolume' in './snapshot' [root@archvm test]# exit logout [vmuser@archvm test]$ tree -ug [vmuser users ] . ├── [vmuser users ] snapshot │   └── [vmuser users ] subsubvolume <-- Without patch perm is root:root └── [vmuser users ] subvolume └── [vmuser users ] subsubvolume 5 directories, 0 files [1] https://btrfs.readthedocs.io/en/latest/btrfs-subvolume.html#nested-subvolumes Signed-off-by: Lee Trager Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ca0f4781b0e5..e211e88c6545 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5585,11 +5585,11 @@ struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root return btrfs_iget_path(s, ino, root, NULL); } -static struct inode *new_simple_dir(struct super_block *s, +static struct inode *new_simple_dir(struct inode *dir, struct btrfs_key *key, struct btrfs_root *root) { - struct inode *inode = new_inode(s); + struct inode *inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -5608,9 +5608,11 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; inode->i_mtime = current_time(inode); - inode->i_atime = inode->i_mtime; - inode->i_ctime = inode->i_mtime; + inode->i_atime = dir->i_atime; + inode->i_ctime = dir->i_ctime; BTRFS_I(inode)->i_otime = inode->i_mtime; + inode->i_uid = dir->i_uid; + inode->i_gid = dir->i_gid; return inode; } @@ -5669,7 +5671,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret != -ENOENT) inode = ERR_PTR(ret); else - inode = new_simple_dir(dir->i_sb, &location, root); + inode = new_simple_dir(dir, &location, root); } else { inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); btrfs_put_root(sub_root); -- cgit v1.2.3 From 1dc4888e725dc748b82858984f2a5bd41efc5201 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 3 Aug 2023 14:33:29 +0800 Subject: btrfs: scrub: avoid unnecessary extent tree search preparing stripes Since commit e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure"), scrub no longer re-uses the same path for the extent tree search. This can lead to unnecessary extent tree searches, especially for the new stripe-based scrub, as we have way more stripes to prepare. This patch re-introduces a shared path for the extent tree search and properly releases it when the block group is scrubbed. This change alone can improve scrub performance slightly by reducing the time spent preparing the stripes, thus improving the queue depth.
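A minimal userspace C sketch of this shared-path idea follows; all names are hypothetical and only model the pattern (a search cursor kept in the long-lived context and reused across consecutive lookups), not the btrfs API.

#include <stdio.h>

/* Models a search cursor: pos is the cached position, -1 when unset. */
struct search_path {
	long pos;
};

static int full_searches;	/* counts the modeled full searches from the tree root */

static long find_first_extent(struct search_path *path, long start)
{
	/*
	 * The cursor can only be reused when it already sits at or before
	 * start, which is the common case for a forward-progressing scrub.
	 */
	if (path->pos < 0 || path->pos > start)
		full_searches++;	/* models a full search from the root */
	path->pos = start;		/* the cursor now points at start */
	return path->pos;
}

int main(void)
{
	struct search_path shared = { .pos = -1 };
	long logical;

	for (logical = 0; logical < 4 * 65536; logical += 65536)
		find_first_extent(&shared, logical);
	printf("shared path: %d full search(es) for 4 stripes\n", full_searches);

	full_searches = 0;
	for (logical = 0; logical < 4 * 65536; logical += 65536) {
		struct search_path fresh = { .pos = -1 };	/* the old behavior */
		find_first_extent(&fresh, logical);
	}
	printf("fresh paths: %d full search(es) for 4 stripes\n", full_searches);
	return 0;
}

The same reasoning explains why the RAID56 code in the diff below keeps a local path instead of the shared one: parity stripes can revisit smaller bytenrs, so the forward-only reuse assumption does not hold there.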
Before (with regression): Device r/s rkB/s rrqm/s %rrqm r_await rareq-sz aqu-sz %util nvme0n1p3 15578.00 993616.00 5.00 0.03 0.09 63.78 1.32 100.00 After (with this patch): nvme0n1p3 15875.00 1013328.00 12.00 0.08 0.08 63.83 1.35 100.00 Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure") CC: stable@vger.kernel.org # 6.4+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 8381174bda15..6aabce141d65 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -175,6 +175,7 @@ struct scrub_ctx { struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX]; struct scrub_stripe *raid56_data_stripes; struct btrfs_fs_info *fs_info; + struct btrfs_path extent_path; int first_free; int cur_stripe; atomic_t cancel_req; @@ -339,6 +340,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->fs_info = fs_info; + sctx->extent_path.search_commit_root = 1; + sctx->extent_path.skip_locking = 1; for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { int ret; @@ -1467,6 +1470,7 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) * Return <0 for error. */ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, + struct btrfs_path *extent_path, struct btrfs_device *dev, u64 physical, int mirror_num, u64 logical_start, u32 logical_len, @@ -1476,7 +1480,6 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start); struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start); const u64 logical_end = logical_start + logical_len; - struct btrfs_path path = { 0 }; u64 cur_logical = logical_start; u64 stripe_end; u64 extent_start; @@ -1492,14 +1495,13 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, /* The range must be inside the bg. */ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); - path.search_commit_root = 1; - path.skip_locking = 1; - - ret = find_first_extent_item(extent_root, &path, logical_start, logical_len); + ret = find_first_extent_item(extent_root, extent_path, logical_start, + logical_len); /* Either error or not found. */ if (ret) goto out; - get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen); + get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, + &extent_gen); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) stripe->nr_meta_extents++; if (extent_flags & BTRFS_EXTENT_FLAG_DATA) @@ -1527,7 +1529,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, /* Fill the extent info for the remaining sectors. 
*/ while (cur_logical <= stripe_end) { - ret = find_first_extent_item(extent_root, &path, cur_logical, + ret = find_first_extent_item(extent_root, extent_path, cur_logical, stripe_end - cur_logical + 1); if (ret < 0) goto out; @@ -1535,7 +1537,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, ret = 0; break; } - get_extent_info(&path, &extent_start, &extent_len, + get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, &extent_gen); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) stripe->nr_meta_extents++; @@ -1575,7 +1577,6 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, } set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); out: - btrfs_release_path(&path); return ret; } @@ -1765,8 +1766,9 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * /* We can queue one stripe using the remaining slot. */ scrub_reset_stripe(stripe); - ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num, - logical, length, stripe); + ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, dev, + physical, mirror_num, logical, + length, stripe); /* Either >0 as no more extents or <0 for error. */ if (ret) return ret; @@ -1784,6 +1786,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_io_context *bioc = NULL; + struct btrfs_path extent_path = { 0 }; struct bio *bio; struct scrub_stripe *stripe; bool all_empty = true; @@ -1794,6 +1797,14 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ASSERT(sctx->raid56_data_stripes); + /* + * For data stripe search, we cannot re-use the same extent path, as + * the data stripe bytenr may be smaller than previous extent. Thus we + * have to use our own extent path. + */ + extent_path.search_commit_root = 1; + extent_path.skip_locking = 1; + for (int i = 0; i < data_stripes; i++) { int stripe_index; int rot; @@ -1808,7 +1819,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, scrub_reset_stripe(stripe); set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); - ret = scrub_find_fill_first_stripe(bg, + ret = scrub_find_fill_first_stripe(bg, &extent_path, map->stripes[stripe_index].dev, physical, 1, full_stripe_start + btrfs_stripe_nr_to_offset(i), BTRFS_STRIPE_LEN, stripe); @@ -1936,6 +1947,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bio_put(bio); btrfs_bio_counter_dec(fs_info); + btrfs_release_path(&extent_path); out: return ret; } @@ -2103,6 +2115,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 stripe_logical; int stop_loop = 0; + /* Extent_path should be released by now. */ + ASSERT(sctx->extent_path.nodes[0] == NULL); + scrub_blocked_if_needed(fs_info); if (sctx->is_dev_replace && @@ -2221,6 +2236,8 @@ out: ret2 = flush_scrub_stripes(sctx); if (!ret) ret = ret2; + btrfs_release_path(&sctx->extent_path); + if (sctx->raid56_data_stripes) { for (int i = 0; i < nr_data_stripes(map); i++) release_scrub_stripe(&sctx->raid56_data_stripes[i]); -- cgit v1.2.3 From 3c771c194402ffe20d4de68d9fc21e703179a9ce Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 3 Aug 2023 14:33:30 +0800 Subject: btrfs: scrub: avoid unnecessary csum tree search preparing stripes One of the bottlenecks of the new scrub code is the extra csum tree search. The old code would only do the csum tree search for each scrub bio, which can be as large as 512KiB, so it could afford to allocate a new path each time.
But the new scrub code does a csum tree search for each stripe, which is only 64KiB, so we'd better re-use the same csum path during each search. This patch introduces a per-sctx path for the csum tree search, so that we don't need to re-allocate the path every time we do a csum tree search. With this change we can further improve the queue depth and the scrub read performance: Before (with regression and cached extent tree path): Device r/s rkB/s rrqm/s %rrqm r_await rareq-sz aqu-sz %util nvme0n1p3 15875.00 1013328.00 12.00 0.08 0.08 63.83 1.35 100.00 After (with both cached extent/csum tree path): nvme0n1p3 17759.00 1133280.00 10.00 0.06 0.08 63.81 1.50 100.00 Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure") CC: stable@vger.kernel.org # 6.4+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 34 ++++++++++++++++++++++------------ fs/btrfs/file-item.h | 6 +++--- fs/btrfs/raid56.c | 4 ++-- fs/btrfs/scrub.c | 29 +++++++++++++++++++---------- 4 files changed, 46 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 696bf695d8eb..1ce5dd154499 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -597,29 +597,37 @@ fail: * Each bit represents a sector. Thus caller should ensure @csum_buf passed * in is large enough to contain all csums. */ -int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, - u8 *csum_buf, unsigned long *csum_bitmap, - bool search_commit) +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, + u64 start, u64 end, u8 *csum_buf, + unsigned long *csum_bitmap) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; - struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_csum_item *item; const u64 orig_start = start; + bool free_path = false; int ret; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + free_path = true; + } - if (search_commit) { - path->skip_locking = 1; - path->reada = READA_FORWARD; - path->search_commit_root = 1; + /* Check if we can reuse the previous path.
*/ + if (path->nodes[0]) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY && + key.offset <= start) + goto search_forward; + btrfs_release_path(path); } key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -656,6 +664,7 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, } } +search_forward: while (start <= end) { u64 csum_end; @@ -712,7 +721,8 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, } ret = 0; fail: - btrfs_free_path(path); + if (free_path) + btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 4ec669b69008..04bd2d34efb1 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -57,9 +57,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); -int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, - u8 *csum_buf, unsigned long *csum_bitmap, - bool search_commit); +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, + u64 start, u64 end, u8 *csum_buf, + unsigned long *csum_bitmap); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f97fc62fbab2..3e014b9370a3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2105,8 +2105,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) goto error; } - ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1, - rbio->csum_buf, rbio->csum_bitmap, false); + ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1, + rbio->csum_buf, rbio->csum_bitmap); if (ret < 0) goto error; if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6aabce141d65..d04fd3c6e927 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -176,6 +176,7 @@ struct scrub_ctx { struct scrub_stripe *raid56_data_stripes; struct btrfs_fs_info *fs_info; struct btrfs_path extent_path; + struct btrfs_path csum_path; int first_free; int cur_stripe; atomic_t cancel_req; @@ -342,6 +343,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( sctx->fs_info = fs_info; sctx->extent_path.search_commit_root = 1; sctx->extent_path.skip_locking = 1; + sctx->csum_path.search_commit_root = 1; + sctx->csum_path.skip_locking = 1; for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { int ret; @@ -1471,6 +1474,7 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) */ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, struct btrfs_path *extent_path, + struct btrfs_path *csum_path, struct btrfs_device *dev, u64 physical, int mirror_num, u64 logical_start, u32 logical_len, @@ -1562,9 +1566,9 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, */ ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); - ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical, - stripe_end, stripe->csums, - &csum_bitmap, true); + ret = btrfs_lookup_csums_bitmap(csum_root, csum_path, + stripe->logical, stripe_end, + stripe->csums, &csum_bitmap); if (ret < 0) goto out; if (ret > 0) @@ -1766,9 +1770,9 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * /* We can queue one 
stripe using the remaining slot. */ scrub_reset_stripe(stripe); - ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, dev, - physical, mirror_num, logical, - length, stripe); + ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, + &sctx->csum_path, dev, physical, + mirror_num, logical, length, stripe); /* Either >0 as no more extents or <0 for error. */ if (ret) return ret; @@ -1787,6 +1791,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, struct btrfs_raid_bio *rbio; struct btrfs_io_context *bioc = NULL; struct btrfs_path extent_path = { 0 }; + struct btrfs_path csum_path = { 0 }; struct bio *bio; struct scrub_stripe *stripe; bool all_empty = true; @@ -1798,12 +1803,14 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ASSERT(sctx->raid56_data_stripes); /* - * For data stripe search, we cannot re-use the same extent path, as - * the data stripe bytenr may be smaller than previous extent. Thus we - * have to use our own extent path. + * For data stripe search, we cannot re-use the same extent/csum paths, + * as the data stripe bytenr may be smaller than previous extent. Thus + * we have to use our own extent/csum paths. */ extent_path.search_commit_root = 1; extent_path.skip_locking = 1; + csum_path.search_commit_root = 1; + csum_path.skip_locking = 1; for (int i = 0; i < data_stripes; i++) { int stripe_index; @@ -1819,7 +1826,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, scrub_reset_stripe(stripe); set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); - ret = scrub_find_fill_first_stripe(bg, &extent_path, + ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path, map->stripes[stripe_index].dev, physical, 1, full_stripe_start + btrfs_stripe_nr_to_offset(i), BTRFS_STRIPE_LEN, stripe); @@ -1948,6 +1955,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, btrfs_bio_counter_dec(fs_info); btrfs_release_path(&extent_path); + btrfs_release_path(&csum_path); out: return ret; } @@ -2237,6 +2245,7 @@ out: if (!ret) ret = ret2; btrfs_release_path(&sctx->extent_path); + btrfs_release_path(&sctx->csum_path); if (sctx->raid56_data_stripes) { for (int i = 0; i < nr_data_stripes(map); i++) -- cgit v1.2.3 From ae76d8e3e1351aa1ba09cc68dab6866d356f2e17 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 3 Aug 2023 14:33:31 +0800 Subject: btrfs: scrub: fix grouping of read IO [REGRESSION] There are several regression reports about the scrub performance with the v6.4 kernel. On a PCIe 3.0 device, the old v6.3 kernel can reach 3GB/s scrub speed, but v6.4 can only reach 1GB/s, an obvious 66% performance drop. [CAUSE] Iostat shows a very different behavior between the v6.3 and v6.4 kernels: Device r/s rkB/s rrqm/s %rrqm r_await rareq-sz aqu-sz %util nvme0n1p3 9731.00 3425544.00 17237.00 63.92 2.18 352.02 21.18 100.00 nvme0n1p3 15578.00 993616.00 5.00 0.03 0.09 63.78 1.32 100.00 The upper one is v6.3 while the lower one is v6.4. There are several obvious differences: - Very few read merges This turns out to be a behavior change: we no longer do bio plug/unplug. - Very low aqu-sz This is due to the submit-and-wait behavior of flush_scrub_stripes(), and the extra extent/csum tree searches. Both behaviors are not that obvious on SATA SSDs, as SATA SSDs have NCQ to merge the reads, while SATA SSDs cannot handle high queue depth well either. [FIX] For now this patch focuses on the read speed fix. The dev-replace speed needs more work.
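A rough userspace model of the grouped submission this fix introduces is sketched below; the three constants mirror the patch that follows, everything else is a hypothetical stand-in for the kernel code.

#include <stdio.h>

/*
 * Stripes are queued into slots; every full group of 8 is submitted
 * immediately, and only when all 16 groups (128 slots, i.e. 8M per device
 * at 64K per stripe) are used does the caller flush and wait.
 */
#define STRIPES_PER_GROUP	8
#define GROUPS_PER_CTX		16
#define TOTAL_STRIPES		(STRIPES_PER_GROUP * GROUPS_PER_CTX)

static int cur_stripe;

static void submit_group(int first_slot, int nr)
{
	/*
	 * The real code wraps the reads of one group in blk_start_plug()/
	 * blk_finish_plug() so the block layer can merge the 64K requests.
	 */
	printf("submit %d stripes starting at slot %d\n", nr, first_slot);
}

static void queue_stripe(void)
{
	cur_stripe++;
	if (cur_stripe % STRIPES_PER_GROUP == 0)
		submit_group(cur_stripe - STRIPES_PER_GROUP, STRIPES_PER_GROUP);
	if (cur_stripe == TOTAL_STRIPES) {
		/* Last slot used: wait for all submitted groups to finish. */
		printf("all %d slots used, flush and wait\n", TOTAL_STRIPES);
		cur_stripe = 0;
	}
}

int main(void)
{
	for (int i = 0; i < TOTAL_STRIPES; i++)
		queue_stripe();
	return 0;
}

With 64K stripes this keeps up to 8M of reads in flight per device, and the plug around each group lets the block layer merge them into the larger requests visible in the iostat numbers below.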
For the read part, we go in two directions to fix the problems: - Re-introduce blk plug/unplug to merge read requests This is pretty simple, and the behavior is pretty easy to observe. This would enlarge the average read request size to 512K. - Introduce multi-group reads and no longer wait for each group Instead of the old behavior, which submits 8 stripes and waits for them, here we enlarge the total number of stripes to 16 * 8, which is 8M per device, the same limit as the old scrub in-flight bio size. Now every time we fill a group (8 stripes), we submit them and continue to the next stripes. Only when all 16 * 8 stripes are filled do we submit the remaining ones (the last group) and wait for all groups to finish. Then we submit the repair writes and dev-replace writes. This enlarges the queue depth and greatly improves the merge rate (thus the read block size): Before (with regression, and cached extent/csum path): Device r/s rkB/s rrqm/s %rrqm r_await rareq-sz aqu-sz %util nvme0n1p3 20666.00 1318240.00 10.00 0.05 0.08 63.79 1.63 100.00 After (with all patches applied): nvme0n1p3 5165.00 2278304.00 30557.00 85.54 0.55 441.10 2.81 100.00 i.e. 1287 to 2224 MB/s. CC: stable@vger.kernel.org # 6.4+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 96 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d04fd3c6e927..11f943e86894 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -43,9 +43,20 @@ struct scrub_ctx; /* * The following value only influences the performance. * - * This determines how many stripes would be submitted in one go, + * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP). */ -#define SCRUB_STRIPES_PER_SCTX 8 /* That would be 8 64K stripe per-device. */ +#define SCRUB_STRIPES_PER_GROUP 8 + +/* + * How many groups we have for each sctx. + * + * This would be 8M per device, the same value as the old scrub in-flight bios + * size limit. + */ +#define SCRUB_GROUPS_PER_SCTX 16 + +#define SCRUB_TOTAL_STRIPES (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP) /* * The following value times PAGE_SIZE needs to be large enough to match the @@ -172,7 +183,7 @@ struct scrub_stripe { }; struct scrub_ctx { - struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX]; + struct scrub_stripe stripes[SCRUB_TOTAL_STRIPES]; struct scrub_stripe *raid56_data_stripes; struct btrfs_fs_info *fs_info; struct btrfs_path extent_path; @@ -317,10 +328,10 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (!sctx) return; - for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) + for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) release_scrub_stripe(&sctx->stripes[i]); - kfree(sctx); + kvfree(sctx); } static void scrub_put_ctx(struct scrub_ctx *sctx) @@ -335,7 +346,10 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( struct scrub_ctx *sctx; int i; - sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); + /* Since sctx has inline 128 stripes, it can go beyond 64K easily. Use + * kvzalloc().
+ */ + sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) goto nomem; refcount_set(&sctx->refs, 1); @@ -345,7 +359,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( sctx->extent_path.skip_locking = 1; sctx->csum_path.search_commit_root = 1; sctx->csum_path.skip_locking = 1; - for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { + for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) { int ret; ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); @@ -1658,6 +1672,28 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe) return false; } +static void submit_initial_group_read(struct scrub_ctx *sctx, + unsigned int first_slot, + unsigned int nr_stripes) +{ + struct blk_plug plug; + + ASSERT(first_slot < SCRUB_TOTAL_STRIPES); + ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES); + + scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, + btrfs_stripe_nr_to_offset(nr_stripes)); + blk_start_plug(&plug); + for (int i = 0; i < nr_stripes; i++) { + struct scrub_stripe *stripe = &sctx->stripes[first_slot + i]; + + /* Those stripes should be initialized. */ + ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); + scrub_submit_initial_read(sctx, stripe); + } + blk_finish_plug(&plug); +} + static int flush_scrub_stripes(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->fs_info; @@ -1670,11 +1706,11 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); - scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, - btrfs_stripe_nr_to_offset(nr_stripes)); - for (int i = 0; i < nr_stripes; i++) { - stripe = &sctx->stripes[i]; - scrub_submit_initial_read(sctx, stripe); + /* Submit the stripes which are populated but not submitted. */ + if (nr_stripes % SCRUB_STRIPES_PER_GROUP) { + const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP); + + submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot); } for (int i = 0; i < nr_stripes; i++) { @@ -1754,21 +1790,19 @@ static void raid56_scrub_wait_endio(struct bio *bio) static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, struct btrfs_device *dev, int mirror_num, - u64 logical, u32 length, u64 physical) + u64 logical, u32 length, u64 physical, + u64 *found_logical_ret) { struct scrub_stripe *stripe; int ret; - /* No available slot, submit all stripes and wait for them. */ - if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) { - ret = flush_scrub_stripes(sctx); - if (ret < 0) - return ret; - } + /* + * There should always be one slot left, as caller filling the last + * slot should flush them all. + */ + ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES); stripe = &sctx->stripes[sctx->cur_stripe]; - - /* We can queue one stripe using the remaining slot. */ scrub_reset_stripe(stripe); ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, &sctx->csum_path, dev, physical, @@ -1776,7 +1810,20 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * /* Either >0 as no more extents or <0 for error. */ if (ret) return ret; + if (found_logical_ret) + *found_logical_ret = stripe->logical; sctx->cur_stripe++; + + /* We filled one group, submit it. */ + if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) { + const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP; + + submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP); + } + + /* Last slot used, flush them all. 
*/ + if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES) + return flush_scrub_stripes(sctx); return 0; } @@ -1985,6 +2032,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, /* Go through each extent items inside the logical range */ while (cur_logical < logical_end) { + u64 found_logical; u64 cur_physical = physical + cur_logical - logical_start; /* Canceled? */ @@ -2009,7 +2057,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, ret = queue_scrub_stripe(sctx, bg, device, mirror_num, cur_logical, logical_end - cur_logical, - cur_physical); + cur_physical, &found_logical); if (ret > 0) { /* No more extent, just update the accounting */ sctx->stat.last_physical = physical + logical_length; @@ -2019,9 +2067,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, if (ret < 0) break; - ASSERT(sctx->cur_stripe > 0); - cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical - + BTRFS_STRIPE_LEN; + cur_logical = found_logical + BTRFS_STRIPE_LEN; /* Don't hold CPU for too long time */ cond_resched(); -- cgit v1.2.3 From 39dc7bd94d084dcdcd4c88be281d7d8c9b3b0d61 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 3 Aug 2023 14:33:32 +0800 Subject: btrfs: scrub: don't go ordered workqueue for dev-replace The workqueue fs_info->scrub_worker would be an ordered workqueue if it's a device replace operation. However, the scrub relies on multiple workers to do data csum verification, and we always submit several read requests in a row. Thus there is no need to use an ordered workqueue just for dev-replace. We have extra synchronization (the main thread will always submit-and-wait for dev-replace writes) to handle it for zoned devices. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 11f943e86894..ff2d8b89057a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2777,8 +2777,7 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) /* * get a reference count on fs_info->scrub_workers. start worker if necessary */ -static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, - int is_dev_replace) +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) { struct workqueue_struct *scrub_workers = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; @@ -2788,10 +2787,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) return 0; - if (is_dev_replace) - scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags); - else - scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); + scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); if (!scrub_workers) return -ENOMEM; @@ -2843,7 +2839,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (IS_ERR(sctx)) return PTR_ERR(sctx); - ret = scrub_workers_get(fs_info, is_dev_replace); + ret = scrub_workers_get(fs_info); if (ret) goto out_free_ctx; -- cgit v1.2.3 From 4fe44f9d0472579f69b44001787f2316e4541217 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 3 Aug 2023 14:33:33 +0800 Subject: btrfs: scrub: move write back of repaired sectors to scrub_stripe_read_repair_worker() Currently the scrub_stripe_read_repair_worker() only does reads to rebuild the corrupted sectors; it doesn't do any writeback.
The design is mostly to put writeback into a more ordered manner, to co-operate with dev-replace on zoned mode, which requires every write to be submitted in bytenr order. However, the writeback of repaired sectors into the original mirror doesn't need such a strong sync requirement, as it can only happen for non-zoned devices. This patch moves the writeback of repaired sectors into scrub_stripe_read_repair_worker(), which removes two call sites for repaired sector writeback (one in flush_scrub_stripes(), one in scrub_raid56_parity_stripe()). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 72 ++++++++++++++++++++------------------------------------ 1 file changed, 25 insertions(+), 47 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ff2d8b89057a..b877203f1dc5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -990,6 +990,9 @@ skip: spin_unlock(&sctx->stat_lock); } +static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, + unsigned long write_bitmap, bool dev_replace); + /* * The main entrance for all read related scrub work, including: * @@ -998,13 +1001,16 @@ skip: * - Go through the remaining mirrors and try to read as large blocksize as * possible * - Go through all mirrors (including the failed mirror) sector-by-sector + * - Submit writeback for repaired sectors * - * Writeback does not happen here, it needs extra synchronization. + * Writeback for dev-replace does not happen here, it needs extra + * synchronization for zoned devices. */ static void scrub_stripe_read_repair_worker(struct work_struct *work) { struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); - struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct scrub_ctx *sctx = stripe->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, stripe->bg->length); int mirror; @@ -1069,7 +1075,23 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) goto out; } out: - scrub_stripe_report_errors(stripe->sctx, stripe); + /* + * Submit the repaired sectors. For zoned case, we cannot do repair + * in-place, but queue the bg to be relocated. + */ + if (btrfs_is_zoned(fs_info)) { + if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start); + } else if (!sctx->readonly) { + unsigned long repaired; + + bitmap_andnot(&repaired, &stripe->init_error_bitmap, + &stripe->error_bitmap, stripe->nr_sectors); + scrub_write_sectors(sctx, stripe, repaired, false); + wait_scrub_stripe_io(stripe); + } + + scrub_stripe_report_errors(sctx, stripe); set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); wake_up(&stripe->repair_wait); } @@ -1720,32 +1742,6 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); } - /* - * Submit the repaired sectors. For zoned case, we cannot do repair - * in-place, but queue the bg to be relocated.
- */ - if (btrfs_is_zoned(fs_info)) { - for (int i = 0; i < nr_stripes; i++) { - stripe = &sctx->stripes[i]; - - if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) { - btrfs_repair_one_zone(fs_info, - sctx->stripes[0].bg->start); - break; - } - } - } else if (!sctx->readonly) { - for (int i = 0; i < nr_stripes; i++) { - unsigned long repaired; - - stripe = &sctx->stripes[i]; - - bitmap_andnot(&repaired, &stripe->init_error_bitmap, - &stripe->error_bitmap, stripe->nr_sectors); - scrub_write_sectors(sctx, stripe, repaired, false); - } - } - /* Submit for dev-replace. */ if (sctx->is_dev_replace) { /* @@ -1918,24 +1914,6 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, /* For now, no zoned support for RAID56. */ ASSERT(!btrfs_is_zoned(sctx->fs_info)); - /* Writeback for the repaired sectors. */ - for (int i = 0; i < data_stripes; i++) { - unsigned long repaired; - - stripe = &sctx->raid56_data_stripes[i]; - - bitmap_andnot(&repaired, &stripe->init_error_bitmap, - &stripe->error_bitmap, stripe->nr_sectors); - scrub_write_sectors(sctx, stripe, repaired, false); - } - - /* Wait for the above writebacks to finish. */ - for (int i = 0; i < data_stripes; i++) { - stripe = &sctx->raid56_data_stripes[i]; - - wait_scrub_stripe_io(stripe); - } - /* * Now all data stripes are properly verified. Check if we have any * unrepaired, if so abort immediately or we could further corrupt the -- cgit v1.2.3 From 89c3760428db0342edf2747240419c9b34db3485 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 17 Aug 2023 16:57:31 -0400 Subject: btrfs: tests: add extent_map tests for dropping with odd layouts While investigating weird problems with the extent_map, I wrote a self test testing the various edge cases of btrfs_drop_extent_map_range. This can split in different ways and behaves differently in each case, so test the various edge cases to make sure everything is functioning properly. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tests/extent-map-tests.c | 218 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index ed0f36ae5346..cf85c7795686 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -6,6 +6,7 @@ #include #include "btrfs-tests.h" #include "../ctree.h" +#include "../btrfs_inode.h" #include "../volumes.h" #include "../disk-io.h" #include "../block-group.h" @@ -442,6 +443,218 @@ static int test_case_4(struct btrfs_fs_info *fs_info, return ret; } +static int add_compressed_extent(struct extent_map_tree *em_tree, + u64 start, u64 len, u64 block_start) +{ + struct extent_map *em; + int ret; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + em->start = start; + em->len = len; + em->block_start = block_start; + em->block_len = SZ_4K; + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + free_extent_map(em); + if (ret < 0) { + test_err("cannot add extent map [%llu, %llu)", start, start + len); + return ret; + } + + return 0; +} + +struct extent_range { + u64 start; + u64 len; +}; + +/* The valid states of the tree after every drop, as described below.
*/ +struct extent_range valid_ranges[][7] = { + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 3, .len = SZ_4K * 3}, /* [12k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K * 3}, /* [24k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K * 3}, /* [24k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K}, /* [24k, 28k) */ + { .start = SZ_32K, .len = SZ_4K}, /* [32k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K}, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K}, /* [24k, 28k) */ + } +}; + +static int validate_range(struct extent_map_tree *em_tree, int index) +{ + struct rb_node *n; + int i; + + for (i = 0, n = rb_first_cached(&em_tree->map); + valid_ranges[index][i].len && n; + i++, n = rb_next(n)) { + struct extent_map *entry = rb_entry(n, struct extent_map, rb_node); + + if (entry->start != valid_ranges[index][i].start) { + test_err("mapping has start %llu expected %llu", + entry->start, valid_ranges[index][i].start); + return -EINVAL; + } + + if (entry->len != valid_ranges[index][i].len) { + test_err("mapping has len %llu expected %llu", + entry->len, valid_ranges[index][i].len); + return -EINVAL; + } + } + + /* + * We exited because we don't have any more entries in the extent_map + * but we still expect more valid entries. + */ + if (valid_ranges[index][i].len) { + test_err("missing an entry"); + return -EINVAL; + } + + /* We exited the loop but still have entries in the extent map. */ + if (n) { + test_err("we have a left over entry in the extent map we didn't expect"); + return -EINVAL; + } + + return 0; +} + +/* + * Test scenario: + * + * Test the various edge cases of btrfs_drop_extent_map_range, create the + * following ranges + * + * [0, 12k)[12k, 24k)[24k, 36k)[36k, 40k)[40k,64k) + * + * And then we'll drop: + * + * [8k, 12k) - test the single front split + * [12k, 20k) - test the single back split + * [28k, 32k) - test the double split + * [32k, 64k) - test whole em dropping + * + * They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from + * merging the em's. 
+ */ +static int test_case_5(void) +{ + struct extent_map_tree *em_tree; + struct inode *inode; + u64 start, end; + int ret; + + test_msg("Running btrfs_drop_extent_map_range tests"); + + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); + return -ENOMEM; + } + + em_tree = &BTRFS_I(inode)->extent_tree; + + /* [0, 12k) */ + ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0); + if (ret) { + test_err("cannot add extent range [0, 12K)"); + goto out; + } + + /* [12k, 24k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K); + if (ret) { + test_err("cannot add extent range [12k, 24k)"); + goto out; + } + + /* [24k, 36k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K); + if (ret) { + test_err("cannot add extent range [24k, 36k)"); + goto out; + } + + /* [36k, 40k) */ + ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); + if (ret) { + test_err("cannot add extent range [36k, 40k)"); + goto out; + } + + /* [40k, 64k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K); + if (ret) { + test_err("cannot add extent range [40k, 64k)"); + goto out; + } + + /* Drop [8k, 12k) */ + start = SZ_8K; + end = (3 * SZ_4K) - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 0); + if (ret) + goto out; + + /* Drop [12k, 20k) */ + start = SZ_4K * 3; + end = SZ_16K + SZ_4K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 1); + if (ret) + goto out; + + /* Drop [28k, 32k) */ + start = SZ_32K - SZ_4K; + end = SZ_32K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 2); + if (ret) + goto out; + + /* Drop [32k, 64k) */ + start = SZ_32K; + end = SZ_64K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 3); + if (ret) + goto out; +out: + iput(inode); + return ret; +} + struct rmap_test_vector { u64 raid_type; u64 physical_start; @@ -619,6 +832,11 @@ int btrfs_test_extent_map(void) if (ret) goto out; ret = test_case_4(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_5(); + if (ret) + goto out; test_msg("running rmap tests"); for (i = 0; i < ARRAY_SIZE(rmap_tests); i++) { -- cgit v1.2.3 From f345dbdf2c9c4733d52b9389da6c02672f3b8672 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 17 Aug 2023 16:57:32 -0400 Subject: btrfs: tests: add a test for btrfs_add_extent_mapping This helper is different from the normal add_extent_mapping in that it will stuff an em into a gap that exists between overlapping em's in the tree. It appeared there was a bug so I wrote a self test to validate it did the correct thing when it worked with two side by side ems. Thankfully it is correct, but more testing is better. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tests/extent-map-tests.c | 56 +++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index cf85c7795686..31de4cd0666f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -655,6 +655,59 @@ out: return ret; } +/* + * Test the btrfs_add_extent_mapping helper which will attempt to create an em + * for areas between two existing ems.
Validate it doesn't do this when there + * are two unmerged em's side by side. + */ +static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) +{ + struct extent_map *em = NULL; + int ret; + + ret = add_compressed_extent(em_tree, 0, SZ_4K, 0); + if (ret) + goto out; + + ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0); + if (ret) + goto out; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + em->start = SZ_4K; + em->len = SZ_4K; + em->block_start = SZ_16K; + em->block_len = SZ_16K; + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, 0, SZ_8K); + write_unlock(&em_tree->lock); + + if (ret != 0) { + test_err("got an error when adding our em: %d", ret); + goto out; + } + + ret = -EINVAL; + if (em->start != 0) { + test_err("unexpected em->start at %llu, wanted 0", em->start); + goto out; + } + if (em->len != SZ_4K) { + test_err("unexpected em->len %llu, expected 4K", em->len); + goto out; + } + ret = 0; +out: + free_extent_map(em); + free_extent_map_tree(em_tree); + return ret; +} + struct rmap_test_vector { u64 raid_type; u64 physical_start; @@ -835,6 +888,9 @@ int btrfs_test_extent_map(void) if (ret) goto out; ret = test_case_5(); + if (ret) + goto out; + ret = test_case_6(fs_info, em_tree); if (ret) goto out; -- cgit v1.2.3 From 92e1229b204d66525ee95054188938142420324d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 17 Aug 2023 16:57:33 -0400 Subject: btrfs: tests: test invalid splitting when skipping pinned drop extent_map This reproduces the bug fixed by "btrfs: fix incorrect splitting in btrfs_drop_extent_map_range", where we were improperly calculating the range for the split extent. Add a test that exercises this scenario and validates that we get the correct resulting extent_maps in our tree. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tests/extent-map-tests.c | 138 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 31de4cd0666f..29bdd08b241f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -708,6 +708,141 @@ out: return ret; } +/* + * Regression test for btrfs_drop_extent_map_range. Calling with skip_pinned == + * true would mess up the start/end calculations and subsequent splits would be + * incorrect.
+ */ +static int test_case_7(void) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct inode *inode; + int ret; + + test_msg("Running btrfs_drop_extent_cache with pinned"); + + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); + return -ENOMEM; + } + + em_tree = &BTRFS_I(inode)->extent_tree; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* [0, 16K), pinned */ + em->start = 0; + em->len = SZ_16K; + em->block_start = 0; + em->block_len = SZ_4K; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("couldn't add extent map"); + goto out; + } + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* [32K, 48K), not pinned */ + em->start = SZ_32K; + em->len = SZ_16K; + em->block_start = SZ_32K; + em->block_len = SZ_16K; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("couldn't add extent map"); + goto out; + } + free_extent_map(em); + + /* + * Drop [0, 36K) This should skip the [0, 4K) extent and then split the + * [32K, 48K) extent. + */ + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (36 * SZ_1K) - 1, true); + + /* Make sure our extent maps look sane. */ + ret = -EINVAL; + + em = lookup_extent_mapping(em_tree, 0, SZ_16K); + if (!em) { + test_err("didn't find an em at 0 as expected"); + goto out; + } + + if (em->start != 0) { + test_err("em->start is %llu, expected 0", em->start); + goto out; + } + + if (em->len != SZ_16K) { + test_err("em->len is %llu, expected 16K", em->len); + goto out; + } + + free_extent_map(em); + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, SZ_16K, SZ_16K); + read_unlock(&em_tree->lock); + if (em) { + test_err("found an em when we weren't expecting one"); + goto out; + } + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, SZ_32K, SZ_16K); + read_unlock(&em_tree->lock); + if (!em) { + test_err("didn't find an em at 32K as expected"); + goto out; + } + + if (em->start != (36 * SZ_1K)) { + test_err("em->start is %llu, expected 36K", em->start); + goto out; + } + + if (em->len != (12 * SZ_1K)) { + test_err("em->len is %llu, expected 12K", em->len); + goto out; + } + + free_extent_map(em); + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1); + read_unlock(&em_tree->lock); + if (em) { + test_err("found an unexpected em above 48K"); + goto out; + } + + ret = 0; +out: + free_extent_map(em); + iput(inode); + return ret; +} + struct rmap_test_vector { u64 raid_type; u64 physical_start; @@ -891,6 +1026,9 @@ int btrfs_test_extent_map(void) if (ret) goto out; ret = test_case_6(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_7(); if (ret) goto out; -- cgit v1.2.3 From c02d35d89b317994bd713ba82e160c5e7f22d9c8 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 19 Aug 2023 01:26:07 +0900 Subject: btrfs: zoned: skip splitting and logical rewriting on pre-alloc write When doing a relocation, there is a chance that at the time of btrfs_reloc_clone_csums(), there is no checksum for the corresponding region. In this case, btrfs_finish_ordered_zoned()'s sum points to an invalid item and so ordered_extent's logical is set to some invalid value. 
Then, btrfs_lookup_block_group() in btrfs_zone_finish_endio() fails to find a block group and will hit an assert or a null pointer dereference as follows. This can be reproduced by running btrfs/028 several times (e.g. 4 to 16 times) with a null_blk setup. The device's zone size and capacity are set to 32 MB and the storage size is set to 5 GB on my setup. KASAN: null-ptr-deref in range [0x0000000000000088-0x000000000000008f] CPU: 6 PID: 3105720 Comm: kworker/u16:13 Tainted: G W 6.5.0-rc6-kts+ #1 Hardware name: Supermicro Super Server/X10SRL-F, BIOS 2.0 12/17/2015 Workqueue: btrfs-endio-write btrfs_work_helper [btrfs] RIP: 0010:btrfs_zone_finish_endio.part.0+0x34/0x160 [btrfs] Code: 41 54 49 89 fc 55 48 89 f5 53 e8 57 7d fc ff 48 8d b8 88 00 00 00 48 89 c3 48 b8 00 00 00 00 00 > 3c 02 00 0f 85 02 01 00 00 f6 83 88 00 00 00 01 0f 84 a8 00 00 RSP: 0018:ffff88833cf87b08 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000011 RSI: 0000000000000004 RDI: 0000000000000088 RBP: 0000000000000002 R08: 0000000000000001 R09: ffffed102877b827 R10: ffff888143bdc13b R11: ffff888125b1cbc0 R12: ffff888143bdc000 R13: 0000000000007000 R14: ffff888125b1cba8 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff88881e500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f3ed85223d5 CR3: 00000001519b4005 CR4: 00000000001706e0 Call Trace: ? die_addr+0x3c/0xa0 ? exc_general_protection+0x148/0x220 ? asm_exc_general_protection+0x22/0x30 ? btrfs_zone_finish_endio.part.0+0x34/0x160 [btrfs] ? btrfs_zone_finish_endio.part.0+0x19/0x160 [btrfs] btrfs_finish_one_ordered+0x7b8/0x1de0 [btrfs] ? rcu_is_watching+0x11/0xb0 ? lock_release+0x47a/0x620 ? btrfs_finish_ordered_zoned+0x59b/0x800 [btrfs] ? __pfx_btrfs_finish_one_ordered+0x10/0x10 [btrfs] ? btrfs_finish_ordered_zoned+0x358/0x800 [btrfs] ? __smp_call_single_queue+0x124/0x350 ? rcu_is_watching+0x11/0xb0 btrfs_work_helper+0x19f/0xc60 [btrfs] ? __pfx_try_to_wake_up+0x10/0x10 ? _raw_spin_unlock_irq+0x24/0x50 ? rcu_is_watching+0x11/0xb0 process_one_work+0x8c1/0x1430 ? __pfx_lock_acquire+0x10/0x10 ? __pfx_process_one_work+0x10/0x10 ? __pfx_do_raw_spin_lock+0x10/0x10 ? _raw_spin_lock_irq+0x52/0x60 worker_thread+0x100/0x12c0 ? __kthread_parkme+0xc1/0x1f0 ? __pfx_worker_thread+0x10/0x10 kthread+0x2ea/0x3c0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x30/0x70 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 On the zoned mode, writing to a pre-allocated region means a data relocation write. Such a write always uses the WRITE command, so there is no need for splitting and rewriting the logical address. Thus, we can just skip the function in that case.
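For reference, the hazard of an empty ordered->list can be shown with a small userspace model of list_first_entry(); the macros below follow the kernel definitions, while the structs are simplified stand-ins rather than the btrfs types.

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
/* list_first_entry() does not check for emptiness: on an empty list it
 * returns the head itself reinterpreted as an entry. */
#define list_first_entry(head, type, member) \
	container_of((head)->next, type, member)

struct ordered_sum {
	unsigned long logical;
	struct list_head list;
};

int main(void)
{
	struct list_head head = { &head, &head };	/* an empty list */
	struct ordered_sum *sum = list_first_entry(&head, struct ordered_sum, list);

	/* 'sum' points just before 'head' on the stack, not at a real entry,
	 * so the value printed here is garbage. */
	printf("bogus logical: 0x%lx\n", sum->logical);
	return 0;
}

This is why the fix below returns early for the pre-alloc case and asserts a non-empty list before calling list_first_entry().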
Fixes: cbfce4c7fbde ("btrfs: optimize the logical to physical mapping for zoned writes") Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index a2f8e7440f8c..09bc325d075d 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1700,10 +1700,21 @@ void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered) { struct btrfs_inode *inode = BTRFS_I(ordered->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_sum *sum = - list_first_entry(&ordered->list, typeof(*sum), list); - u64 logical = sum->logical; - u64 len = sum->len; + struct btrfs_ordered_sum *sum; + u64 logical, len; + + /* + * Write to pre-allocated region is for the data relocation, and so + * it should use WRITE operation. No split/rewrite are necessary. + */ + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) + return; + + ASSERT(!list_empty(&ordered->list)); + /* The ordered->list can be empty in the above pre-alloc case. */ + sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list); + logical = sum->logical; + len = sum->len; while (len < ordered->disk_num_bytes) { sum = list_next_entry(sum, list); -- cgit v1.2.3