From 089c8b0551f46e1c44269c10b0576e031e942acd Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 7 Oct 2020 15:20:03 +0800 Subject: btrfs: sysfs: export filesystem generation Matching with the information that's available from the ioctl FS_INFO, add generation to the per-filesystem directory /sys/fs/btrfs/UUID/generation, which could be used by scripts. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 279d9262b676..8424f5d0e5ed 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -854,6 +854,15 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, } BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); +static ssize_t btrfs_generation_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation); +} +BTRFS_ATTR(, generation, btrfs_generation_show); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -863,6 +872,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, metadata_uuid), BTRFS_ATTR_PTR(, checksum), BTRFS_ATTR_PTR(, exclusive_operation), + BTRFS_ATTR_PTR(, generation), NULL, }; -- cgit v1.2.3 From ba1bc00f358190ae1011eae82766aba5c73c9ca2 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 8 Oct 2020 15:24:27 +0300 Subject: btrfs: use helpers to convert from seconds to jiffies in transaction_kthread The kernel provides easy to understand helpers to convert from human understandable units to the kernel-friendly 'jiffies'. So let's use those to make the code easier to understand. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index af97ddcc6b3e..d052c2051632 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1780,7 +1780,7 @@ static int transaction_kthread(void *arg) do { cannot_commit = false; - delay = HZ * fs_info->commit_interval; + delay = msecs_to_jiffies(fs_info->commit_interval * 1000); mutex_lock(&fs_info->transaction_kthread_mutex); spin_lock(&fs_info->trans_lock); @@ -1795,7 +1795,7 @@ static int transaction_kthread(void *arg) (now < cur->start_time || now - cur->start_time < fs_info->commit_interval)) { spin_unlock(&fs_info->trans_lock); - delay = HZ * 5; + delay = msecs_to_jiffies(5000); goto sleep; } transid = cur->transid; -- cgit v1.2.3 From e4e428816192798c2fa473ff67d9032b94f93291 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 8 Oct 2020 15:24:28 +0300 Subject: btrfs: remove redundant time check in transaction kthread loop The value obtained from ktime_get_seconds() is guaranteed to be monotonically increasing since it's taken from CLOCK_MONOTONIC. As transaction_kthread obtains a reference to the currently running transaction under holding btrfs_fs_info::trans_lock it's guaranteed to: a) see an initialized 'cur', whose start_time is guaranteed to be smaller than 'now' or b) not obtain a 'cur' and simply go to sleep. Given this remove the unnecessary check, if it sees now < cur->start_time this would imply there are far greater problems on the machine. 
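Before moving on: the sysfs patch at the top of this group notes that the new /sys/fs/btrfs/UUID/generation file is meant to be consumed by scripts. A minimal userspace sketch of that use follows; the helper name, buffer size and the reduced error handling are illustrative only and not part of the patch:

        /* Illustrative only: read the generation value exported via sysfs. */
        #include <inttypes.h>
        #include <stdio.h>

        static int read_btrfs_generation(const char *fsid, uint64_t *gen)
        {
                char path[128];
                FILE *f;
                int ok;

                snprintf(path, sizeof(path),
                         "/sys/fs/btrfs/%s/generation", fsid);
                f = fopen(path, "r");
                if (!f)
                        return -1;
                ok = (fscanf(f, "%" SCNu64, gen) == 1);
                fclose(f);
                return ok ? 0 : -1;
        }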
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d052c2051632..3f2c5c05359e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1792,8 +1792,7 @@ static int transaction_kthread(void *arg) now = ktime_get_seconds(); if (cur->state < TRANS_STATE_COMMIT_START && - (now < cur->start_time || - now - cur->start_time < fs_info->commit_interval)) { + now - cur->start_time < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); delay = msecs_to_jiffies(5000); goto sleep; -- cgit v1.2.3 From 643900bee4141e1b1c47dce93973a0d1a314d8a5 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 8 Oct 2020 15:24:29 +0300 Subject: btrfs: record delta directly in transaction_kthread Rename 'now' to 'delta' and store there the delta between transaction start time and current time. This is in preparation for optimising the sleep logic in the next patch. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f2c5c05359e..813d1d5e6927 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1774,7 +1774,7 @@ static int transaction_kthread(void *arg) struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; u64 transid; - time64_t now; + time64_t delta; unsigned long delay; bool cannot_commit; @@ -1790,9 +1790,9 @@ static int transaction_kthread(void *arg) goto sleep; } - now = ktime_get_seconds(); + delta = ktime_get_seconds() - cur->start_time; if (cur->state < TRANS_STATE_COMMIT_START && - now - cur->start_time < fs_info->commit_interval) { + delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); delay = msecs_to_jiffies(5000); goto sleep; -- cgit v1.2.3 From fb8a7e941b1b4c1c2fa79b305d4c3fc41ad9bbda Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 20 Oct 2020 12:44:17 +0300 Subject: btrfs: calculate more accurate remaining time to sleep in transaction_kthread If transaction_kthread is woken up before btrfs_fs_info::commit_interval seconds have elapsed it will sleep for a fixed period of 5 seconds. This is not a problem per-se but is not accurate. Instead the code should sleep for an interval which guarantees on next wakeup commit_interval would have passed. Since time tracking is not precise subtract 1 second from delta to ensure the delay we end up waiting will be longer than than the wake up period. 
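Taken together with the two preceding transaction_kthread patches, the early-wakeup path now sleeps for roughly the remainder of the commit interval instead of a flat 5 seconds. With the default 30 second commit interval, a wakeup 12 seconds into a transaction sleeps about 30 - (12 - 1) = 19 more seconds. A condensed sketch of the resulting logic (locking details and the commit path trimmed; this is not the verbatim end state of the series):

        delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
        ...
        spin_lock(&fs_info->trans_lock);
        cur = fs_info->running_transaction;
        if (cur) {
                time64_t delta = ktime_get_seconds() - cur->start_time;

                if (cur->state < TRANS_STATE_COMMIT_START &&
                    delta < fs_info->commit_interval) {
                        spin_unlock(&fs_info->trans_lock);
                        /* Sleep only for what is left of the interval. */
                        delay -= msecs_to_jiffies((delta - 1) * 1000);
                        delay = min(delay,
                                    msecs_to_jiffies(fs_info->commit_interval * 1000));
                        goto sleep;
                }
        }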
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 813d1d5e6927..e3caf18dcff4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1794,7 +1794,9 @@ static int transaction_kthread(void *arg) if (cur->state < TRANS_STATE_COMMIT_START && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); - delay = msecs_to_jiffies(5000); + delay -= msecs_to_jiffies((delta - 1) * 1000); + delay = min(delay, + msecs_to_jiffies(fs_info->commit_interval * 1000)); goto sleep; } transid = cur->transid; -- cgit v1.2.3 From eefa45f593792827771cfd845ab12fea5a7c7cd9 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 25 Sep 2020 15:36:38 -0500 Subject: btrfs: calculate num_pages, reserve_bytes once in btrfs_buffered_write write_bytes can change in btrfs_check_nocow_lock(). Calculate variables such as num_pages and reserve_bytes once we are sure of the value of write_bytes so there is no need to re-calculate. Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4373da7bcc0d..7c5e3d406574 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1591,8 +1591,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, size_t write_bytes = min(iov_iter_count(i), nrptrs * (size_t)PAGE_SIZE - offset); - size_t num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_SIZE); + size_t num_pages; size_t reserve_bytes; size_t dirty_pages; size_t copied; @@ -1600,8 +1599,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, size_t num_sectors; int extents_locked; - WARN_ON(num_pages > nrptrs); - /* * Fault pages before locking them in prepare_pages * to avoid recursive lock @@ -1613,35 +1610,28 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, only_release_metadata = false; sector_offset = pos & (fs_info->sectorsize - 1); - reserve_bytes = round_up(write_bytes + sector_offset, - fs_info->sectorsize); extent_changeset_release(data_reserved); ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, pos, write_bytes); if (ret < 0) { + /* + * If we don't have to COW at the offset, reserve + * metadata only. write_bytes may get smaller than + * requested here. + */ if (btrfs_check_nocow_lock(BTRFS_I(inode), pos, - &write_bytes) > 0) { - /* - * For nodata cow case, no need to reserve - * data space. - */ + &write_bytes) > 0) only_release_metadata = true; - /* - * our prealloc extent may be smaller than - * write_bytes, so scale down. 
- */ - num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_SIZE); - reserve_bytes = round_up(write_bytes + - sector_offset, - fs_info->sectorsize); - } else { + else break; - } } + num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); + WARN_ON(num_pages > nrptrs); + reserve_bytes = round_up(write_bytes + sector_offset, + fs_info->sectorsize); WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), reserve_bytes); -- cgit v1.2.3 From 949b32732eab33018283e0517cc528be10a3d085 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 15 Sep 2020 10:41:40 -0500 Subject: btrfs: use iosize while reading compressed pages While using compression, a submitted bio is mapped with a compressed bio which performs the read from disk, decompresses and returns uncompressed data to original bio. The original bio must reflect the uncompressed size (iosize) of the I/O to be performed, or else the page just gets the decompressed I/O length of data (disk_io_size). The compressed bio checks the extent map and gets the correct length while performing the I/O from disk. This came up in subpage work when only compressed length of the original bio was filled in the page. This worked correctly for pagesize == sectorsize because both compressed and uncompressed data are at pagesize boundaries, and would end up filling the requested page. Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 60f5f68d892d..14d01b76f5c9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3158,7 +3158,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int nr = 0; size_t pg_offset = 0; size_t iosize; - size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; unsigned long this_bio_flag = 0; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; @@ -3224,13 +3223,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); - if (this_bio_flag & EXTENT_BIO_COMPRESSED) { - disk_io_size = em->block_len; + if (this_bio_flag & EXTENT_BIO_COMPRESSED) offset = em->block_start; - } else { + else offset = em->block_start + extent_offset; - disk_io_size = iosize; - } block_start = em->block_start; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) block_start = EXTENT_MAP_HOLE; @@ -3319,7 +3315,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - page, offset, disk_io_size, + page, offset, iosize, pg_offset, bio, end_bio_extent_readpage, 0, *bio_flags, -- cgit v1.2.3 From 13f0dd8f786152404fa5bc1f9436aad83556a311 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 14 Oct 2020 09:55:44 -0500 Subject: btrfs: use round_down while calculating start position in btrfs_dirty_pages() round_down looks prettier than the bit mask operations. 
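round_down() and the open-coded mask are interchangeable here because fs_info->sectorsize is always a power of two. A standalone sketch of the identity, with arbitrary example values:

        #include <assert.h>
        #include <stdint.h>

        int main(void)
        {
                uint64_t sectorsize = 4096;     /* must be a power of two */
                uint64_t pos = 123456789;
                uint64_t masked  = pos & ~(sectorsize - 1);        /* old style  */
                uint64_t rounded = pos / sectorsize * sectorsize;  /* round_down */

                assert(masked == rounded);      /* both are 123453440 */
                return 0;
        }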
Reviewed-by: Nikolay Borisov Reviewed-by: Qu Wenruo Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7c5e3d406574..c7ed30edf1cd 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -474,7 +474,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, loff_t isize = i_size_read(&inode->vfs_inode); unsigned int extra_bits = 0; - start_pos = pos & ~((u64) fs_info->sectorsize - 1); + start_pos = round_down(pos, fs_info->sectorsize); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); -- cgit v1.2.3 From aa8c1a41a1e6108aa65c359efe90711337d03a16 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 14 Oct 2020 09:55:45 -0500 Subject: btrfs: set EXTENT_NORESERVE bits side btrfs_dirty_pages() Set the extent bits EXTENT_NORESERVE inside btrfs_dirty_pages() as opposed to calling set_extent_bits again later. Fold check for written length within the function. Note: EXTENT_NORESERVE is set before unlocking extents. Reviewed-by: Nikolay Borisov Signed-off-by: Goldwyn Rodrigues Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/file.c | 26 ++++++++++---------------- fs/btrfs/free-space-cache.c | 2 +- 3 files changed, 12 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0b29bdb25105..0738ec94d806 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3112,7 +3112,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached); + struct extent_state **cached, bool noreserve); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c7ed30edf1cd..5e2b0592abea 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -462,7 +462,7 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) */ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached) + struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int err = 0; @@ -474,6 +474,12 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, loff_t isize = i_size_read(&inode->vfs_inode); unsigned int extra_bits = 0; + if (write_bytes == 0) + return 0; + + if (noreserve) + extra_bits |= EXTENT_NORESERVE; + start_pos = round_down(pos, fs_info->sectorsize); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); @@ -1720,10 +1726,9 @@ again: release_bytes = round_up(copied + sector_offset, fs_info->sectorsize); - if (copied > 0) - ret = btrfs_dirty_pages(BTRFS_I(inode), pages, - dirty_pages, pos, copied, - &cached_state); + ret = btrfs_dirty_pages(BTRFS_I(inode), pages, + dirty_pages, pos, copied, + &cached_state, only_release_metadata); /* * If we have not locked the extent range, because the range's @@ -1748,17 +1753,6 @@ again: if (only_release_metadata) btrfs_check_nocow_unlock(BTRFS_I(inode)); - if (only_release_metadata && copied > 0) { - lockstart = round_down(pos, - fs_info->sectorsize); - lockend 
= round_up(pos + copied, - fs_info->sectorsize) - 1; - - set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_NORESERVE, NULL, - NULL, GFP_NOFS); - } - btrfs_drop_pages(pages, num_pages); cond_resched(); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index af0013d3df63..5ea36a06e514 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1332,7 +1332,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Everything is written out, now we dirty the pages in the file. */ ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, io_ctl->num_pages, 0, i_size_read(inode), - &cached_state); + &cached_state, false); if (ret) goto out_nospc; -- cgit v1.2.3 From a57ad681f12e1ec80365fc4693e12e979159b9d0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 12 Oct 2020 11:55:25 +0100 Subject: btrfs: assert we are holding the reada_lock when releasing a readahead zone When we drop the last reference of a zone, we end up releasing it through the callback reada_zone_release(), which deletes the zone from a device's reada_zones radix tree. This tree is protected by the global readahead lock at fs_info->reada_lock. Currently all places that are sure that they are dropping the last reference on a zone, are calling kref_put() in a critical section delimited by this lock, while all other places that are sure they are not dropping the last reference, do not bother calling kref_put() while holding that lock. When working on the previous fix for hangs and use-after-frees in the readahead code, my initial attempts were different and I actually ended up having reada_zone_release() called when not holding the lock, which resulted in weird and unexpected problems. So just add an assertion there to detect such problem more quickly and make the dependency more obvious. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/reada.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index d9a166eb344e..6e33cb755fa5 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -531,6 +531,8 @@ static void reada_zone_release(struct kref *kref) { struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); + lockdep_assert_held(&zone->device->fs_info->reada_lock); + radix_tree_delete(&zone->device->reada_zones, zone->end >> PAGE_SHIFT); -- cgit v1.2.3 From a6889caf6ec6ec32f19a02a9118410f39fc84fe2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 12 Oct 2020 11:55:26 +0100 Subject: btrfs: do not start readahead for csum tree when scrubbing non-data block groups When scrubbing a stripe of a block group we always start readahead for the checksums btree and wait for it to complete, however when the blockgroup is not a data block group (or a mixed block group) it is a waste of time to do it, since there are no checksums for metadata extents in that btree. So skip that when the block group does not have the data flag set, saving some time doing memory allocations, queueing a job in the readahead work queue, waiting for it to complete and potentially avoiding some IO as well (when csum tree extents are not in memory already). 
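The shape of the change below is a small but reusable pattern: the optional csum-tree readahead handle is either a valid pointer, an ERR_PTR() on failure, or NULL when it was skipped for a non-data block group, which is why the wait switches to IS_ERR_OR_NULL(). Condensed from the scrub.c hunk that follows (key setup omitted):

        reada2 = NULL;
        if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
                /* Only data (or mixed) block groups have csum tree entries. */
                reada2 = btrfs_reada_add(csum_root, &key, &key_end);
        }
        ...
        if (!IS_ERR_OR_NULL(reada2))    /* NULL == skipped, ERR_PTR == failed */
                btrfs_reada_wait(reada2);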
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e71e7586e9eb..04351b55dd14 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3084,17 +3084,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key_end.offset = (u64)-1; reada1 = btrfs_reada_add(root, &key, &key_end); - key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.type = BTRFS_EXTENT_CSUM_KEY; - key.offset = logical; - key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key_end.type = BTRFS_EXTENT_CSUM_KEY; - key_end.offset = logic_end; - reada2 = btrfs_reada_add(csum_root, &key, &key_end); + if (cache->flags & BTRFS_BLOCK_GROUP_DATA) { + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = logical; + key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_end.type = BTRFS_EXTENT_CSUM_KEY; + key_end.offset = logic_end; + reada2 = btrfs_reada_add(csum_root, &key, &key_end); + } else { + reada2 = NULL; + } if (!IS_ERR(reada1)) btrfs_reada_wait(reada1); - if (!IS_ERR(reada2)) + if (!IS_ERR_OR_NULL(reada2)) btrfs_reada_wait(reada2); -- cgit v1.2.3 From d70bf7484f728704454c0566814458bf6d6c7cf3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:13 -0400 Subject: btrfs: unify the ro checking for mount options We're going to be adding more options that require RDONLY, so add a helper to do the check and error out if we don't have RDONLY set. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8840a4fa81eb..f99e89ec46b2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -458,6 +458,17 @@ static const match_table_t rescue_tokens = { {Opt_err, NULL}, }; +static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt, + const char *opt_name) +{ + if (fs_info->mount_opt & opt) { + btrfs_err(fs_info, "%s must be used with ro mount option", + opt_name); + return true; + } + return false; +} + static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) { char *opts; @@ -968,14 +979,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } } check: - /* - * Extra check for current option against current flag - */ - if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) { - btrfs_err(info, - "nologreplay must be used with ro mount option"); + /* We're read-only, don't have to check. */ + if (new_flags & SB_RDONLY) + goto out; + + if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay")) ret = -EINVAL; - } out: if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && !btrfs_test_opt(info, FREE_SPACE_TREE) && -- cgit v1.2.3 From 334c16d82cfe180f7b262a6f8ae2d9379f032b18 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:14 -0400 Subject: btrfs: push the NODATASUM check into btrfs_lookup_bio_sums When we move to being able to handle NULL csum_roots it'll be cleaner to just check in btrfs_lookup_bio_sums instead of at all of the caller locations, so push the NODATASUM check into it as well so it's unified. 
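The net effect is a single early return at the top of the helper, so callers no longer need their own NODATASUM checks. Condensed from the file-item.c hunk below (a later patch in this series additionally bails out when fs_info->csum_root is NULL):

        /* At the top of btrfs_lookup_bio_sums(): */
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
                return BLK_STS_OK;      /* nothing to look up for this inode */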
Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 14 +++++--------- fs/btrfs/file-item.c | 3 +++ fs/btrfs/inode.c | 12 +++++++++--- 3 files changed, 17 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index eeface30facd..7e1eb57b923c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -722,11 +722,9 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ refcount_inc(&cb->pending_bios); - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(inode, comp_bio, - (u64)-1, sums); - BUG_ON(ret); /* -ENOMEM */ - } + ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, + sums); + BUG_ON(ret); /* -ENOMEM */ nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, fs_info->sectorsize); @@ -751,10 +749,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums); - BUG_ON(ret); /* -ENOMEM */ - } + ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); if (ret) { diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 8f4f2bd6d9b9..8083d71d6af6 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -272,6 +272,9 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, int count = 0; u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + return BLK_STS_OK; + path = btrfs_alloc_path(); if (!path) return BLK_STS_RESOURCE; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7e8d8169779d..c8cda5df9fb0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2202,7 +2202,12 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, mirror_num, bio_flags); goto out; - } else if (!skip_sum) { + } else { + /* + * Lookup bio sums does extra checks around whether we + * need to csum or not, which is why we ignore skip_sum + * here. + */ ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL); if (ret) goto out; @@ -7836,7 +7841,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, struct bio *dio_bio, loff_t file_offset) { const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); - const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK); @@ -7863,10 +7867,12 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, return BLK_QC_T_NONE; } - if (!write && csum) { + if (!write) { /* * Load the csums up front to reduce csum tree searches and * contention when submitting bios. + * + * If we have csums disabled this will do nothing. 
*/ status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset, dip->csums); -- cgit v1.2.3 From ceafe3cc39923fc80b9091f3fd993e6de1b6a399 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:15 -0400 Subject: btrfs: sysfs: export supported rescue= mount options We're going to be adding a variety of different rescue options, we should advertise which ones we support to make user spaces life easier in the future. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 8424f5d0e5ed..8f0462d6855d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -329,10 +329,32 @@ static ssize_t send_stream_version_show(struct kobject *kobj, } BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); +static const char *rescue_opts[] = { + "usebackuproot", + "nologreplay", +}; + +static ssize_t supported_rescue_options_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + ssize_t ret = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(rescue_opts); i++) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + (i ? " " : ""), rescue_opts[i]); + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + return ret; +} +BTRFS_ATTR(static_feature, supported_rescue_options, + supported_rescue_options_show); + static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), BTRFS_ATTR_PTR(static_feature, send_stream_version), + BTRFS_ATTR_PTR(static_feature, supported_rescue_options), NULL }; -- cgit v1.2.3 From ab0b4a3ebf145f54dc23b66a411f4bbbc59b0d96 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:16 -0400 Subject: btrfs: add a helper to print out rescue= options We're going to have a lot of rescue options, add a helper to collapse the /proc/mounts output to rescue=option1:option2:option3 format. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f99e89ec46b2..c5095a23befd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1392,11 +1392,18 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return btrfs_commit_transaction(trans); } +static void print_rescue_option(struct seq_file *seq, const char *s, bool *printed) +{ + seq_printf(seq, "%s%s", (*printed) ? 
":" : ",rescue=", s); + *printed = true; +} + static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); const char *compress_type; const char *subvol_name; + bool printed = false; if (btrfs_test_opt(info, DEGRADED)) seq_puts(seq, ",degraded"); @@ -1429,7 +1436,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, NOTREELOG)) seq_puts(seq, ",notreelog"); if (btrfs_test_opt(info, NOLOGREPLAY)) - seq_puts(seq, ",rescue=nologreplay"); + print_rescue_option(seq, "nologreplay", &printed); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) -- cgit v1.2.3 From 68319c18cb21ab472ce2c4ed572257a42455ac01 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:17 -0400 Subject: btrfs: show rescue=usebackuproot in /proc/mounts The standalone option usebackuproot was intended as one-time use and it was not necessary to keep it in the option list. Now that we're going to have more rescue options, it's desirable to keep them intact as it could be confusing why the option disappears. Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ remove the btrfs_clear_opt part from open_ctree ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 ------ fs/btrfs/super.c | 2 ++ 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e3caf18dcff4..137c4d5eaa8d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3384,12 +3384,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } set_bit(BTRFS_FS_OPEN, &fs_info->flags); - /* - * backuproot only affect mount behavior, and if open_ctree succeeded, - * no need to keep the flag - */ - btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); - return 0; fail_qgroup: diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c5095a23befd..b9d5d610682f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1437,6 +1437,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",notreelog"); if (btrfs_test_opt(info, NOLOGREPLAY)) print_rescue_option(seq, "nologreplay", &printed); + if (btrfs_test_opt(info, USEBACKUPROOT)) + print_rescue_option(seq, "usebackuproot", &printed); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) -- cgit v1.2.3 From 42437a6386ffeaaf200731e73d723ea491f3fe7d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:18 -0400 Subject: btrfs: introduce mount option rescue=ignorebadroots In the face of extent root corruption, or any other core fs wide root corruption we will fail to mount the file system. This makes recovery kind of a pain, because you need to fall back to userspace tools to scrape off data. Instead provide a mechanism to gracefully handle bad roots, so we can at least mount read-only and possibly recover data from the file system. 
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 48 +++++++++++++++++++++++++++++++++++++ fs/btrfs/block-rsv.c | 8 +++++++ fs/btrfs/compression.c | 2 +- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 65 +++++++++++++++++++++++++++++++------------------- fs/btrfs/file-item.c | 2 +- fs/btrfs/inode.c | 6 ++++- fs/btrfs/super.c | 12 +++++++++- fs/btrfs/sysfs.c | 1 + fs/btrfs/volumes.c | 13 ++++++++++ 10 files changed, 130 insertions(+), 28 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 3ba6f3839d39..bb6685711824 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1985,6 +1985,51 @@ error: return ret; } +static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct btrfs_space_info *space_info; + struct rb_node *node; + int ret = 0; + + for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { + struct extent_map *em; + struct map_lookup *map; + struct btrfs_block_group *bg; + + em = rb_entry(node, struct extent_map, rb_node); + map = em->map_lookup; + bg = btrfs_create_block_group_cache(fs_info, em->start); + if (!bg) { + ret = -ENOMEM; + break; + } + + /* Fill dummy cache as FULL */ + bg->length = em->len; + bg->flags = map->type; + bg->last_byte_to_unpin = (u64)-1; + bg->cached = BTRFS_CACHE_FINISHED; + bg->used = em->len; + bg->flags = map->type; + ret = btrfs_add_block_group_cache(fs_info, bg); + if (ret) { + btrfs_remove_free_space_cache(bg); + btrfs_put_block_group(bg); + break; + } + btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, + 0, &space_info); + bg->space_info = space_info; + link_block_group(bg); + + set_avail_alloc_bits(fs_info, bg->flags); + } + if (!ret) + btrfs_init_global_block_rsv(fs_info); + return ret; +} + int btrfs_read_block_groups(struct btrfs_fs_info *info) { struct btrfs_path *path; @@ -1995,6 +2040,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) int need_clear = 0; u64 cache_gen; + if (!info->extent_root) + return fill_dummy_bgs(info); + key.objectid = 0; key.offset = 0; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index bc920afe23bf..04a6226e0388 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -426,6 +426,14 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->delayed_block_rsv.space_info = space_info; fs_info->delayed_refs_rsv.space_info = space_info; + /* + * Our various recovery options can leave us with NULL roots, so check + * here and just bail before we go dereferencing NULLs everywhere. 
+ */ + if (!fs_info->extent_root || !fs_info->csum_root || + !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root) + return; + fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 7e1eb57b923c..972fb68a85ac 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -150,7 +150,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, struct compressed_bio *cb = bio->bi_private; u8 *cb_sum = cb->sums; - if (inode->flags & BTRFS_INODE_NODATASUM) + if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM)) return 0; shash->tfm = fs_info->csum_shash; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0738ec94d806..683dcb58eaa9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1298,6 +1298,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) #define BTRFS_MOUNT_REF_VERIFY (1 << 28) #define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29) +#define BTRFS_MOUNT_IGNOREBADROOTS (1 << 30) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 137c4d5eaa8d..d229f6c25f29 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2307,30 +2307,39 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->extent_root = root; } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->extent_root = root; location.objectid = BTRFS_DEV_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->dev_root = root; + btrfs_init_devices_late(fs_info); } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->dev_root = root; - btrfs_init_devices_late(fs_info); location.objectid = BTRFS_CSUM_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->csum_root = root; } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->csum_root = root; /* * This tree can share blocks with some other fs tree during relocation @@ -2339,11 +2348,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) root = btrfs_get_fs_root(tree_root->fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID, true); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->data_reloc_root = root; } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->data_reloc_root = root; location.objectid = BTRFS_QUOTA_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); @@ -2356,9 +2368,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) location.objectid = 
BTRFS_UUID_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - if (ret != -ENOENT) - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + if (ret != -ENOENT) + goto out; + } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->uuid_root = root; @@ -2368,11 +2382,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->free_space_root = root; } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->free_space_root = root; } return 0; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 8083d71d6af6..816f57d52fc9 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -272,7 +272,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, int count = 0; u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) return BLK_STS_OK; path = btrfs_alloc_path(); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c8cda5df9fb0..21a354dad6f2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2187,7 +2187,8 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int skip_sum; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); - skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || + !fs_info->csum_root; if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; @@ -2902,6 +2903,9 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; + if (!root->fs_info->csum_root) + return 0; + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b9d5d610682f..9bc46c048978 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -360,6 +360,7 @@ enum { Opt_rescue, Opt_usebackuproot, Opt_nologreplay, + Opt_ignorebadroots, /* Deprecated options */ Opt_recovery, @@ -455,6 +456,8 @@ static const match_table_t tokens = { static const match_table_t rescue_tokens = { {Opt_usebackuproot, "usebackuproot"}, {Opt_nologreplay, "nologreplay"}, + {Opt_ignorebadroots, "ignorebadroots"}, + {Opt_ignorebadroots, "ibadroots"}, {Opt_err, NULL}, }; @@ -498,6 +501,10 @@ static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) btrfs_set_and_info(info, NOLOGREPLAY, "disabling log replay at mount time"); break; + case Opt_ignorebadroots: + btrfs_set_and_info(info, IGNOREBADROOTS, + "ignoring bad roots"); + break; case Opt_err: btrfs_info(info, "unrecognized rescue option '%s'", p); ret = -EINVAL; @@ -983,7 +990,8 @@ check: if (new_flags & SB_RDONLY) goto out; - if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay")) + if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || + check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots")) ret = -EINVAL; out: if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && 
@@ -1439,6 +1447,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) print_rescue_option(seq, "nologreplay", &printed); if (btrfs_test_opt(info, USEBACKUPROOT)) print_rescue_option(seq, "usebackuproot", &printed); + if (btrfs_test_opt(info, IGNOREBADROOTS)) + print_rescue_option(seq, "ignorebadroots", &printed); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 8f0462d6855d..e9f482989415 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -332,6 +332,7 @@ BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); static const char *rescue_opts[] = { "usebackuproot", "nologreplay", + "ignorebadroots", }; static ssize_t supported_rescue_options_show(struct kobject *kobj, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 78637665166e..07c6b0c85339 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7659,6 +7659,19 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) u64 prev_dev_ext_end = 0; int ret = 0; + /* + * We don't have a dev_root because we mounted with ignorebadroots and + * failed to load the root, so we want to skip the verification in this + * case for sure. + * + * However if the dev root is fine, but the tree itself is corrupted + * we'd still fail to mount. This verification is only to make sure + * writes can happen safely, so instead just bypass this check + * completely in the case of IGNOREBADROOTS. + */ + if (btrfs_test_opt(fs_info, IGNOREBADROOTS)) + return 0; + key.objectid = 1; key.type = BTRFS_DEV_EXTENT_KEY; key.offset = 0; -- cgit v1.2.3 From 882dbe0cec9651bf6a6df500178149453726c1e1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:19 -0400 Subject: btrfs: introduce mount option rescue=ignoredatacsums There are cases where you can end up with bad data csums because of misbehaving applications. This happens when an application modifies a buffer in-flight when doing an O_DIRECT write. In order to recover the file we need a way to turn off data checksums so you can copy the file off, and then you can delete the file and restore it properly later. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 21 ++++++++++++--------- fs/btrfs/super.c | 12 +++++++++++- fs/btrfs/sysfs.c | 1 + 4 files changed, 25 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 683dcb58eaa9..39e4c35e965a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1299,6 +1299,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_REF_VERIFY (1 << 28) #define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29) #define BTRFS_MOUNT_IGNOREBADROOTS (1 << 30) +#define BTRFS_MOUNT_IGNOREDATACSUMS (1 << 31) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d229f6c25f29..646dde5883a6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2329,16 +2329,19 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) btrfs_init_devices_late(fs_info); } - location.objectid = BTRFS_CSUM_TREE_OBJECTID; - root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - ret = PTR_ERR(root); - goto out; + /* If IGNOREDATACSUMS is set don't bother reading the csum root. 
*/ + if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { + location.objectid = BTRFS_CSUM_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->csum_root = root; } - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->csum_root = root; } /* diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9bc46c048978..6bbd4f04b28d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -361,6 +361,7 @@ enum { Opt_usebackuproot, Opt_nologreplay, Opt_ignorebadroots, + Opt_ignoredatacsums, /* Deprecated options */ Opt_recovery, @@ -458,6 +459,8 @@ static const match_table_t rescue_tokens = { {Opt_nologreplay, "nologreplay"}, {Opt_ignorebadroots, "ignorebadroots"}, {Opt_ignorebadroots, "ibadroots"}, + {Opt_ignoredatacsums, "ignoredatacsums"}, + {Opt_ignoredatacsums, "idatacsums"}, {Opt_err, NULL}, }; @@ -505,6 +508,10 @@ static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) btrfs_set_and_info(info, IGNOREBADROOTS, "ignoring bad roots"); break; + case Opt_ignoredatacsums: + btrfs_set_and_info(info, IGNOREDATACSUMS, + "ignoring data csums"); + break; case Opt_err: btrfs_info(info, "unrecognized rescue option '%s'", p); ret = -EINVAL; @@ -991,7 +998,8 @@ check: goto out; if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || - check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots")) + check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || + check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")) ret = -EINVAL; out: if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && @@ -1449,6 +1457,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) print_rescue_option(seq, "usebackuproot", &printed); if (btrfs_test_opt(info, IGNOREBADROOTS)) print_rescue_option(seq, "ignorebadroots", &printed); + if (btrfs_test_opt(info, IGNOREDATACSUMS)) + print_rescue_option(seq, "ignoredatacsums", &printed); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e9f482989415..86f70a60447b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -333,6 +333,7 @@ static const char *rescue_opts[] = { "usebackuproot", "nologreplay", "ignorebadroots", + "ignoredatacsums", }; static ssize_t supported_rescue_options_show(struct kobject *kobj, -- cgit v1.2.3 From 9037d3cbcbe1ed9c409efe4d6d3efd893d7ff05e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:20 -0400 Subject: btrfs: introduce mount option rescue=all Now that we have the building blocks for some better recovery options with corrupted file systems, add a rescue=all option to enable all of the relevant rescue options. This will allow distros to simply default to rescue=all for the "oh dear lord the world's on fire" recovery without needing to know all the different options that we have and may add in the future. 
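In effect rescue=all is shorthand for the individual rescue options introduced earlier in the series; the parser case (condensed from the super.c hunk below) simply sets all three flags. Since each of them requires a read-only mount, the typical invocation would be something like mount -o ro,rescue=all:

        case Opt_rescue_all:
                btrfs_info(info, "enabling all of the rescue options");
                btrfs_set_and_info(info, IGNOREDATACSUMS, "ignoring data csums");
                btrfs_set_and_info(info, IGNOREBADROOTS, "ignoring bad roots");
                btrfs_set_and_info(info, NOLOGREPLAY,
                                   "disabling log replay at mount time");
                break;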
Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 11 +++++++++++ fs/btrfs/sysfs.c | 1 + 2 files changed, 12 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6bbd4f04b28d..1ffa50bae1dd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -362,6 +362,7 @@ enum { Opt_nologreplay, Opt_ignorebadroots, Opt_ignoredatacsums, + Opt_rescue_all, /* Deprecated options */ Opt_recovery, @@ -461,6 +462,7 @@ static const match_table_t rescue_tokens = { {Opt_ignorebadroots, "ibadroots"}, {Opt_ignoredatacsums, "ignoredatacsums"}, {Opt_ignoredatacsums, "idatacsums"}, + {Opt_rescue_all, "all"}, {Opt_err, NULL}, }; @@ -512,6 +514,15 @@ static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) btrfs_set_and_info(info, IGNOREDATACSUMS, "ignoring data csums"); break; + case Opt_rescue_all: + btrfs_info(info, "enabling all of the rescue options"); + btrfs_set_and_info(info, IGNOREDATACSUMS, + "ignoring data csums"); + btrfs_set_and_info(info, IGNOREBADROOTS, + "ignoring bad roots"); + btrfs_set_and_info(info, NOLOGREPLAY, + "disabling log replay at mount time"); + break; case Opt_err: btrfs_info(info, "unrecognized rescue option '%s'", p); ret = -EINVAL; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 86f70a60447b..fcd6c7a9bbd1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -334,6 +334,7 @@ static const char *rescue_opts[] = { "nologreplay", "ignorebadroots", "ignoredatacsums", + "all", }; static ssize_t supported_rescue_options_show(struct kobject *kobj, -- cgit v1.2.3 From ecdcf3c259e4c36ec6c81e7a807b4924be898b20 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 22 Oct 2020 18:40:46 +0300 Subject: btrfs: open code insert_orphan_item Just open code it in its sole caller and remove a level of indirection. Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 56cbc1706b6f..135cb40295c1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1564,18 +1564,6 @@ out: return ret; } -static int insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 ino) -{ - int ret; - - ret = btrfs_insert_orphan_item(trans, root, ino); - if (ret == -EEXIST) - ret = 0; - - return ret; -} - static int count_inode_extrefs(struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path) { @@ -1727,7 +1715,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (ret) goto out; } - ret = insert_orphan_item(trans, root, ino); + ret = btrfs_insert_orphan_item(trans, root, ino); + if (ret == -EEXIST) + ret = 0; } out: -- cgit v1.2.3 From 196d59ab9ccc975d8d29292845d227cdf4423ef8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:09 -0400 Subject: btrfs: switch extent buffer tree lock to rw_semaphore Historically we've implemented our own locking because we wanted to be able to selectively spin or sleep based on what we were doing in the tree. For instance, if all of our nodes were in cache then there's rarely a reason to need to sleep waiting for node locks, as they'll likely become available soon. At the time this code was written the rw_semaphore didn't do adaptive spinning, and thus was orders of magnitude slower than our home grown locking. 
However now the opposite is the case. There are a few problems with how we implement blocking locks, namely that we use a normal waitqueue and simply wake everybody up in reverse sleep order. This leads to some suboptimal performance behavior, and a lot of context switches in highly contended cases. The rw_semaphores actually do this properly, and also have adaptive spinning that works relatively well. The locking code is also a bit of a bear to understand, and we lose the benefit of lockdep for the most part because the blocking states of the lock are simply ad-hoc and not mapped into lockdep. So rework the locking code to drop all of this custom locking stuff, and simply use a rw_semaphore for everything. This makes the locking much simpler for everything, as we can now drop a lot of cruft and blocking transitions. The performance numbers vary depending on the workload, because generally speaking there doesn't tend to be a lot of contention on the btree. However, on my test system which is an 80 core single socket system with 256GiB of RAM and a 2TiB NVMe drive I get the following results (with all debug options off): dbench 200 baseline Throughput 216.056 MB/sec 200 clients 200 procs max_latency=1471.197 ms dbench 200 with patch Throughput 737.188 MB/sec 200 clients 200 procs max_latency=714.346 ms Previously we also used fs_mark to test this sort of contention, and those results are far less impressive, mostly because there's not enough tasks to really stress the locking fs_mark -d /d[0-15] -S 0 -L 20 -n 100000 -s 0 -t 16 baseline Average Files/sec: 160166.7 p50 Files/sec: 165832 p90 Files/sec: 123886 p99 Files/sec: 123495 real 3m26.527s user 2m19.223s sys 48m21.856s patched Average Files/sec: 164135.7 p50 Files/sec: 171095 p90 Files/sec: 122889 p99 Files/sec: 113819 real 3m29.660s user 2m19.990s sys 44m12.259s Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 13 +- fs/btrfs/extent_io.h | 21 +-- fs/btrfs/locking.c | 374 +++++++++----------------------------------------- fs/btrfs/locking.h | 2 +- fs/btrfs/print-tree.c | 11 +- 5 files changed, 70 insertions(+), 351 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 14d01b76f5c9..cf3f576d6277 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4946,12 +4946,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->len = len; eb->fs_info = fs_info; eb->bflags = 0; - rwlock_init(&eb->lock); - atomic_set(&eb->blocking_readers, 0); - eb->blocking_writers = 0; + init_rwsem(&eb->lock); eb->lock_recursed = false; - init_waitqueue_head(&eb->write_lock_wq); - init_waitqueue_head(&eb->read_lock_wq); btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, &fs_info->allocated_ebs); @@ -4967,13 +4963,6 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, > MAX_INLINE_EXTENT_BUFFER_SIZE); BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); -#ifdef CONFIG_BTRFS_DEBUG - eb->spinning_writers = 0; - atomic_set(&eb->spinning_readers, 0); - atomic_set(&eb->read_locks, 0); - eb->write_locks = 0; -#endif - return eb; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index f39d02e7f7ef..3801eb3b726e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -87,31 +87,14 @@ struct extent_buffer { int read_mirror; struct rcu_head rcu_head; pid_t lock_owner; - - int blocking_writers; - atomic_t blocking_readers; bool lock_recursed; + struct rw_semaphore lock; + /* >= 0 if eb belongs to a log tree, -1 
otherwise */ short log_index; - /* protects write locks */ - rwlock_t lock; - - /* readers use lock_wq while they wait for the write - * lock holders to unlock - */ - wait_queue_head_t write_lock_wq; - - /* writers use read_lock_wq while they wait for readers - * to unlock - */ - wait_queue_head_t read_lock_wq; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG - int spinning_writers; - atomic_t spinning_readers; - atomic_t read_locks; - int write_locks; struct list_head leak_list; #endif }; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 66e02ebdd340..60e0f00b9b8f 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -17,44 +17,17 @@ * Extent buffer locking * ===================== * - * The locks use a custom scheme that allows to do more operations than are - * available fromt current locking primitives. The building blocks are still - * rwlock and wait queues. - * - * Required semantics: + * We use a rw_semaphore for tree locking, and the semantics are exactly the + * same: * * - reader/writer exclusion * - writer/writer exclusion * - reader/reader sharing - * - spinning lock semantics - * - blocking lock semantics * - try-lock semantics for readers and writers - * - one level nesting, allowing read lock to be taken by the same thread that - * already has write lock - * - * The extent buffer locks (also called tree locks) manage access to eb data - * related to the storage in the b-tree (keys, items, but not the individual - * members of eb). - * We want concurrency of many readers and safe updates. The underlying locking - * is done by read-write spinlock and the blocking part is implemented using - * counters and wait queues. - * - * spinning semantics - the low-level rwlock is held so all other threads that - * want to take it are spinning on it. - * - * blocking semantics - the low-level rwlock is not held but the counter - * denotes how many times the blocking lock was held; - * sleeping is possible - * - * Write lock always allows only one thread to access the data. - * * - * Debugging - * --------- - * - * There are additional state counters that are asserted in various contexts, - * removed from non-debug build to reduce extent_buffer size and for - * performance reasons. + * Additionally we need one level nesting recursion, see below. The rwsem + * implementation does opportunistic spinning which reduces number of times the + * locking task needs to sleep. * * * Lock recursion @@ -75,115 +48,8 @@ * btrfs_lookup_file_extent * btrfs_search_slot * - * - * Locking pattern - spinning - * -------------------------- - * - * The simple locking scenario, the +--+ denotes the spinning section. - * - * +- btrfs_tree_lock - * | - extent_buffer::rwlock is held - * | - no heavy operations should happen, eg. IO, memory allocations, large - * | structure traversals - * +- btrfs_tree_unock -* -* - * Locking pattern - blocking - * -------------------------- - * - * The blocking write uses the following scheme. The +--+ denotes the spinning - * section. - * - * +- btrfs_tree_lock - * | - * +- btrfs_set_lock_blocking_write - * - * - allowed: IO, memory allocations, etc. - * - * -- btrfs_tree_unlock - note, no explicit unblocking necessary - * - * - * Blocking read is similar. 
- * - * +- btrfs_tree_read_lock - * | - * +- btrfs_set_lock_blocking_read - * - * - heavy operations allowed - * - * +- btrfs_tree_read_unlock_blocking - * | - * +- btrfs_tree_read_unlock - * */ -#ifdef CONFIG_BTRFS_DEBUG -static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) -{ - WARN_ON(eb->spinning_writers); - eb->spinning_writers++; -} - -static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) -{ - WARN_ON(eb->spinning_writers != 1); - eb->spinning_writers--; -} - -static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) -{ - WARN_ON(eb->spinning_writers); -} - -static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) -{ - atomic_inc(&eb->spinning_readers); -} - -static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) -{ - WARN_ON(atomic_read(&eb->spinning_readers) == 0); - atomic_dec(&eb->spinning_readers); -} - -static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) -{ - atomic_inc(&eb->read_locks); -} - -static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) -{ - atomic_dec(&eb->read_locks); -} - -static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) -{ - BUG_ON(!atomic_read(&eb->read_locks)); -} - -static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) -{ - eb->write_locks++; -} - -static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) -{ - eb->write_locks--; -} - -#else -static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { } -static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { } -static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { } -static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { } -static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { } -static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { } -static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { } -static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { } -static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { } -static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { } -#endif - /* * Mark already held read lock as blocking. Can be nested in write lock by the * same thread. @@ -195,18 +61,6 @@ static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { } */ void btrfs_set_lock_blocking_read(struct extent_buffer *eb) { - trace_btrfs_set_lock_blocking_read(eb); - /* - * No lock is required. The lock owner may change if we have a read - * lock, but it won't change to or away from us. If we have the write - * lock, we are the owner and it'll never change. - */ - if (eb->lock_recursed && current->pid == eb->lock_owner) - return; - btrfs_assert_tree_read_locked(eb); - atomic_inc(&eb->blocking_readers); - btrfs_assert_spinning_readers_put(eb); - read_unlock(&eb->lock); } /* @@ -219,30 +73,20 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb) */ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) { - trace_btrfs_set_lock_blocking_write(eb); - /* - * No lock is required. The lock owner may change if we have a read - * lock, but it won't change to or away from us. If we have the write - * lock, we are the owner and it'll never change. 
- */ - if (eb->lock_recursed && current->pid == eb->lock_owner) - return; - if (eb->blocking_writers == 0) { - btrfs_assert_spinning_writers_put(eb); - btrfs_assert_tree_locked(eb); - WRITE_ONCE(eb->blocking_writers, 1); - write_unlock(&eb->lock); - } } /* - * Lock the extent buffer for read. Wait for any writers (spinning or blocking). - * Can be nested in write lock by the same thread. + * __btrfs_tree_read_lock - lock extent buffer for read + * @eb: the eb to be locked + * @nest: the nesting level to be used for lockdep + * @recurse: if this lock is able to be recursed * - * Use when the locked section does only lightweight actions and busy waiting - * would be cheaper than making other threads do the wait/wake loop. + * This takes the read lock on the extent buffer, using the specified nesting + * level for lockdep purposes. * - * The rwlock is held upon exit. + * If you specify recurse = true, then we will allow this to be taken if we + * currently own the lock already. This should only be used in specific + * usecases, and the subsequent unlock will not change the state of the lock. */ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, bool recurse) @@ -251,33 +95,33 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne if (trace_btrfs_tree_read_lock_enabled()) start_ns = ktime_get_ns(); -again: - read_lock(&eb->lock); - BUG_ON(eb->blocking_writers == 0 && - current->pid == eb->lock_owner); - if (eb->blocking_writers) { - if (current->pid == eb->lock_owner) { - /* - * This extent is already write-locked by our thread. - * We allow an additional read lock to be added because - * it's for the same thread. btrfs_find_all_roots() - * depends on this as it may be called on a partly - * (write-)locked tree. - */ - WARN_ON(!recurse); - BUG_ON(eb->lock_recursed); - eb->lock_recursed = true; - read_unlock(&eb->lock); - trace_btrfs_tree_read_lock(eb, start_ns); - return; + + if (unlikely(recurse)) { + /* First see if we can grab the lock outright */ + if (down_read_trylock(&eb->lock)) + goto out; + + /* + * Ok still doesn't necessarily mean we are already holding the + * lock, check the owner. + */ + if (eb->lock_owner != current->pid) { + down_read_nested(&eb->lock, nest); + goto out; } - read_unlock(&eb->lock); - wait_event(eb->write_lock_wq, - READ_ONCE(eb->blocking_writers) == 0); - goto again; + + /* + * Ok we have actually recursed, but we should only be recursing + * once, so blow up if we're already recursed, otherwise set + * ->lock_recursed and carry on. + */ + BUG_ON(eb->lock_recursed); + eb->lock_recursed = true; + goto out; } - btrfs_assert_tree_read_locks_get(eb); - btrfs_assert_spinning_readers_get(eb); + down_read_nested(&eb->lock, nest); +out: + eb->lock_owner = current->pid; trace_btrfs_tree_read_lock(eb, start_ns); } @@ -294,74 +138,42 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) */ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) { - if (READ_ONCE(eb->blocking_writers)) - return 0; - - read_lock(&eb->lock); - /* Refetch value after lock */ - if (READ_ONCE(eb->blocking_writers)) { - read_unlock(&eb->lock); - return 0; - } - btrfs_assert_tree_read_locks_get(eb); - btrfs_assert_spinning_readers_get(eb); - trace_btrfs_tree_read_lock_atomic(eb); - return 1; + return btrfs_try_tree_read_lock(eb); } /* - * Try-lock for read. Don't block or wait for contending writers. + * Try-lock for read. 
* * Retrun 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_read_lock(struct extent_buffer *eb) { - if (READ_ONCE(eb->blocking_writers)) - return 0; - - if (!read_trylock(&eb->lock)) - return 0; - - /* Refetch value after lock */ - if (READ_ONCE(eb->blocking_writers)) { - read_unlock(&eb->lock); - return 0; + if (down_read_trylock(&eb->lock)) { + eb->lock_owner = current->pid; + trace_btrfs_try_tree_read_lock(eb); + return 1; } - btrfs_assert_tree_read_locks_get(eb); - btrfs_assert_spinning_readers_get(eb); - trace_btrfs_try_tree_read_lock(eb); - return 1; + return 0; } /* - * Try-lock for write. May block until the lock is uncontended, but does not - * wait until it is free. + * Try-lock for write. * * Retrun 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_write_lock(struct extent_buffer *eb) { - if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) - return 0; - - write_lock(&eb->lock); - /* Refetch value after lock */ - if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) { - write_unlock(&eb->lock); - return 0; + if (down_write_trylock(&eb->lock)) { + eb->lock_owner = current->pid; + trace_btrfs_try_tree_write_lock(eb); + return 1; } - btrfs_assert_tree_write_locks_get(eb); - btrfs_assert_spinning_writers_get(eb); - eb->lock_owner = current->pid; - trace_btrfs_try_tree_write_lock(eb); - return 1; + return 0; } /* - * Release read lock. Must be used only if the lock is in spinning mode. If - * the read lock is nested, must pair with read lock before the write unlock. - * - * The rwlock is not held upon exit. + * Release read lock. If the read lock was recursed then the lock stays in the + * original state that it was before it was recursively locked. */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { @@ -376,10 +188,8 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) eb->lock_recursed = false; return; } - btrfs_assert_tree_read_locked(eb); - btrfs_assert_spinning_readers_put(eb); - btrfs_assert_tree_read_locks_put(eb); - read_unlock(&eb->lock); + eb->lock_owner = 0; + up_read(&eb->lock); } /* @@ -391,30 +201,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) { - trace_btrfs_tree_read_unlock_blocking(eb); - /* - * if we're nested, we have the write lock. No new locking - * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_recursed - * field only matters to the lock owner. - */ - if (eb->lock_recursed && current->pid == eb->lock_owner) { - eb->lock_recursed = false; - return; - } - btrfs_assert_tree_read_locked(eb); - WARN_ON(atomic_read(&eb->blocking_readers) == 0); - /* atomic_dec_and_test implies a barrier */ - if (atomic_dec_and_test(&eb->blocking_readers)) - cond_wake_up_nomb(&eb->read_lock_wq); - btrfs_assert_tree_read_locks_put(eb); + btrfs_tree_read_unlock(eb); } /* - * Lock for write. Wait for all blocking and spinning readers and writers. This - * starts context where reader lock could be nested by the same thread. + * __btrfs_tree_lock - lock eb for write + * @eb: the eb to lock + * @nest: the nesting to use for the lock * - * The rwlock is held for write upon exit. + * Returns with the eb->lock write locked. 
*/ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) __acquires(&eb->lock) @@ -424,19 +219,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) if (trace_btrfs_tree_lock_enabled()) start_ns = ktime_get_ns(); - WARN_ON(eb->lock_owner == current->pid); -again: - wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); - wait_event(eb->write_lock_wq, READ_ONCE(eb->blocking_writers) == 0); - write_lock(&eb->lock); - /* Refetch value after lock */ - if (atomic_read(&eb->blocking_readers) || - READ_ONCE(eb->blocking_writers)) { - write_unlock(&eb->lock); - goto again; - } - btrfs_assert_spinning_writers_get(eb); - btrfs_assert_tree_write_locks_get(eb); + down_write_nested(&eb->lock, nest); eb->lock_owner = current->pid; trace_btrfs_tree_lock(eb, start_ns); } @@ -447,42 +230,13 @@ void btrfs_tree_lock(struct extent_buffer *eb) } /* - * Release the write lock, either blocking or spinning (ie. there's no need - * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking). - * This also ends the context for nesting, the read lock must have been - * released already. - * - * Tasks blocked and waiting are woken, rwlock is not held upon exit. + * Release the write lock. */ void btrfs_tree_unlock(struct extent_buffer *eb) { - /* - * This is read both locked and unlocked but always by the same thread - * that already owns the lock so we don't need to use READ_ONCE - */ - int blockers = eb->blocking_writers; - - BUG_ON(blockers > 1); - - btrfs_assert_tree_locked(eb); trace_btrfs_tree_unlock(eb); eb->lock_owner = 0; - btrfs_assert_tree_write_locks_put(eb); - - if (blockers) { - btrfs_assert_no_spinning_writers(eb); - /* Unlocked write */ - WRITE_ONCE(eb->blocking_writers, 0); - /* - * We need to order modifying blocking_writers above with - * actually waking up the sleepers to ensure they see the - * updated value of blocking_writers - */ - cond_wake_up(&eb->write_lock_wq); - } else { - btrfs_assert_spinning_writers_put(eb); - write_unlock(&eb->lock); - } + up_write(&eb->lock); } /* diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 3ea81ed3320b..7c27f142f7d2 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -110,7 +110,7 @@ static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root #ifdef CONFIG_BTRFS_DEBUG static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { - BUG_ON(!eb->write_locks); + lockdep_assert_held(&eb->lock); } #else static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { } diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 7695c4783d33..48b57dd57436 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -191,15 +191,8 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, static void print_eb_refs_lock(struct extent_buffer *eb) { #ifdef CONFIG_BTRFS_DEBUG - btrfs_info(eb->fs_info, -"refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u", - atomic_read(&eb->refs), eb->write_locks, - atomic_read(&eb->read_locks), - eb->blocking_writers, - atomic_read(&eb->blocking_readers), - eb->spinning_writers, - atomic_read(&eb->spinning_readers), - eb->lock_owner, current->pid); + btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u", + atomic_read(&eb->refs), eb->lock_owner, current->pid); #endif } -- cgit v1.2.3 From 88090ad36a64af1eb5b78d26b2ccd07eedae80b5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 27 Oct 2020 12:40:06 +0000 Subject: btrfs: do not start and wait 
for delalloc on snapshot roots on transaction commit We do not need anymore to start writeback for delalloc of roots that are being snapshotted and wait for it to complete. This was done in commit 609e804d771f59 ("Btrfs: fix file corruption after snapshotting due to mix of buffered/DIO writes") to fix a type of file corruption where files in a snapshot end up having their i_size updated in a non-ordered way, leaving implicit file holes, when buffered IO writes that increase a file's size are followed by direct IO writes that also increase the file's size. This is not needed anymore because we now have a more generic mechanism to prevent a non-ordered i_size update since commit 9ddc959e802bf7 ("btrfs: use the file extent tree infrastructure"), which addresses this scenario involving snapshots as well. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 49 ++++++------------------------------------------- 1 file changed, 6 insertions(+), 43 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 52ada47aff50..8f70d7135497 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1956,10 +1956,8 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) } } -static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans) +static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = trans->fs_info; - /* * We use writeback_inodes_sb here because if we used * btrfs_start_delalloc_roots we would deadlock with fs freeze. @@ -1969,50 +1967,15 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans) * from already being in a transaction and our join_transaction doesn't * have to re-take the fs freeze lock. */ - if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) { + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); - } else { - struct btrfs_pending_snapshot *pending; - struct list_head *head = &trans->transaction->pending_snapshots; - - /* - * Flush dellaloc for any root that is going to be snapshotted. - * This is done to avoid a corrupted version of files, in the - * snapshots, that had both buffered and direct IO writes (even - * if they were done sequentially) due to an unordered update of - * the inode's size on disk. - */ - list_for_each_entry(pending, head, list) { - int ret; - - ret = btrfs_start_delalloc_snapshot(pending->root); - if (ret) - return ret; - } - } return 0; } -static inline void btrfs_wait_delalloc_flush(struct btrfs_trans_handle *trans) +static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = trans->fs_info; - - if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) { + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - } else { - struct btrfs_pending_snapshot *pending; - struct list_head *head = &trans->transaction->pending_snapshots; - - /* - * Wait for any dellaloc that we started previously for the roots - * that are going to be snapshotted. This is to avoid a corrupted - * version of files in the snapshots that had both buffered and - * direct IO writes (even if they were done sequentially). 
- */ - list_for_each_entry(pending, head, list) - btrfs_wait_ordered_extents(pending->root, - U64_MAX, 0, U64_MAX); - } } int btrfs_commit_transaction(struct btrfs_trans_handle *trans) @@ -2150,7 +2113,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) extwriter_counter_dec(cur_trans, trans->type); - ret = btrfs_start_delalloc_flush(trans); + ret = btrfs_start_delalloc_flush(fs_info); if (ret) goto cleanup_transaction; @@ -2166,7 +2129,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto cleanup_transaction; - btrfs_wait_delalloc_flush(trans); + btrfs_wait_delalloc_flush(fs_info); /* * Wait for all ordered extents started by a fast fsync that joined this -- cgit v1.2.3 From aaefed207875a0f0c46c4a50dcd0aca0d56b9062 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 28 Oct 2020 21:14:45 +0800 Subject: btrfs: add helper for string match ignoring leading/trailing whitespace Add a generic helper to match the string in a given buffer, and ignore the leading and trailing whitespace. Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba [ rename variables, add comments ] Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index fcd6c7a9bbd1..b2f653eb46cb 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -888,6 +888,25 @@ static ssize_t btrfs_generation_show(struct kobject *kobj, } BTRFS_ATTR(, generation, btrfs_generation_show); +/* + * Look for an exact string @string in @buffer with possible leading or + * trailing whitespace + */ +static bool strmatch(const char *buffer, const char *string) +{ + const size_t len = strlen(string); + + /* Skip leading whitespace */ + buffer = skip_spaces(buffer); + + /* Match entire string, check if the rest is whitespace or empty */ + if (strncmp(string, buffer, len) == 0 && + strlen(skip_spaces(buffer + len)) == 0) + return true; + + return false; +} + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), -- cgit v1.2.3 From 33fd2f714cde793b337b4ca6aaf3a8c68675d74f Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 28 Oct 2020 21:14:46 +0800 Subject: btrfs: create read policy framework As of now, we use the pid method to read striped mirrored data, which means process id determines the stripe id to read. This type of routing typically helps in a system with many small independent processes tying to read random data. On the other hand, the pid based read IO policy is inefficient because if there is a single process trying to read a large file, the overall disk bandwidth remains underutilized. So this patch introduces a read policy framework so that we could add more read policies, such as IO routing based on the device's wait-queue or manual when we have a read-preferred device or a policy based on the target storage caching. 
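To make the extensibility concrete, here is a rough, illustrative sketch of what plugging an additional policy into this framework would look like. It is not part of this patch: BTRFS_READ_POLICY_ROUND_ROBIN and the rr_counter field are hypothetical names invented only for the example, everything else mirrors the hunks below (the unknown-policy warning from the real code is omitted).

	/* Hypothetical extra policy, illustration only */
	enum btrfs_read_policy {
		BTRFS_READ_POLICY_PID,
		BTRFS_READ_POLICY_ROUND_ROBIN,	/* hypothetical, not in this patch */
		BTRFS_NR_READ_POLICY,
	};

	/* One more case in the find_live_mirror() dispatch switch */
	switch (fs_info->fs_devices->read_policy) {
	case BTRFS_READ_POLICY_ROUND_ROBIN:
		/* rr_counter would be a new atomic_t in struct btrfs_fs_devices */
		preferred_mirror = first +
			(atomic_inc_return(&fs_info->fs_devices->rr_counter) %
			 num_stripes);
		break;
	case BTRFS_READ_POLICY_PID:
	default:
		preferred_mirror = first + (current->pid % num_stripes);
		break;
	}
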
Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 +++++++++++++- fs/btrfs/volumes.h | 13 +++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 07c6b0c85339..ed4163b54f75 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1217,6 +1217,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->latest_bdev = latest_dev->bdev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; + fs_devices->read_policy = BTRFS_READ_POLICY_PID; return 0; } @@ -5479,7 +5480,18 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, else num_stripes = map->num_stripes; - preferred_mirror = first + current->pid % num_stripes; + switch (fs_info->fs_devices->read_policy) { + default: + /* Shouldn't happen, just warn and use pid instead of failing */ + btrfs_warn_rl(fs_info, + "unknown read_policy type %u, reset to pid", + fs_info->fs_devices->read_policy); + fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; + fallthrough; + case BTRFS_READ_POLICY_PID: + preferred_mirror = first + (current->pid % num_stripes); + break; + } if (dev_replace_is_ongoing && fs_info->dev_replace.cont_reading_from_srcdev_mode == diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 232f02bd214f..34fd440fd30b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -211,6 +211,16 @@ enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_REGULAR, }; +/* + * Read policies for mirrored block group profiles, read picks the stripe based + * on these policies. + */ +enum btrfs_read_policy { + /* Use process PID to choose the stripe */ + BTRFS_READ_POLICY_PID, + BTRFS_NR_READ_POLICY, +}; + struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ u8 metadata_uuid[BTRFS_FSID_SIZE]; @@ -264,6 +274,9 @@ struct btrfs_fs_devices { struct completion kobj_unregister; enum btrfs_chunk_allocation_policy chunk_alloc_policy; + + /* Policy used to read the mirrored stripes */ + enum btrfs_read_policy read_policy; }; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 -- cgit v1.2.3 From 3d8cc17a0561dc4c037ede4886d7975009075d6d Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 28 Oct 2020 21:14:47 +0800 Subject: btrfs: sysfs: add per-fs attribute for read policy Add /sys/fs/btrfs/UUID/read_policy attribute so that the read policy for the raid1, raid1c34 and raid10 can be tuned. When this attribute is read, it will show all available policies, with active policy in [ ]. The read_policy attribute can be written using one of the items listed in there. 
For example: $ cat /sys/fs/btrfs/UUID/read_policy [pid] $ echo pid > /sys/fs/btrfs/UUID/read_policy Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index b2f653eb46cb..620fddfd7096 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -907,6 +907,54 @@ static bool strmatch(const char *buffer, const char *string) return false; } +static const char * const btrfs_read_policy_name[] = { "pid" }; + +static ssize_t btrfs_read_policy_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + ssize_t ret = 0; + int i; + + for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { + if (fs_devices->read_policy == i) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]", + (ret == 0 ? "" : " "), + btrfs_read_policy_name[i]); + else + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + (ret == 0 ? "" : " "), + btrfs_read_policy_name[i]); + } + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + + return ret; +} + +static ssize_t btrfs_read_policy_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + int i; + + for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { + if (strmatch(buf, btrfs_read_policy_name[i])) { + if (i != fs_devices->read_policy) { + fs_devices->read_policy = i; + btrfs_info(fs_devices->fs_info, + "read policy set to '%s'", + btrfs_read_policy_name[i]); + } + return len; + } + } + + return -EINVAL; +} +BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -917,6 +965,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, checksum), BTRFS_ATTR_PTR(, exclusive_operation), BTRFS_ATTR_PTR(, generation), + BTRFS_ATTR_PTR(, read_policy), NULL, }; -- cgit v1.2.3 From 4e4cabece9f9c6b8dde8baf8f81e90222aa4989b Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 24 Sep 2020 11:39:12 -0500 Subject: btrfs: split btrfs_direct_IO to read and write The read and write DIO don't have anything in common except for the call to iomap_dio_rw. Extract the write call into a new function to get rid of conditional statements for direct write. 
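For orientation before the full diff, a condensed sketch of the resulting shape follows. Local declarations, the NOWAIT and alignment checks, the beyond-EOF relocking and the buffered fallback are elided, so this is only an outline of the two new entry points, not the complete code:

	static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
	{
		/* Reads only need the shared inode lock */
		inode_lock_shared(inode);
		ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
				   is_sync_kiocb(iocb));
		inode_unlock_shared(inode);
		return ret;
	}

	static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
	{
		/* Writes serialize against the fast fsync path via dio_sem */
		down_read(&BTRFS_I(inode)->dio_sem);
		/* Sync iocbs use the fsync-aware end_io in btrfs_sync_dops */
		written = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops,
				       current->journal_info ? &btrfs_sync_dops :
							       &btrfs_dio_ops,
				       is_sync_kiocb(iocb));
		up_read(&BTRFS_I(inode)->dio_sem);
		/* Whatever was not written directly falls back to buffered IO */
		...
	}
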
Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 5 ++- fs/btrfs/file.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++------ fs/btrfs/inode.c | 91 ++-------------------------------------------------- 3 files changed, 94 insertions(+), 99 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 39e4c35e965a..5244400d5c23 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "extent-io-tree.h" #include "extent_io.h" #include "extent_map.h" @@ -3065,7 +3066,9 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; -ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter); +extern const struct iomap_ops btrfs_dio_iomap_ops; +extern const struct iomap_dio_ops btrfs_dio_ops; +extern const struct iomap_dio_ops btrfs_sync_dops; /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5e2b0592abea..6c12f5703faa 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1782,21 +1782,67 @@ again: return num_written ? num_written : ret; } -static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) +static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, + const struct iov_iter *iter, loff_t offset) +{ + const u32 blocksize_mask = fs_info->sectorsize - 1; + + if (offset & blocksize_mask) + return -EINVAL; + + if (iov_iter_alignment(iter) & blocksize_mask) + return -EINVAL; + + return 0; +} + +static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - loff_t pos; - ssize_t written; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + bool relock = false; ssize_t written_buffered; loff_t endbyte; int err; - written = btrfs_direct_IO(iocb, from); + if (check_direct_IO(fs_info, from, pos)) + goto buffered; + + /* + * If the write DIO is beyond EOF, we need to update the isize, but it + * is protected by inode lock. So we cannot unlock it here. + */ + if (pos + iov_iter_count(from) <= inode->i_size) { + inode_unlock(inode); + relock = true; + } + down_read(&BTRFS_I(inode)->dio_sem); + + /* + * This is actually a sync iocb, so we need our fancy endio to know if + * we need to sync. 
+ */ + if (current->journal_info) + written = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, + &btrfs_sync_dops, is_sync_kiocb(iocb)); + else + written = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, + &btrfs_dio_ops, is_sync_kiocb(iocb)); + + if (written == -ENOTBLK) + written = 0; + + up_read(&BTRFS_I(inode)->dio_sem); + if (relock) + inode_lock(inode); if (written < 0 || !iov_iter_count(from)) return written; +buffered: pos = iocb->ki_pos; written_buffered = btrfs_buffered_write(iocb, from); if (written_buffered < 0) { @@ -1970,7 +2016,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, iocb->ki_flags &= ~IOCB_DSYNC; current->journal_info = BTRFS_DIO_SYNC_STUB; } - num_written = __btrfs_direct_write(iocb, from); + num_written = btrfs_direct_write(iocb, from); /* * As stated above, we cleared journal_info, so we need to do @@ -3545,16 +3591,47 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) return generic_file_open(inode, filp); } +static int check_direct_read(struct btrfs_fs_info *fs_info, + const struct iov_iter *iter, loff_t offset) +{ + int ret; + int i, seg; + + ret = check_direct_IO(fs_info, iter, offset); + if (ret < 0) + return ret; + + if (!iter_is_iovec(iter)) + return 0; + + for (seg = 0; seg < iter->nr_segs; seg++) + for (i = seg + 1; i < iter->nr_segs; i++) + if (iter->iov[seg].iov_base == iter->iov[i].iov_base) + return -EINVAL; + return 0; +} + +static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) + return 0; + + inode_lock_shared(inode); + ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + is_sync_kiocb(iocb)); + inode_unlock_shared(inode); + return ret; +} + static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret = 0; if (iocb->ki_flags & IOCB_DIRECT) { - struct inode *inode = file_inode(iocb->ki_filp); - - inode_lock_shared(inode); - ret = btrfs_direct_IO(iocb, to); - inode_unlock_shared(inode); + ret = btrfs_direct_read(iocb, to); if (ret < 0 || !iov_iter_count(to) || iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp))) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 21a354dad6f2..6fd561ac9866 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7954,39 +7954,6 @@ out_err: return BLK_QC_T_NONE; } -static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, - const struct iov_iter *iter, loff_t offset) -{ - int seg; - int i; - unsigned int blocksize_mask = fs_info->sectorsize - 1; - ssize_t retval = -EINVAL; - - if (offset & blocksize_mask) - goto out; - - if (iov_iter_alignment(iter) & blocksize_mask) - goto out; - - /* If this is a write we don't need to check anymore */ - if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter)) - return 0; - /* - * Check to make sure we don't have duplicate iov_base's in this - * iovec, if so return EINVAL, otherwise we'll get csum errors - * when reading back. 
- */ - for (seg = 0; seg < iter->nr_segs; seg++) { - for (i = seg + 1; i < iter->nr_segs; i++) { - if (iter->iov[seg].iov_base == iter->iov[i].iov_base) - goto out; - } - } - retval = 0; -out: - return retval; -} - static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned flags) { @@ -8011,72 +7978,20 @@ static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size, return 0; } -static const struct iomap_ops btrfs_dio_iomap_ops = { +const struct iomap_ops btrfs_dio_iomap_ops = { .iomap_begin = btrfs_dio_iomap_begin, .iomap_end = btrfs_dio_iomap_end, }; -static const struct iomap_dio_ops btrfs_dio_ops = { +const struct iomap_dio_ops btrfs_dio_ops = { .submit_io = btrfs_submit_direct, }; -static const struct iomap_dio_ops btrfs_sync_dops = { +const struct iomap_dio_ops btrfs_sync_dops = { .submit_io = btrfs_submit_direct, .end_io = btrfs_maybe_fsync_end_io, }; -ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_changeset *data_reserved = NULL; - loff_t offset = iocb->ki_pos; - size_t count = 0; - bool relock = false; - ssize_t ret; - - if (check_direct_IO(fs_info, iter, offset)) - return 0; - - count = iov_iter_count(iter); - if (iov_iter_rw(iter) == WRITE) { - /* - * If the write DIO is beyond the EOF, we need update - * the isize, but it is protected by i_mutex. So we can - * not unlock the i_mutex at this case. - */ - if (offset + count <= inode->i_size) { - inode_unlock(inode); - relock = true; - } - down_read(&BTRFS_I(inode)->dio_sem); - } - - /* - * We have are actually a sync iocb, so we need our fancy endio to know - * if we need to sync. 
- */ - if (current->journal_info) - ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, - &btrfs_sync_dops, is_sync_kiocb(iocb)); - else - ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, - &btrfs_dio_ops, is_sync_kiocb(iocb)); - - if (ret == -ENOTBLK) - ret = 0; - - if (iov_iter_rw(iter) == WRITE) - up_read(&BTRFS_I(inode)->dio_sem); - - if (relock) - inode_lock(inode); - - extent_changeset_free(data_reserved); - return ret; -} - static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { -- cgit v1.2.3 From 5e8b9ef30392bb80f418cf4340b8c9c54039d5dc Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 24 Sep 2020 11:39:13 -0500 Subject: btrfs: move pos increment and pagecache extension to btrfs_buffered_write While we do this, correct the call to pagecache_isize_extended: - pagecache_isize_extended needs to be called to the start of the write as opposed to i_size - we don't need to check range before the call, this is done in the function Reviewed-by: Christoph Hellwig Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6c12f5703faa..e80d7293b3d7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1581,6 +1581,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, int ret = 0; bool only_release_metadata = false; bool force_page_uptodate = false; + loff_t old_isize = i_size_read(inode); nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), PAGE_SIZE / (sizeof(struct page *))); @@ -1779,6 +1780,10 @@ again: } extent_changeset_free(data_reserved); + if (num_written > 0) { + pagecache_isize_extended(inode, old_isize, iocb->ki_pos); + iocb->ki_pos += num_written; + } return num_written ? num_written : ret; } @@ -1901,7 +1906,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, loff_t pos; size_t count; loff_t oldsize; - int clean_page = 0; if (!(iocb->ki_flags & IOCB_DIRECT) && (iocb->ki_flags & IOCB_NOWAIT)) @@ -1983,8 +1987,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, inode_unlock(inode); goto out; } - if (start_pos > round_up(oldsize, fs_info->sectorsize)) - clean_page = 1; } if (sync) @@ -2027,11 +2029,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, current->journal_info = NULL; } else { num_written = btrfs_buffered_write(iocb, from); - if (num_written > 0) - iocb->ki_pos = pos + num_written; - if (clean_page) - pagecache_isize_extended(inode, oldsize, - i_size_read(inode)); } inode_unlock(inode); -- cgit v1.2.3 From c86537a42f8661a26c96cbb32352b33cb57ac75c Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 24 Sep 2020 11:39:14 -0500 Subject: btrfs: check FS error state bit early during write fs_info::fs_state is a filesystem bit check as opposed to inode and can be performed before we begin with write checks. This eliminates inode lock/unlock in case the error bit is set. 
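Condensed from the hunks below, the resulting ordering in btrfs_file_write_iter() looks roughly like this (only the relevant lines shown); the point is that the error bit lives in fs_info, so no per-inode lock is needed to test it:

	static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
	{
		struct inode *inode = file_inode(iocb->ki_filp);
		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

		/* Filesystem-wide error state, checked before any locking */
		if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
			return -EROFS;

		/* ... NOWAIT handling, inode lock and per-inode write checks follow ... */
	}
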
Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e80d7293b3d7..f5d02b4cdc05 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1907,6 +1907,14 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, size_t count; loff_t oldsize; + /* + * If the fs flips readonly due to some impossible error, although we + * have opened a file as writable, we have to stop this write operation + * to ensure consistency. + */ + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + return -EROFS; + if (!(iocb->ki_flags & IOCB_DIRECT) && (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; @@ -1956,18 +1964,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, goto out; } - /* - * If BTRFS flips readonly due to some impossible error - * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), - * although we have opened a file as writable, we have - * to stop this write operation to ensure FS consistency. - */ - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { - inode_unlock(inode); - err = -EROFS; - goto out; - } - /* * We reserve space for updating the inode when we reserve space for the * extent we are going to write, so we will enospc out there. We don't -- cgit v1.2.3 From b8d8e1fd570a194904f545b135efc880d96a41a4 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 24 Sep 2020 11:39:15 -0500 Subject: btrfs: introduce btrfs_write_check() btrfs_write_check() checks write parameters in one place before beginning a write. This does away with inode_unlock() after every check. In the later patches, it will help push inode_lock/unlock() in buffered and direct write functions respectively. generic_write_checks needs to be called before as it could truncate iov_iter and its return used as count. Signed-off-by: Goldwyn Rodrigues Signed-off-by: David Sterba --- fs/btrfs/file.c | 151 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 77 insertions(+), 74 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f5d02b4cdc05..b30ce769ae8b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1564,6 +1564,79 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) btrfs_drew_write_unlock(&inode->root->snapshot_lock); } +static void update_time_for_write(struct inode *inode) +{ + struct timespec64 now; + + if (IS_NOCMTIME(inode)) + return; + + now = current_time(inode); + if (!timespec64_equal(&inode->i_mtime, &now)) + inode->i_mtime = now; + + if (!timespec64_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); +} + +static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, + size_t count) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + loff_t pos = iocb->ki_pos; + int ret; + loff_t oldsize; + loff_t start_pos; + + if (iocb->ki_flags & IOCB_NOWAIT) { + size_t nocow_bytes = count; + + /* We will allocate space in case nodatacow is not set, so bail */ + if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0) + return -EAGAIN; + /* + * There are holes in the range or parts of the range that must + * be COWed (shared extents, RO block groups, etc), so just bail + * out. 
+ */ + if (nocow_bytes < count) + return -EAGAIN; + } + + current->backing_dev_info = inode_to_bdi(inode); + ret = file_remove_privs(file); + if (ret) + return ret; + + /* + * We reserve space for updating the inode when we reserve space for the + * extent we are going to write, so we will enospc out there. We don't + * need to start yet another transaction to update the inode as we will + * update the inode when we finish writing whatever data we write. + */ + update_time_for_write(inode); + + start_pos = round_down(pos, fs_info->sectorsize); + oldsize = i_size_read(inode); + if (start_pos > oldsize) { + /* Expand hole size to cover write data, preventing empty gap */ + loff_t end_pos = round_up(pos + count, fs_info->sectorsize); + + ret = btrfs_cont_expand(inode, oldsize, end_pos); + if (ret) { + current->backing_dev_info = NULL; + return ret; + } + } + + return 0; +} + static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) { @@ -1873,24 +1946,6 @@ out: return written ? written : err; } -static void update_time_for_write(struct inode *inode) -{ - struct timespec64 now; - - if (IS_NOCMTIME(inode)) - return; - - now = current_time(inode); - if (!timespec64_equal(&inode->i_mtime, &now)) - inode->i_mtime = now; - - if (!timespec64_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; - - if (IS_I_VERSION(inode)) - inode_inc_iversion(inode); -} - static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -1898,14 +1953,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - u64 start_pos; - u64 end_pos; ssize_t num_written = 0; const bool sync = iocb->ki_flags & IOCB_DSYNC; ssize_t err; - loff_t pos; - size_t count; - loff_t oldsize; /* * If the fs flips readonly due to some impossible error, although we @@ -1932,57 +1982,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, return err; } - pos = iocb->ki_pos; - count = iov_iter_count(from); - if (iocb->ki_flags & IOCB_NOWAIT) { - size_t nocow_bytes = count; - - /* - * We will allocate space in case nodatacow is not set, - * so bail - */ - if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) - <= 0) { - inode_unlock(inode); - return -EAGAIN; - } - /* - * There are holes in the range or parts of the range that must - * be COWed (shared extents, RO block groups, etc), so just bail - * out. - */ - if (nocow_bytes < count) { - inode_unlock(inode); - return -EAGAIN; - } - } - - current->backing_dev_info = inode_to_bdi(inode); - err = file_remove_privs(file); - if (err) { + err = btrfs_write_check(iocb, from, err); + if (err < 0) { inode_unlock(inode); - goto out; - } - - /* - * We reserve space for updating the inode when we reserve space for the - * extent we are going to write, so we will enospc out there. We don't - * need to start yet another transaction to update the inode as we will - * update the inode when we finish writing whatever data we write. 
- */ - update_time_for_write(inode); - - start_pos = round_down(pos, fs_info->sectorsize); - oldsize = i_size_read(inode); - if (start_pos > oldsize) { - /* Expand hole size to cover write data, preventing empty gap */ - end_pos = round_up(pos + count, - fs_info->sectorsize); - err = btrfs_cont_expand(inode, oldsize, end_pos); - if (err) { - inode_unlock(inode); - goto out; - } + return err; } if (sync) @@ -2042,7 +2045,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); -out: + current->backing_dev_info = NULL; return num_written ? num_written : err; } -- cgit v1.2.3 From a14b78ad06aba0fa7e76d2bc13c5ba581a7f331a Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 24 Sep 2020 11:39:16 -0500 Subject: btrfs: introduce btrfs_inode_lock()/unlock() btrfs_inode_lock/unlock() are wrappers around inode locks, separating the type of lock and actual locking. - 0 - default, exclusive lock - BTRFS_ILOCK_SHARED - for shared locks, for possible parallel DIO - BTRFS_ILOCK_TRY - for the RWF_NOWAIT sequence The bits SHARED and TRY can be combined together. Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 7 +++++++ fs/btrfs/file.c | 33 +++++++++++++++++---------------- fs/btrfs/inode.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5244400d5c23..6ee7252f51c0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3070,6 +3070,13 @@ extern const struct iomap_ops btrfs_dio_iomap_ops; extern const struct iomap_dio_ops btrfs_dio_ops; extern const struct iomap_dio_ops btrfs_sync_dops; +/* Inode locking type flags, by default the exclusive lock is taken */ +#define BTRFS_ILOCK_SHARED (1U << 0) +#define BTRFS_ILOCK_TRY (1U << 1) + +int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); +void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); + /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b30ce769ae8b..fe1daa44f325 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1894,7 +1894,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) * is protected by inode lock. So we cannot unlock it here. 
*/ if (pos + iov_iter_count(from) <= inode->i_size) { - inode_unlock(inode); + btrfs_inode_unlock(inode, 0); relock = true; } down_read(&BTRFS_I(inode)->dio_sem); @@ -1915,7 +1915,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) up_read(&BTRFS_I(inode)->dio_sem); if (relock) - inode_lock(inode); + btrfs_inode_lock(inode, 0); if (written < 0 || !iov_iter_count(from)) return written; @@ -1956,6 +1956,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, ssize_t num_written = 0; const bool sync = iocb->ki_flags & IOCB_DSYNC; ssize_t err; + unsigned int ilock_flags = 0; /* * If the fs flips readonly due to some impossible error, although we @@ -1969,22 +1970,22 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; - } else { - inode_lock(inode); - } + if (iocb->ki_flags & IOCB_NOWAIT) + ilock_flags |= BTRFS_ILOCK_TRY; + + err = btrfs_inode_lock(inode, ilock_flags); + if (err < 0) + return err; err = generic_write_checks(iocb, from); if (err <= 0) { - inode_unlock(inode); + btrfs_inode_unlock(inode, ilock_flags); return err; } err = btrfs_write_check(iocb, from, err); if (err < 0) { - inode_unlock(inode); + btrfs_inode_unlock(inode, ilock_flags); return err; } @@ -2030,7 +2031,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, num_written = btrfs_buffered_write(iocb, from); } - inode_unlock(inode); + btrfs_inode_unlock(inode, ilock_flags); /* * We also have to set last_sub_trans to the current log transid, @@ -3330,7 +3331,7 @@ static long btrfs_fallocate(struct file *file, int mode, return ret; } - inode_lock(inode); + btrfs_inode_lock(inode, 0); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { ret = inode_newsize_ok(inode, offset + len); @@ -3569,9 +3570,9 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) return generic_file_llseek(file, offset, whence); case SEEK_DATA: case SEEK_HOLE: - inode_lock_shared(inode); + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); offset = find_desired_extent(inode, offset, whence); - inode_unlock_shared(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); break; } @@ -3615,10 +3616,10 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) return 0; - inode_lock_shared(inode); + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, is_sync_kiocb(iocb)); - inode_unlock_shared(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6fd561ac9866..3eea0d0be8a0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -95,6 +95,51 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate); +/* + * btrfs_inode_lock - lock inode i_rwsem based on arguments passed + * + * ilock_flags can have the following bit set: + * + * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode + * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt + * return -EAGAIN + */ +int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) +{ + if (ilock_flags & BTRFS_ILOCK_SHARED) { + if (ilock_flags & BTRFS_ILOCK_TRY) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + else + return 0; + } + inode_lock_shared(inode); + } else { + if 
(ilock_flags & BTRFS_ILOCK_TRY) { + if (!inode_trylock(inode)) + return -EAGAIN; + else + return 0; + } + inode_lock(inode); + } + return 0; +} + +/* + * btrfs_inode_unlock - unock inode i_rwsem + * + * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() + * to decide whether the lock acquired is shared or exclusive. + */ +void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags) +{ + if (ilock_flags & BTRFS_ILOCK_SHARED) + inode_unlock_shared(inode); + else + inode_unlock(inode); +} + /* * Cleanup all submitted ordered extents in specified range to handle errors * from the btrfs_run_delalloc_range() callback. -- cgit v1.2.3 From c352370633400d13765cc88080c969799ea51108 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 24 Sep 2020 11:39:17 -0500 Subject: btrfs: push inode locking and unlocking into buffered/direct write Push inode locking and unlocking closer to where we perform the I/O. For this we need to move the write checks inside the respective functions as well. pos is evaluated after generic_write_checks because O_APPEND can change iocb->ki_pos. Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 90 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 57 insertions(+), 33 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fe1daa44f325..60cdad1b4952 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1641,7 +1641,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) { struct file *file = iocb->ki_filp; - loff_t pos = iocb->ki_pos; + loff_t pos; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct page **pages = NULL; @@ -1651,18 +1651,37 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, u64 lockend; size_t num_written = 0; int nrptrs; - int ret = 0; + ssize_t ret; bool only_release_metadata = false; bool force_page_uptodate = false; loff_t old_isize = i_size_read(inode); + unsigned int ilock_flags = 0; + + if (iocb->ki_flags & IOCB_NOWAIT) + ilock_flags |= BTRFS_ILOCK_TRY; + + ret = btrfs_inode_lock(inode, ilock_flags); + if (ret < 0) + return ret; + ret = generic_write_checks(iocb, i); + if (ret <= 0) + goto out; + + ret = btrfs_write_check(iocb, i, ret); + if (ret < 0) + goto out; + + pos = iocb->ki_pos; nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), PAGE_SIZE / (sizeof(struct page *))); nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); nrptrs = max(nrptrs, 8); pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); - if (!pages) - return -ENOMEM; + if (!pages) { + ret = -ENOMEM; + goto out; + } while (iov_iter_count(i) > 0) { struct extent_state *cached_state = NULL; @@ -1857,6 +1876,8 @@ again: pagecache_isize_extended(inode, old_isize, iocb->ki_pos); iocb->ki_pos += num_written; } +out: + btrfs_inode_unlock(inode, ilock_flags); return num_written ? 
num_written : ret; } @@ -1879,15 +1900,39 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - loff_t pos = iocb->ki_pos; + loff_t pos; ssize_t written = 0; bool relock = false; ssize_t written_buffered; loff_t endbyte; - int err; + ssize_t err; + unsigned int ilock_flags = 0; + + if (iocb->ki_flags & IOCB_NOWAIT) + ilock_flags |= BTRFS_ILOCK_TRY; - if (check_direct_IO(fs_info, from, pos)) + err = btrfs_inode_lock(inode, ilock_flags); + if (err < 0) + return err; + + err = generic_write_checks(iocb, from); + if (err <= 0) { + btrfs_inode_unlock(inode, ilock_flags); + return err; + } + + err = btrfs_write_check(iocb, from, err); + if (err < 0) { + btrfs_inode_unlock(inode, ilock_flags); + goto out; + } + + pos = iocb->ki_pos; + + if (check_direct_IO(fs_info, from, pos)) { + btrfs_inode_unlock(inode, ilock_flags); goto buffered; + } /* * If the write DIO is beyond EOF, we need to update the isize, but it @@ -1917,8 +1962,10 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) if (relock) btrfs_inode_lock(inode, 0); - if (written < 0 || !iov_iter_count(from)) - return written; + if (written < 0 || !iov_iter_count(from)) { + err = written; + goto out; + } buffered: pos = iocb->ki_pos; @@ -1955,8 +2002,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct btrfs_root *root = BTRFS_I(inode)->root; ssize_t num_written = 0; const bool sync = iocb->ki_flags & IOCB_DSYNC; - ssize_t err; - unsigned int ilock_flags = 0; /* * If the fs flips readonly due to some impossible error, although we @@ -1970,25 +2015,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; - if (iocb->ki_flags & IOCB_NOWAIT) - ilock_flags |= BTRFS_ILOCK_TRY; - - err = btrfs_inode_lock(inode, ilock_flags); - if (err < 0) - return err; - - err = generic_write_checks(iocb, from); - if (err <= 0) { - btrfs_inode_unlock(inode, ilock_flags); - return err; - } - - err = btrfs_write_check(iocb, from, err); - if (err < 0) { - btrfs_inode_unlock(inode, ilock_flags); - return err; - } - if (sync) atomic_inc(&BTRFS_I(inode)->sync_writers); @@ -2031,8 +2057,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, num_written = btrfs_buffered_write(iocb, from); } - btrfs_inode_unlock(inode, ilock_flags); - /* * We also have to set last_sub_trans to the current log transid, * otherwise subsequent syncs to a file that's been synced in this @@ -2048,7 +2072,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, atomic_dec(&BTRFS_I(inode)->sync_writers); current->backing_dev_info = NULL; - return num_written ? num_written : err; + return num_written; } int btrfs_release_file(struct inode *inode, struct file *filp) -- cgit v1.2.3 From e9adabb9712ef9424cbbeeaa027d962ab5262e19 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 24 Sep 2020 11:39:18 -0500 Subject: btrfs: use shared lock for direct writes within EOF Direct writes within EOF are safe to be performed with inode shared lock to improve parallelization with other direct writes or reads because EOF is not changed and there is no race with truncate(). Direct reads are already performed under shared inode lock. This patch is precursor to removing btrfs_inode->dio_sem. 
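Condensed from the diff below, the resulting lock selection in btrfs_direct_write() is roughly the following (declarations and error handling elided). The lock mode is chosen optimistically before locking and re-validated once the lock is held, because both i_size and iocb->ki_pos (O_APPEND) may change in between:

	unsigned int ilock_flags = 0;

	/* Optimistic: a write that stays within i_size can take the shared lock */
	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
		ilock_flags |= BTRFS_ILOCK_SHARED;
relock:
	btrfs_inode_lock(inode, ilock_flags);
	generic_write_checks(iocb, from);	/* may move ki_pos for O_APPEND */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
	    iocb->ki_pos + iov_iter_count(from) > i_size_read(inode)) {
		/* The size or position changed under us, retry exclusively */
		btrfs_inode_unlock(inode, ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}
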
Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 60cdad1b4952..dc54b2b38d14 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1902,7 +1902,6 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); loff_t pos; ssize_t written = 0; - bool relock = false; ssize_t written_buffered; loff_t endbyte; ssize_t err; @@ -1911,6 +1910,11 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; + /* If the write DIO is within EOF, use a shared lock */ + if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode)) + ilock_flags |= BTRFS_ILOCK_SHARED; + +relock: err = btrfs_inode_lock(inode, ilock_flags); if (err < 0) return err; @@ -1928,20 +1932,22 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) } pos = iocb->ki_pos; + /* + * Re-check since file size may have changed just before taking the + * lock or pos may have changed because of O_APPEND in generic_write_check() + */ + if ((ilock_flags & BTRFS_ILOCK_SHARED) && + pos + iov_iter_count(from) > i_size_read(inode)) { + btrfs_inode_unlock(inode, ilock_flags); + ilock_flags &= ~BTRFS_ILOCK_SHARED; + goto relock; + } if (check_direct_IO(fs_info, from, pos)) { btrfs_inode_unlock(inode, ilock_flags); goto buffered; } - /* - * If the write DIO is beyond EOF, we need to update the isize, but it - * is protected by inode lock. So we cannot unlock it here. - */ - if (pos + iov_iter_count(from) <= inode->i_size) { - btrfs_inode_unlock(inode, 0); - relock = true; - } down_read(&BTRFS_I(inode)->dio_sem); /* @@ -1959,8 +1965,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) written = 0; up_read(&BTRFS_I(inode)->dio_sem); - if (relock) - btrfs_inode_lock(inode, 0); + btrfs_inode_unlock(inode, ilock_flags); if (written < 0 || !iov_iter_count(from)) { err = written; -- cgit v1.2.3 From 502756b380938022c848761837f8fa3976906aa1 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 24 Sep 2020 11:39:19 -0500 Subject: btrfs: remove btrfs_inode::dio_sem The inode dio_sem can be eliminated because all DIO synchronization is now performed through inode->i_rwsem that provides the same guarantees. This reduces btrfs_inode size by 40 bytes. Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 10 ---------- fs/btrfs/file.c | 13 ------------- fs/btrfs/inode.c | 1 - 3 files changed, 24 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 92dd86bceae3..16c50a438f70 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -203,16 +203,6 @@ struct btrfs_inode { /* Hook into fs_info->delayed_iputs */ struct list_head delayed_iput; - /* - * To avoid races between lockless (i_mutex not held) direct IO writes - * and concurrent fsync requests. Direct IO writes must acquire read - * access on this semaphore for creating an extent map and its - * corresponding ordered extent. The fast fsync path must acquire write - * access on this semaphore before it collects ordered extents and - * extent maps. 
- */ - struct rw_semaphore dio_sem; - struct inode vfs_inode; }; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index dc54b2b38d14..93191ea5d1fd 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1948,8 +1948,6 @@ relock: goto buffered; } - down_read(&BTRFS_I(inode)->dio_sem); - /* * This is actually a sync iocb, so we need our fancy endio to know if * we need to sync. @@ -1964,7 +1962,6 @@ relock: if (written == -ENOTBLK) written = 0; - up_read(&BTRFS_I(inode)->dio_sem); btrfs_inode_unlock(inode, ilock_flags); if (written < 0 || !iov_iter_count(from)) { @@ -2172,13 +2169,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) inode_lock(inode); - /* - * We take the dio_sem here because the tree log stuff can race with - * lockless dio writes and get an extent map logged for an extent we - * never waited on. We need it this high up for lockdep reasons. - */ - down_write(&BTRFS_I(inode)->dio_sem); - atomic_inc(&root->log_batch); /* @@ -2209,7 +2199,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { - up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } @@ -2306,7 +2295,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); if (ret != BTRFS_NO_LOG_SYNC) { @@ -2337,7 +2325,6 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); - up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3eea0d0be8a0..c4e00027f0f8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8650,7 +8650,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); - init_rwsem(&ei->dio_sem); return inode; } -- cgit v1.2.3 From a42fa643169d2325602572633fcaa16862990e28 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 24 Sep 2020 11:39:20 -0500 Subject: btrfs: call iomap_dio_complete() without inode_lock If direct writes are called with O_DIRECT | O_DSYNC, it will result in a deadlock because iomap_dio_rw() is called under i_rwsem which calls: iomap_dio_complete() generic_write_sync() btrfs_sync_file() btrfs_sync_file() requires i_rwsem, so call __iomap_dio_rw() with the i_rwsem locked, and call iomap_dio_complete() after unlocking i_rwsem. Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Signed-off-by: David Sterba --- fs/btrfs/file.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 93191ea5d1fd..40b7886c23f5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1906,6 +1906,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; ssize_t err; unsigned int ilock_flags = 0; + struct iomap_dio *dio = NULL; if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; @@ -1948,22 +1949,19 @@ relock: goto buffered; } - /* - * This is actually a sync iocb, so we need our fancy endio to know if - * we need to sync. 
- */ - if (current->journal_info) - written = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, - &btrfs_sync_dops, is_sync_kiocb(iocb)); - else - written = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, - &btrfs_dio_ops, is_sync_kiocb(iocb)); - - if (written == -ENOTBLK) - written = 0; + dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, + &btrfs_dio_ops, is_sync_kiocb(iocb)); btrfs_inode_unlock(inode, ilock_flags); + if (IS_ERR_OR_NULL(dio)) { + err = PTR_ERR_OR_ZERO(dio); + if (err < 0 && err != -ENOTBLK) + goto out; + } else { + written = iomap_dio_complete(dio); + } + if (written < 0 || !iov_iter_count(from)) { err = written; goto out; -- cgit v1.2.3 From ecfdc08b8cc65d737eebc26a1ee1875a097fd6a0 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 24 Sep 2020 11:39:21 -0500 Subject: btrfs: remove dio iomap DSYNC workaround This effectively reverts 09745ff88d93 ("btrfs: dio iomap DSYNC workaround") now that the iomap API has been updated to allow iomap_dio_complete() not to be called under i_rwsem anymore. Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/file.c | 38 ++------------------------------------ fs/btrfs/inode.c | 50 -------------------------------------------------- fs/btrfs/transaction.h | 1 - 4 files changed, 2 insertions(+), 88 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6ee7252f51c0..d9188ecf863d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3068,7 +3068,6 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, extern const struct dentry_operations btrfs_dentry_operations; extern const struct iomap_ops btrfs_dio_iomap_ops; extern const struct iomap_dio_ops btrfs_dio_ops; -extern const struct iomap_dio_ops btrfs_sync_dops; /* Inode locking type flags, by default the exclusive lock is taken */ #define BTRFS_ILOCK_SHARED (1U << 0) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 40b7886c23f5..5b93f0bb8654 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2018,44 +2018,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, if (sync) atomic_inc(&BTRFS_I(inode)->sync_writers); - if (iocb->ki_flags & IOCB_DIRECT) { - /* - * 1. We must always clear IOCB_DSYNC in order to not deadlock - * in iomap, as it calls generic_write_sync() in this case. - * 2. If we are async, we can call iomap_dio_complete() either - * in - * - * 2.1. A worker thread from the last bio completed. In this - * case we need to mark the btrfs_dio_data that it is - * async in order to call generic_write_sync() properly. - * This is handled by setting BTRFS_DIO_SYNC_STUB in the - * current->journal_info. - * 2.2 The submitter context, because all IO completed - * before we exited iomap_dio_rw(). In this case we can - * just re-set the IOCB_DSYNC on the iocb and we'll do - * the sync below. If our ->end_io() gets called and - * current->journal_info is set, then we know we're in - * our current context and we will clear - * current->journal_info to indicate that we need to - * sync below. - */ - if (sync) { - ASSERT(current->journal_info == NULL); - iocb->ki_flags &= ~IOCB_DSYNC; - current->journal_info = BTRFS_DIO_SYNC_STUB; - } + if (iocb->ki_flags & IOCB_DIRECT) num_written = btrfs_direct_write(iocb, from); - - /* - * As stated above, we cleared journal_info, so we need to do - * the sync ourselves. 
- */ - if (sync && current->journal_info == NULL) - iocb->ki_flags |= IOCB_DSYNC; - current->journal_info = NULL; - } else { + else num_written = btrfs_buffered_write(iocb, from); - } /* * We also have to set last_sub_trans to the current log transid, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c4e00027f0f8..9c0bf692a362 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -62,7 +62,6 @@ struct btrfs_dio_data { loff_t length; ssize_t submitted; struct extent_changeset *data_reserved; - bool sync; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -7431,17 +7430,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, int ret = 0; u64 len = length; bool unlock_extents = false; - bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB); - - /* - * We used current->journal_info here to see if we were sync, but - * there's a lot of tests in the enospc machinery to not do flushing if - * we have a journal_info set, so we need to clear this out and re-set - * it in iomap_end. - */ - ASSERT(current->journal_info == NULL || - current->journal_info == BTRFS_DIO_SYNC_STUB); - current->journal_info = NULL; if (!write) len = min_t(u64, len, fs_info->sectorsize); @@ -7467,7 +7455,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (!dio_data) return -ENOMEM; - dio_data->sync = sync; dio_data->length = length; if (write) { dio_data->reserve = round_up(length, fs_info->sectorsize); @@ -7615,14 +7602,6 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, extent_changeset_free(dio_data->data_reserved); } out: - /* - * We're all done, we can re-set the current->journal_info now safely - * for our endio. - */ - if (dio_data->sync) { - ASSERT(current->journal_info == NULL); - current->journal_info = BTRFS_DIO_SYNC_STUB; - } kfree(dio_data); iomap->private = NULL; @@ -7999,30 +7978,6 @@ out_err: return BLK_QC_T_NONE; } -static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size, - int error, unsigned flags) -{ - /* - * Now if we're still in the context of our submitter we know we can't - * safely run generic_write_sync(), so clear our flag here so that the - * caller knows to follow up with a sync. 
- */ - if (current->journal_info == BTRFS_DIO_SYNC_STUB) { - current->journal_info = NULL; - return error; - } - - if (error) - return error; - - if (size) { - iocb->ki_flags |= IOCB_DSYNC; - return generic_write_sync(iocb, size); - } - - return 0; -} - const struct iomap_ops btrfs_dio_iomap_ops = { .iomap_begin = btrfs_dio_iomap_begin, .iomap_end = btrfs_dio_iomap_end, @@ -8032,11 +7987,6 @@ const struct iomap_dio_ops btrfs_dio_ops = { .submit_io = btrfs_submit_direct, }; -const struct iomap_dio_ops btrfs_sync_dops = { - .submit_io = btrfs_submit_direct, - .end_io = btrfs_maybe_fsync_end_io, -}; - static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 858d9153a1cd..8241c050ba71 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -112,7 +112,6 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) #define BTRFS_SEND_TRANS_STUB ((void *)1) -#define BTRFS_DIO_SYNC_STUB ((void *)2) struct btrfs_trans_handle { u64 transid; -- cgit v1.2.3 From 387824afd7210376a577b3c3e2f74618e3ef43a4 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Oct 2020 15:54:08 +0100 Subject: btrfs: use the right number of levels for lockdep keysets BTRFS_MAX_LEVEL is 8 and the keyset table is supposed to have a key for each level, but we'll never have more than 8 levels. The values passed to btrfs_set_buffer_lockdep_class are always derived from a valid extent buffer. Set the array sizes to the right value. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 646dde5883a6..c524be53279e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -153,8 +153,8 @@ struct async_submit_bio { static struct btrfs_lockdep_keyset { u64 id; /* root objectid */ const char *name_stem; /* lock name stem */ - char names[BTRFS_MAX_LEVEL + 1][20]; - struct lock_class_key keys[BTRFS_MAX_LEVEL + 1]; + char names[BTRFS_MAX_LEVEL][20]; + struct lock_class_key keys[BTRFS_MAX_LEVEL]; } btrfs_lockdep_keysets[] = { { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" }, { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" }, -- cgit v1.2.3 From ab1405aa2522ae61958fc874432bce24d48942b1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 29 Sep 2020 14:56:39 +0200 Subject: btrfs: generate lockdep keyset names at compile time The names in btrfs_lockdep_keysets are generated from a simple pattern using snprintf but we can generate them directly with some macro magic and remove the helpers. 
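For illustration, the generated names rely only on compile-time string literal concatenation and the preprocessor stringizing operator, so each table entry is filled in by the initializer macros; a sketch of one expansion (levels 2-7 omitted):

        #define DEFINE_LEVEL(stem, level) \
                .names[level] = "btrfs-" stem "-0" #level,

        /* DEFINE_NAME("root") then expands roughly to: */
        .names[0] = "btrfs-root-00",
        .names[1] = "btrfs-root-01",
        /* ... */
        .names[7] = "btrfs-root-07",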
Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 56 ++++++++++++++++++++++++++++-------------------------- fs/btrfs/disk-io.h | 3 --- fs/btrfs/super.c | 2 -- 3 files changed, 29 insertions(+), 32 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c524be53279e..142bd84d028a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -150,40 +150,42 @@ struct async_submit_bio { # error # endif +#define DEFINE_LEVEL(stem, level) \ + .names[level] = "btrfs-" stem "-0" #level, + +#define DEFINE_NAME(stem) \ + DEFINE_LEVEL(stem, 0) \ + DEFINE_LEVEL(stem, 1) \ + DEFINE_LEVEL(stem, 2) \ + DEFINE_LEVEL(stem, 3) \ + DEFINE_LEVEL(stem, 4) \ + DEFINE_LEVEL(stem, 5) \ + DEFINE_LEVEL(stem, 6) \ + DEFINE_LEVEL(stem, 7) + static struct btrfs_lockdep_keyset { u64 id; /* root objectid */ - const char *name_stem; /* lock name stem */ + /* Longest entry: btrfs-free-space-00 */ char names[BTRFS_MAX_LEVEL][20]; struct lock_class_key keys[BTRFS_MAX_LEVEL]; } btrfs_lockdep_keysets[] = { - { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" }, - { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" }, - { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" }, - { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" }, - { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" }, - { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" }, - { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" }, - { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, - { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, - { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, - { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" }, - { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" }, - { .id = 0, .name_stem = "tree" }, + { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") }, + { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") }, + { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") }, + { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") }, + { .id = BTRFS_FS_TREE_OBJECTID, DEFINE_NAME("fs") }, + { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") }, + { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") }, + { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") }, + { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") }, + { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") }, + { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, + { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, + { .id = 0, DEFINE_NAME("tree") }, }; -void __init btrfs_init_lockdep(void) -{ - int i, j; - - /* initialize lockdep class names */ - for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) { - struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i]; - - for (j = 0; j < ARRAY_SIZE(ks->names); j++) - snprintf(ks->names[j], sizeof(ks->names[j]), - "btrfs-%s-%02d", ks->name_stem, j); - } -} +#undef DEFINE_LEVEL +#undef DEFINE_NAME void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 182540bdcea0..d47894260cfa 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -135,12 +135,9 @@ int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC -void btrfs_init_lockdep(void); void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level); #else -static inline void btrfs_init_lockdep(void) -{ } static inline void 
btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1ffa50bae1dd..9f51b0a22b14 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2572,8 +2572,6 @@ static int __init init_btrfs_fs(void) if (err) goto free_end_io_wq; - btrfs_init_lockdep(); - btrfs_print_mod_info(); err = btrfs_run_sanity_tests(); -- cgit v1.2.3 From 09e3a28892a9598260fc44dc9be6b8cc3f56ded1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 15 Sep 2020 14:30:15 +0200 Subject: btrfs: send: use helpers to access root_item::ctransid We have helpers to access the on-disk item members, use that for root_item::ctransid instead of raw le64_to_cpu. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/send.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 340c76a12ce1..d719a2755a40 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2410,7 +2410,7 @@ static int send_subvol_begin(struct send_ctx *sctx) sctx->send_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, - le64_to_cpu(sctx->send_root->root_item.ctransid)); + btrfs_root_ctransid(&sctx->send_root->root_item)); if (parent_root) { if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, @@ -2419,7 +2419,7 @@ static int send_subvol_begin(struct send_ctx *sctx) TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, parent_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - le64_to_cpu(sctx->parent_root->root_item.ctransid)); + btrfs_root_ctransid(&sctx->parent_root->root_item)); } ret = send_cmd(sctx); @@ -5101,7 +5101,7 @@ static int send_clone(struct send_ctx *sctx, TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - le64_to_cpu(clone_root->root->root_item.ctransid)); + btrfs_root_ctransid(&clone_root->root->root_item)); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, clone_root->offset); -- cgit v1.2.3 From 3b5418fba372c28511ebe9f83b5e26e8ab34b9f9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 15 Sep 2020 14:26:27 +0200 Subject: btrfs: check-integrity: use proper helper to access btrfs_header There's one raw use of le->cpu conversion but we have a helper to do that for us, so use it. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 81a8c87a5afb..106874c959a0 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -954,7 +954,7 @@ static noinline_for_stack int btrfsic_process_metablock( sf->prev = NULL; continue_with_new_stack_frame: - sf->block->generation = le64_to_cpu(sf->hdr->generation); + sf->block->generation = btrfs_stack_header_generation(sf->hdr); if (0 == sf->hdr->level) { struct btrfs_leaf *const leafhdr = (struct btrfs_leaf *)sf->hdr; -- cgit v1.2.3 From f944d2cb209674411280737d5fb660e98b8a8314 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 15 Sep 2020 21:00:04 +0200 Subject: btrfs: use root_item helpers for limit and flags in btrfs_create_tree For consistency use the available helpers to set flags and limit. 
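For context, the helpers come from the BTRFS_SETGET_STACK_FUNCS() generator; a rough sketch of what they boil down to for the flags field (simplified, the real macro also handles unaligned access):

        static inline u64 btrfs_root_flags(const struct btrfs_root_item *s)
        {
                return le64_to_cpu(s->flags);
        }

        static inline void btrfs_set_root_flags(struct btrfs_root_item *s, u64 val)
        {
                s->flags = cpu_to_le64(val);
        }

Keeping the endianness conversion inside the generated accessors, rather than open coding assignments to on-disk fields, is the consistency this change is after.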
Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 142bd84d028a..cf01f97a54cb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1166,8 +1166,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, root->commit_root = btrfs_root_node(root); set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - root->root_item.flags = 0; - root->root_item.byte_limit = 0; + btrfs_set_root_flags(&root->root_item, 0); + btrfs_set_root_limit(&root->root_item, 0); btrfs_set_root_bytenr(&root->root_item, leaf->start); btrfs_set_root_generation(&root->root_item, trans->transid); btrfs_set_root_level(&root->root_item, 0); -- cgit v1.2.3 From c842268458d904c04f08259cc7ecee3870fa2746 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 15 Sep 2020 21:44:52 +0200 Subject: btrfs: add set/get accessors for root_item::drop_level The drop_level member is used directly unlike all the other int types in root_item. Add the definition and use it everywhere. The type is u8 so there's no conversion necessary and the helpers are properly inlined, this is for consistency. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 6 +++--- fs/btrfs/inode.c | 2 +- fs/btrfs/relocation.c | 10 +++++----- fs/btrfs/tree-checker.c | 4 ++-- 6 files changed, 13 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d9188ecf863d..b1b6d9a69b99 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2089,6 +2089,7 @@ BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index cf01f97a54cb..bd01e2cc165b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1179,7 +1179,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, generate_random_guid(root->root_item.uuid); else export_guid(root->root_item.uuid, &guid_null); - root->root_item.drop_level = 0; + btrfs_set_root_drop_level(&root->root_item, 0); key.objectid = objectid; key.type = BTRFS_ROOT_ITEM_KEY; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5fd60b13f4f8..bac1eb17e498 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5496,7 +5496,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) memcpy(&wc->update_progress, &key, sizeof(wc->update_progress)); - level = root_item->drop_level; + level = btrfs_root_drop_level(root_item); BUG_ON(level == 0); path->lowest_level = level; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -5529,7 +5529,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) } BUG_ON(wc->refs[level] == 0); - if (level == root_item->drop_level) + if (level == btrfs_root_drop_level(root_item)) break; btrfs_tree_unlock(path->nodes[level]); @@ -5574,7 +5574,7 @@ int btrfs_drop_snapshot(struct btrfs_root 
*root, int update_ref, int for_reloc) } btrfs_cpu_key_to_disk(&root_item->drop_progress, &wc->drop_progress); - root_item->drop_level = wc->drop_level; + btrfs_set_root_drop_level(root_item, wc->drop_level); BUG_ON(wc->level == 0); if (btrfs_should_end_transaction(trans) || diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9c0bf692a362..2c97169b6e26 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4105,7 +4105,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) memset(&dest->root_item.drop_progress, 0, sizeof(dest->root_item.drop_progress)); - dest->root_item.drop_level = 0; + btrfs_set_root_drop_level(&dest->root_item, 0); btrfs_set_root_refs(&dest->root_item, 0); if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9ba92d86da0b..3d4618a01ef3 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -783,7 +783,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, btrfs_set_root_refs(root_item, 0); memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); - root_item->drop_level = 0; + btrfs_set_root_drop_level(root_item, 0); } btrfs_tree_unlock(eb); @@ -1575,7 +1575,7 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans, reloc_root_item = &reloc_root->root_item; memset(&reloc_root_item->drop_progress, 0, sizeof(reloc_root_item->drop_progress)); - reloc_root_item->drop_level = 0; + btrfs_set_root_drop_level(reloc_root_item, 0); btrfs_set_root_refs(reloc_root_item, 0); btrfs_update_reloc_root(trans, root); @@ -1672,7 +1672,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, } else { btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); - level = root_item->drop_level; + level = btrfs_root_drop_level(root_item); BUG_ON(level == 0); path->lowest_level = level; ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); @@ -1769,7 +1769,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, */ btrfs_node_key(path->nodes[level], &root_item->drop_progress, path->slots[level]); - root_item->drop_level = level; + btrfs_set_root_drop_level(root_item, level); btrfs_end_transaction_throttle(trans); trans = NULL; @@ -3694,7 +3694,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) memset(&root->root_item.drop_progress, 0, sizeof(root->root_item.drop_progress)); - root->root_item.drop_level = 0; + btrfs_set_root_drop_level(&root->root_item, 0); btrfs_set_root_refs(&root->root_item, 0); ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ea2bb4cb5890..cb9041da7d87 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1118,10 +1118,10 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1); return -EUCLEAN; } - if (ri.drop_level >= BTRFS_MAX_LEVEL) { + if (btrfs_root_drop_level(&ri) >= BTRFS_MAX_LEVEL) { generic_err(leaf, slot, "invalid root level, have %u expect [0, %u]", - ri.drop_level, BTRFS_MAX_LEVEL - 1); + btrfs_root_drop_level(&ri), BTRFS_MAX_LEVEL - 1); return -EUCLEAN; } -- cgit v1.2.3 From cc7c77146e53ecfbdce695a860e8368c849ffd4f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 15 Sep 2020 14:18:23 +0200 Subject: btrfs: remove unnecessary casts in printk Long time ago the explicit casts were necessary for u64 but we don't need it. 
Remove casts where the type matches, leaving only cases that cast sector_t or loff_t. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 3 +-- fs/btrfs/ref-verify.c | 3 +-- fs/btrfs/uuid-tree.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 48b57dd57436..8db99117531d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -177,8 +177,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, __le64 subvol_id; read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id)); - pr_info("\t\tsubvol_id %llu\n", - (unsigned long long)le64_to_cpu(subvol_id)); + pr_info("\t\tsubvol_id %llu\n", le64_to_cpu(subvol_id)); item_size -= sizeof(u64); offset += sizeof(u64); } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 78693d3dd15b..14b0c0dcb08c 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -799,8 +799,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, if (!be) { btrfs_err(fs_info, "trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!", - action, (unsigned long long)bytenr, - (unsigned long long)num_bytes); + action, bytenr, num_bytes); dump_ref_action(fs_info, ra); kfree(ref); kfree(ra); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 28525ad7ff8c..74023c8a783f 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -129,8 +129,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, } else { btrfs_warn(fs_info, "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", - ret, (unsigned long long)key.objectid, - (unsigned long long)key.offset, type); + ret, key.objectid, key.offset, type); goto out; } -- cgit v1.2.3 From a3efb2f0bad55330bc2402734220ef5446ef1f19 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 21 Oct 2020 14:24:49 +0800 Subject: btrfs: fix the comment on lock_extent_buffer_for_io The comment about the return value of that function is completely wrong. That function only returns 0 if the extent buffer doesn't need to be submitted. The "ret = 1" and "ret = 0" are determined by the return value of "test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)". And if we get ret == 1, it's because the extent buffer is dirty, so we set its status to EXTENT_BUFFER_WRITE_BACK and continue to page locking. Whereas if we get ret == 0, the extent buffer was not dirty to begin with, so we don't need to write it back. The caller also follows this: in btree_write_cache_pages(), if lock_extent_buffer_for_io() returns 0, we just skip the extent buffer completely. So the comment is completely wrong. Since we're here, also change the description a little. The write bio flushing won't be visible to the caller, thus it's not a major feature. In the main description, only describe the locking part to make the point clearer. 
For reference, the comment was added in commit 2e3c25136adf ("btrfs: extent_io: add proper error handling to lock_extent_buffer_for_io"). Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cf3f576d6277..a268cc6923e0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3652,11 +3652,14 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) } /* - * Lock eb pages and flush the bio if we can't the locks + * Lock extent buffer status and pages for writeback. * - * Return 0 if nothing went wrong - * Return >0 is same as 0, except bio is not submitted - * Return <0 if something went wrong, no page is locked + * May try to flush write bio if we can't get the lock. + * + * Return 0 if the extent buffer doesn't need to be submitted. + * (E.g. the extent buffer is not dirty) + * Return >0 is the extent buffer is submitted to bio. + * Return <0 if something went wrong, no page is locked. */ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, struct extent_page_data *epd) -- cgit v1.2.3 From 03509b781ae98d4cf5cc139f89a3e75467b829a8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 21 Oct 2020 14:24:50 +0800 Subject: btrfs: update the comment for find_first_extent_bit The pitfall here is that if the parameter @bits has multiple bits set, we will return the first range that has just one of the specified bits set. This is a little tricky if we want an exact match. Anyway, update the comment to make that clear. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a268cc6923e0..539dab345d24 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1554,11 +1554,12 @@ out: } /* - * find the first offset in the io tree with 'bits' set. zero is - * returned if we find something, and *start_ret and *end_ret are - * set to reflect the state struct that was found. + * Find the first offset in the io tree with one or more @bits set. * - * If nothing was found, 1 is returned. If found something, return 0. + * Note: If there are multiple bits set in @bits, any of them will match. + * + * Return 0 if we find something, and update @start_ret and @end_ret. + * Return 1 if we found nothing. */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, unsigned bits, -- cgit v1.2.3 From 3f6bb4aeb5dfa392dac438e816959ccb9c690896 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 21 Oct 2020 14:24:51 +0800 Subject: btrfs: sink the failed_start parameter to set_extent_bit The @failed_start parameter is only paired with @exclusive_bits, and those parameters are only used for the EXTENT_LOCKED bit, which has its own wrapper, lock_extent_bits(). Thus for regular set_extent_bit() calls, failed_start makes no sense, so just sink the parameter. Also, since @failed_start and @exclusive_bits are used in pairs, add an assert to make it obvious. 
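For context, a rough sketch (reconstructed and simplified, not part of this patch) of the one remaining user of the exclusive bits plus failed_start pair, the locking wrapper, which uses the reported offset to wait and retry from the point of conflict:

        /* Roughly how lock_extent_bits() uses failed_start. */
        while (1) {
                err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
                                       EXTENT_LOCKED, &failed_start,
                                       cached_state, GFP_NOFS, NULL);
                if (err != -EEXIST)
                        break;
                /* Part of the range is already locked, wait for it. */
                wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
                start = failed_start;
        }

Plain set_extent_bit() callers never look at where a conflict happened, which is why the parameter can be sunk.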
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 17 +++++++---------- fs/btrfs/extent_io.c | 11 +++++++---- fs/btrfs/inode.c | 3 +-- 3 files changed, 15 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 9800a8306368..cab4273ff8d3 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -154,15 +154,14 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask); + unsigned bits, struct extent_state **cached_state, gfp_t mask); int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits); static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits) { - return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS); + return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS); } static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -175,8 +174,7 @@ static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, - NULL, mask); + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask); } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, @@ -197,7 +195,7 @@ static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, - NULL, cached_state, GFP_NOFS); + cached_state, GFP_NOFS); } static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, @@ -205,20 +203,19 @@ static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - NULL, cached_state, GFP_NOFS); + cached_state, GFP_NOFS); } static inline int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end) { - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, - GFP_NOFS); + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS); } static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, cached_state, mask); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 539dab345d24..de675c74712b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -980,6 +980,10 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, btrfs_debug_check_extent_io_range(tree, start, end); trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); + if (exclusive_bits) + ASSERT(failed_start); + else + ASSERT(failed_start == NULL); again: if (!prealloc && gfpflags_allow_blocking(mask)) { /* @@ -1180,11 +1184,10 @@ out: } int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, u64 * failed_start, - struct extent_state **cached_state, gfp_t mask) + unsigned bits, struct extent_state 
**cached_state, gfp_t mask) { - return __set_extent_bit(tree, start, end, bits, 0, failed_start, - cached_state, mask, NULL); + return __set_extent_bit(tree, start, end, bits, 0, NULL, cached_state, + mask, NULL); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2c97169b6e26..ad72f68573f1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4717,8 +4717,7 @@ again: if (only_release_metadata) set_extent_bit(&BTRFS_I(inode)->io_tree, block_start, - block_end, EXTENT_NORESERVE, NULL, NULL, - GFP_NOFS); + block_end, EXTENT_NORESERVE, NULL, GFP_NOFS); out_unlock: if (ret) { -- cgit v1.2.3 From 8896a08d8ea95809adbc3742cdf9c7575f66c960 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 21 Oct 2020 14:24:53 +0800 Subject: btrfs: replace fs_info and private_data with inode in btrfs_wq_submit_bio All callers of btrfs_wq_submit_bio() pass struct inode as @private_data, so there is no need for it to be (void *), replace it with "struct inode *inode". While we can extract fs_info from struct inode, also remove the @fs_info parameter. Since we're here, also replace all the (void *private_data) into (struct inode *inode). Reviewed-by: Goldwyn Rodrigues Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 22 +++++++++++----------- fs/btrfs/disk-io.h | 8 ++++---- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 20 ++++++++------------ 4 files changed, 24 insertions(+), 28 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bd01e2cc165b..ad526aa5312a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -109,7 +109,7 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) * just before they are sent down the IO stack. */ struct async_submit_bio { - void *private_data; + struct inode *inode; struct bio *bio; extent_submit_bio_start_t *submit_bio_start; int mirror_num; @@ -696,8 +696,7 @@ static void run_one_async_start(struct btrfs_work *work) blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->private_data, async->bio, - async->bio_offset); + ret = async->submit_bio_start(async->inode, async->bio, async->bio_offset); if (ret) async->status = ret; } @@ -717,7 +716,7 @@ static void run_one_async_done(struct btrfs_work *work) blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - inode = async->private_data; + inode = async->inode; /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { @@ -747,18 +746,19 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, +blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, - u64 bio_offset, void *private_data, + u64 bio_offset, extent_submit_bio_start_t *submit_bio_start) { + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) return BLK_STS_RESOURCE; - async->private_data = private_data; + async->inode = inode; async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; @@ -795,8 +795,8 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, - u64 bio_offset) +static blk_status_t btree_submit_bio_start(struct inode *inode, 
struct bio *bio, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -842,8 +842,8 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ - ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, - 0, inode, btree_submit_bio_start); + ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, + 0, btree_submit_bio_start); } if (ret) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index d47894260cfa..238b45223f2e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -112,10 +112,10 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key); blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); -blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset, void *private_data, - extent_submit_bio_start_t *submit_bio_start); +blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset, + extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3801eb3b726e..5403354de0e1 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -71,7 +71,7 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); -typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, +typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, struct bio *bio, u64 bio_offset); #define INLINE_EXTENT_BUFFER_PAGES 16 diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ad72f68573f1..48ebf492f11f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2194,11 +2194,9 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, - u64 bio_offset) +static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, + u64 bio_offset) { - struct inode *inode = private_data; - return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } @@ -2263,8 +2261,8 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) goto mapit; /* we're doing a write, do the async checksumming */ - ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags, - 0, inode, btrfs_submit_bio_start); + ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, + 0, btrfs_submit_bio_start); goto out; } else if (!skip_sum) { ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); @@ -7747,11 +7745,9 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, } } -static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data, - struct bio *bio, u64 offset) +static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, + struct bio *bio, u64 offset) { - struct inode *inode = private_data; - return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1); } @@ -7802,8 +7798,8 @@ static inline blk_status_t 
btrfs_submit_dio_bio(struct bio *bio, goto map; if (write && async_submit) { - ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0, - file_offset, inode, + ret = btrfs_wq_submit_bio(inode, bio, 0, 0, + file_offset, btrfs_submit_bio_start_direct_io); goto err; } else if (write) { -- cgit v1.2.3 From 265d4ac03fdf75e84002d5749854c77d7240aa81 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 21 Oct 2020 14:24:54 +0800 Subject: btrfs: sink parameter start and len to check_data_csum For check_data_csum(), the page we're using is directly from the inode mapping, thus it has valid page_offset(). We can use (page_offset() + pg_off) to replace @start parameter completely, while the @len should always be sectorsize. Since we're here, also add some comment, as there are quite some confusion in words like start/offset, without explaining whether it's file_offset or logical bytenr. This should not affect the existing behavior, as for current sectorsize == PAGE_SIZE case, @pgoff should always be 0, and len is always PAGE_SIZE (or sectorsize from the dio read path). Reviewed-by: Goldwyn Rodrigues Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 48ebf492f11f..8c77b1d32d8e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2889,17 +2889,29 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, btrfs_queue_work(wq, &ordered_extent->work); } +/* + * check_data_csum - verify checksum of one sector of uncompressed data + * @inode: the inode + * @io_bio: btrfs_io_bio which contains the csum + * @icsum: checksum index in the io_bio->csum array, size of csum_size + * @page: page where is the data to be verified + * @pgoff: offset inside the page + * + * The length of such check is always one sector size. 
+ */ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, - int icsum, struct page *page, int pgoff, u64 start, - size_t len) + int icsum, struct page *page, int pgoff) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; + u32 len = fs_info->sectorsize; u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; + ASSERT(pgoff + len <= PAGE_SIZE); + csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size; kaddr = kmap_atomic(page); @@ -2913,8 +2925,8 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, kunmap_atomic(kaddr); return 0; zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, - io_bio->mirror_num); + btrfs_print_data_csum_error(BTRFS_I(inode), page_offset(page) + pgoff, + csum, csum_expected, io_bio->mirror_num); if (io_bio->device) btrfs_dev_stat_inc_and_print(io_bio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); @@ -2955,8 +2967,7 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, } phy_offset >>= inode->i_sb->s_blocksize_bits; - return check_data_csum(inode, io_bio, phy_offset, page, offset, start, - (size_t)(end - start + 1)); + return check_data_csum(inode, io_bio, phy_offset, page, offset); } /* @@ -7674,8 +7685,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, ASSERT(pgoff < PAGE_SIZE); if (uptodate && (!csum || !check_data_csum(inode, io_bio, icsum, - bvec.bv_page, pgoff, - start, sectorsize))) { + bvec.bv_page, pgoff))) { clean_io_failure(fs_info, failure_tree, io_tree, start, bvec.bv_page, btrfs_ino(BTRFS_I(inode)), -- cgit v1.2.3 From 12e3360f74759de1031738b85743b429aef3dc8c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 21 Oct 2020 14:24:57 +0800 Subject: btrfs: rename pages_locked in process_pages_contig() Function process_pages_contig() does not only handle page locking but also other operations. Rename the local variable pages_locked to pages_processed to reduce confusion. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index de675c74712b..988ca397de8d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1950,7 +1950,7 @@ static int __process_pages_contig(struct address_space *mapping, unsigned long page_ops, pgoff_t *index_ret) { unsigned long nr_pages = end_index - start_index + 1; - unsigned long pages_locked = 0; + unsigned long pages_processed = 0; pgoff_t index = start_index; struct page *pages[16]; unsigned ret; @@ -1985,7 +1985,7 @@ static int __process_pages_contig(struct address_space *mapping, if (locked_page && pages[i] == locked_page) { put_page(pages[i]); - pages_locked++; + pages_processed++; continue; } if (page_ops & PAGE_CLEAR_DIRTY) @@ -2010,7 +2010,7 @@ static int __process_pages_contig(struct address_space *mapping, } } put_page(pages[i]); - pages_locked++; + pages_processed++; } nr_pages -= ret; index += ret; @@ -2018,7 +2018,7 @@ static int __process_pages_contig(struct address_space *mapping, } out: if (err && index_ret) - *index_ret = start_index + pages_locked - 1; + *index_ret = start_index + pages_processed - 1; return err; } -- cgit v1.2.3 From 8b8bbd461ea180470041fa84c745480163bb908f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 21 Oct 2020 14:24:58 +0800 Subject: btrfs: only require sector size alignment for page read If we're reading partial page, btrfs will warn about this as read/write is always done in sector size, which now equals page size. But for the upcoming subpage read-only support, our data read is only aligned to sectorsize, which can be smaller than page size. Thus here we change the warning condition to check it against sectorsize, the behavior is not changed for regular sectorsize == PAGE_SIZE case, and won't report error for subpage read. Also, pass the proper start/end with bv_offset for check_data_csum() to handle. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 988ca397de8d..776f702ca144 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2823,6 +2823,7 @@ static void end_bio_extent_readpage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u32 sectorsize = fs_info->sectorsize; btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", @@ -2831,24 +2832,25 @@ static void end_bio_extent_readpage(struct bio *bio) tree = &BTRFS_I(inode)->io_tree; failure_tree = &BTRFS_I(inode)->io_failure_tree; - /* We always issue full-page reads, but if some block - * in a page fails to read, blk_update_request() will - * advance bv_offset and adjust bv_len to compensate. - * Print a warning for nonzero offsets, and an error - * if they don't add up to a full page. 
*/ - if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { - if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) - btrfs_err(fs_info, - "partial page read in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else - btrfs_info(fs_info, - "incomplete page read in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - } - - start = page_offset(page); - end = start + bvec->bv_offset + bvec->bv_len - 1; + /* + * We always issue full-sector reads, but if some block in a + * page fails to read, blk_update_request() will advance + * bv_offset and adjust bv_len to compensate. Print a warning + * for unaligned offsets, and an error if they don't add up to + * a full sector. + */ + if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + btrfs_err(fs_info, + "partial page read in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len, + sectorsize)) + btrfs_info(fs_info, + "incomplete page read with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + + start = page_offset(page) + bvec->bv_offset; + end = start + bvec->bv_len - 1; len = bvec->bv_len; mirror = io_bio->mirror_num; -- cgit v1.2.3 From e940e9a7c793e3fffe6cdef4f849d696c57ed3f7 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 21 Oct 2020 14:25:01 +0800 Subject: btrfs: rename page_size to io_size in submit_extent_page The variable @page_size in submit_extent_page() is not related to page size. It can already be smaller than PAGE_SIZE, so rename it to io_size to reduce confusion, this is especially important for later subpage support. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 776f702ca144..e8a72cecf372 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3044,7 +3044,7 @@ static int submit_extent_page(unsigned int opf, { int ret = 0; struct bio *bio; - size_t page_size = min_t(size_t, size, PAGE_SIZE); + size_t io_size = min_t(size_t, size, PAGE_SIZE); sector_t sector = offset >> 9; struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -3060,12 +3060,12 @@ static int submit_extent_page(unsigned int opf, else contig = bio_end_sector(bio) == sector; - if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) + if (btrfs_bio_fits_in_stripe(page, io_size, bio, bio_flags)) can_merge = false; if (prev_bio_flags != bio_flags || !contig || !can_merge || force_bio_submit || - bio_add_page(bio, page, page_size, pg_offset) < page_size) { + bio_add_page(bio, page, io_size, pg_offset) < io_size) { ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; @@ -3074,13 +3074,13 @@ static int submit_extent_page(unsigned int opf, bio = NULL; } else { if (wbc) - wbc_account_cgroup_owner(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, io_size); return 0; } } bio = btrfs_bio_alloc(offset); - bio_add_page(bio, page, page_size, pg_offset); + bio_add_page(bio, page, io_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; bio->bi_write_hint = page->mapping->host->i_write_hint; @@ -3091,7 +3091,7 @@ static int submit_extent_page(unsigned int opf, bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev; bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); - wbc_account_cgroup_owner(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, 
page, io_size); } *bio_ret = bio; -- cgit v1.2.3 From ab108d992b1248adfb7c13c1136cab59c944a98c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 1 Jul 2020 20:45:04 +0200 Subject: btrfs: use precalculated sectorsize_bits from fs_info We do a lot of calculations where we divide or multiply by sectorsize. We also know and make sure that sectorsize is a power of two, so this means all divisions can be turned to shifts and avoid eg. expensive u64/u32 divisions. The type is u32 as it's more register friendly on x86_64 compared to u8 and the resulting assembly is smaller (movzbl vs movl). There's also superblock s_blocksize_bits but it's usually one more pointer dereference farther than fs_info. Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/disk-io.c | 2 ++ fs/btrfs/extent-tree.c | 2 +- fs/btrfs/file-item.c | 2 +- fs/btrfs/free-space-tree.c | 6 +++--- fs/btrfs/ordered-data.c | 3 +-- fs/btrfs/scrub.c | 12 ++++++------ fs/btrfs/tests/btrfs-tests.c | 1 + 8 files changed, 17 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b1b6d9a69b99..2bfad194bf06 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -935,6 +935,8 @@ struct btrfs_fs_info { /* Cached block sizes */ u32 nodesize; u32 sectorsize; + /* ilog2 of sectorsize, use to avoid 64bit division */ + u32 sectorsize_bits; u32 stripesize; /* Block groups and devices containing active swapfiles. */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ad526aa5312a..25889a2362e2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2814,6 +2814,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) /* Usable values until the real ones are cached from the superblock */ fs_info->nodesize = 4096; fs_info->sectorsize = 4096; + fs_info->sectorsize_bits = ilog2(4096); fs_info->stripesize = 4096; spin_lock_init(&fs_info->swapfile_pins_lock); @@ -3078,6 +3079,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device /* Cache block sizes */ fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; + fs_info->sectorsize_bits = ilog2(sectorsize); fs_info->stripesize = stripesize; /* diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bac1eb17e498..14d5aaa28399 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2145,7 +2145,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) csum_size = BTRFS_MAX_ITEM_SIZE(fs_info); num_csums_per_leaf = div64_u64(csum_size, (u64)btrfs_super_csum_size(fs_info->super_copy)); - num_csums = div64_u64(csum_bytes, fs_info->sectorsize); + num_csums = csum_bytes >> fs_info->sectorsize_bits; num_csums += num_csums_per_leaf - 1; num_csums = div64_u64(num_csums, num_csums_per_leaf); return num_csums; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 816f57d52fc9..81a318a92418 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -369,7 +369,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, * a single leaf so it will also fit inside a u32 */ diff = disk_bytenr - item_start_offset; - diff = diff / fs_info->sectorsize; + diff = diff >> fs_info->sectorsize_bits; diff = diff * csum_size; count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >> inode->i_sb->s_blocksize_bits); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 6b9faf3b0e96..84eb1a3ffaca 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -416,7 
+416,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - nrbits = div_u64(block_group->length, block_group->fs_info->sectorsize); + nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; start_bit = find_next_bit_le(bitmap, nrbits, 0); while (start_bit < nrbits) { @@ -540,8 +540,8 @@ static void free_space_set_bits(struct btrfs_block_group *block_group, end = found_end; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - first = div_u64(*start - found_start, fs_info->sectorsize); - last = div_u64(end - found_start, fs_info->sectorsize); + first = (*start - found_start) >> fs_info->sectorsize_bits; + last = (end - found_start) >> fs_info->sectorsize_bits; if (bit) extent_buffer_bitmap_set(leaf, ptr, first, last - first); else diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 87bac9ecdf4c..c6534daa40cd 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -868,7 +868,6 @@ int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; unsigned long num_sectors; unsigned long i; - u32 sectorsize = btrfs_inode_sectorsize(inode); const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits; const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; @@ -890,7 +889,7 @@ int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, index += (int)num_sectors * csum_size; if (index == len) goto out; - disk_bytenr += num_sectors * sectorsize; + disk_bytenr += num_sectors * fs_info->sectorsize; } } out: diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 04351b55dd14..c307f25c4846 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2300,7 +2300,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, u64 offset; u64 nsectors64; u32 nsectors; - int sectorsize = sparity->sctx->fs_info->sectorsize; + u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits; if (len >= sparity->stripe_len) { bitmap_set(bitmap, 0, sparity->nsectors); @@ -2309,8 +2309,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, start -= sparity->logic_start; start = div64_u64_rem(start, sparity->stripe_len, &offset); - offset = div_u64(offset, sectorsize); - nsectors64 = div_u64(len, sectorsize); + offset = offset >> sectorsize_bits; + nsectors64 = len >> sectorsize_bits; ASSERT(nsectors64 < UINT_MAX); nsectors = (u32)nsectors64; @@ -2386,10 +2386,10 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) if (!sum) return 0; - index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize); + index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits; ASSERT(index < UINT_MAX); - num_sectors = sum->len / sctx->fs_info->sectorsize; + num_sectors = sum->len >> sctx->fs_info->sectorsize_bits; memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size); if (index == num_sectors - 1) { list_del(&sum->list); @@ -2776,7 +2776,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - nsectors = div_u64(map->stripe_len, fs_info->sectorsize); + nsectors = map->stripe_len >> fs_info->sectorsize_bits; bitmap_len = scrub_calc_parity_bitmap_len(nsectors); sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, GFP_NOFS); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 999c14e5d0bd..b36e88753f1f 100644 --- a/fs/btrfs/tests/btrfs-tests.c 
+++ b/fs/btrfs/tests/btrfs-tests.c @@ -134,6 +134,7 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; + fs_info->sectorsize_bits = ilog2(sectorsize); set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); test_mnt->mnt_sb->s_fs_info = fs_info; -- cgit v1.2.3 From 098e63082b9bd26b61a57310385efc3e9f363dea Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 1 Jul 2020 21:07:40 +0200 Subject: btrfs: replace div_u64 by shift in free_space_bitmap_size Change free_space_bitmap_size to take btrfs_fs_info so we can get the sectorsize_bits to do calculations. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 84eb1a3ffaca..37d210148c1d 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -136,9 +136,10 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, return 0; } -static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) +static inline u32 free_space_bitmap_size(const struct btrfs_fs_info *fs_info, + u64 size) { - return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE); + return DIV_ROUND_UP(size >> fs_info->sectorsize_bits, BITS_PER_BYTE); } static unsigned long *alloc_bitmap(u32 bitmap_size) @@ -200,8 +201,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, int done = 0, nr; int ret; - bitmap_size = free_space_bitmap_size(block_group->length, - fs_info->sectorsize); + bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; @@ -290,8 +290,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, u32 data_size; extent_size = min(end - i, bitmap_range); - data_size = free_space_bitmap_size(extent_size, - fs_info->sectorsize); + data_size = free_space_bitmap_size(fs_info, extent_size); key.objectid = i; key.type = BTRFS_FREE_SPACE_BITMAP_KEY; @@ -339,8 +338,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, int done = 0, nr; int ret; - bitmap_size = free_space_bitmap_size(block_group->length, - fs_info->sectorsize); + bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; @@ -383,8 +381,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, fs_info->sectorsize * BITS_PER_BYTE); bitmap_cursor = ((char *)bitmap) + bitmap_pos; - data_size = free_space_bitmap_size(found_key.offset, - fs_info->sectorsize); + data_size = free_space_bitmap_size(fs_info, + found_key.offset); ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1); read_extent_buffer(leaf, bitmap_cursor, ptr, -- cgit v1.2.3 From 265fdfa6ce0a79df3b880bbf39d9a00a0435687f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 1 Jul 2020 21:19:09 +0200 Subject: btrfs: replace s_blocksize_bits with fs_info::sectorsize_bits The value of super_block::s_blocksize_bits is the same as fs_info::sectorsize_bits, but we don't need to do the extra dereferences in many functions and storing the bits as u32 (in fs_info) generates shorter assembly. 
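The recurring conversion that this patch (and the rest of the series) applies can be sketched as a tiny helper. This is illustrative only and not part of the patch; it assumes nothing beyond the btrfs_fs_info fields named above:

/*
 * Illustrative sketch: convert a byte count to a sector count with the
 * cached shift instead of dereferencing fs_info->sb->s_blocksize_bits.
 * Valid because sectorsize is always a power of two.
 */
static inline u64 example_bytes_to_sectors(const struct btrfs_fs_info *fs_info,
					   u64 bytes)
{
	return bytes >> fs_info->sectorsize_bits;
}
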
Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/file-item.c | 34 +++++++++++++++------------------- fs/btrfs/file.c | 3 +-- fs/btrfs/inode.c | 6 +++--- fs/btrfs/ordered-data.c | 6 +++--- fs/btrfs/super.c | 2 +- 7 files changed, 25 insertions(+), 30 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2bfad194bf06..b94b4fe0fa96 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1410,7 +1410,7 @@ struct btrfs_map_token { }; #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ - ((bytes) >> (fs_info)->sb->s_blocksize_bits) + ((bytes) >> (fs_info)->sectorsize_bits) static inline void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e8a72cecf372..1eeba31b551d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2656,7 +2656,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); - const int icsum = phy_offset >> inode->i_sb->s_blocksize_bits; + const int icsum = phy_offset >> fs_info->sectorsize_bits; bool need_validation; struct bio *repair_bio; struct btrfs_io_bio *repair_io_bio; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 81a318a92418..5d79b1f79287 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -201,7 +201,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, goto fail; csum_offset = (bytenr - found_key.offset) >> - fs_info->sb->s_blocksize_bits; + fs_info->sectorsize_bits; csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); csums_in_item /= csum_size; @@ -279,7 +279,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (!path) return BLK_STS_RESOURCE; - nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; + nblocks = bio->bi_iter.bi_size >> fs_info->sectorsize_bits; if (!dst) { struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); @@ -372,7 +372,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, diff = diff >> fs_info->sectorsize_bits; diff = diff * csum_size; count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >> - inode->i_sb->s_blocksize_bits); + fs_info->sectorsize_bits); read_extent_buffer(path->nodes[0], csum, ((unsigned long)item) + diff, csum_size * count); @@ -436,8 +436,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && key.type == BTRFS_EXTENT_CSUM_KEY) { - offset = (start - key.offset) >> - fs_info->sb->s_blocksize_bits; + offset = (start - key.offset) >> fs_info->sectorsize_bits; if (offset * csum_size < btrfs_item_size_nr(leaf, path->slots[0] - 1)) path->slots[0]--; @@ -487,10 +486,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, sums->bytenr = start; sums->len = (int)size; - offset = (start - key.offset) >> - fs_info->sb->s_blocksize_bits; + offset = (start - key.offset) >> fs_info->sectorsize_bits; offset *= csum_size; - size >>= fs_info->sb->s_blocksize_bits; + size >>= fs_info->sectorsize_bits; read_extent_buffer(path->nodes[0], sums->sums, @@ -643,11 +641,11 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, u16 csum_size = 
btrfs_super_csum_size(fs_info->super_copy); u64 csum_end; u64 end_byte = bytenr + len; - u32 blocksize_bits = fs_info->sb->s_blocksize_bits; + u32 blocksize_bits = fs_info->sectorsize_bits; leaf = path->nodes[0]; csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; - csum_end <<= fs_info->sb->s_blocksize_bits; + csum_end <<= blocksize_bits; csum_end += key->offset; if (key->offset < bytenr && csum_end <= end_byte) { @@ -695,7 +693,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int ret; u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - int blocksize_bits = fs_info->sb->s_blocksize_bits; + u32 blocksize_bits = fs_info->sectorsize_bits; ASSERT(root == fs_info->csum_root || root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); @@ -924,7 +922,7 @@ again: if (btrfs_leaf_free_space(leaf) >= csum_size) { btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); csum_offset = (bytenr - found_key.offset) >> - fs_info->sb->s_blocksize_bits; + fs_info->sectorsize_bits; goto extend_csum; } @@ -942,8 +940,7 @@ again: leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - csum_offset = (bytenr - found_key.offset) >> - fs_info->sb->s_blocksize_bits; + csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits; if (found_key.type != BTRFS_EXTENT_CSUM_KEY || found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || @@ -959,7 +956,7 @@ extend_csum: u32 diff; tmp = sums->len - total_bytes; - tmp >>= fs_info->sb->s_blocksize_bits; + tmp >>= fs_info->sectorsize_bits; WARN_ON(tmp < 1); extend_nr = max_t(int, 1, (int)tmp); @@ -984,9 +981,9 @@ insert: u64 tmp; tmp = sums->len - total_bytes; - tmp >>= fs_info->sb->s_blocksize_bits; + tmp >>= fs_info->sectorsize_bits; tmp = min(tmp, (next_offset - file_key.offset) >> - fs_info->sb->s_blocksize_bits); + fs_info->sectorsize_bits); tmp = max_t(u64, 1, tmp); tmp = min_t(u64, tmp, MAX_CSUM_ITEMS(fs_info, csum_size)); @@ -1010,8 +1007,7 @@ csum: item = (struct btrfs_csum_item *)((unsigned char *)item + csum_offset * csum_size); found: - ins_size = (u32)(sums->len - total_bytes) >> - fs_info->sb->s_blocksize_bits; + ins_size = (u32)(sums->len - total_bytes) >> fs_info->sectorsize_bits; ins_size *= csum_size; ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item, ins_size); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5b93f0bb8654..5abdc7332dfc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1799,8 +1799,7 @@ again: if (num_sectors > dirty_sectors) { /* release everything except the sectors we dirtied */ - release_bytes -= dirty_sectors << - fs_info->sb->s_blocksize_bits; + release_bytes -= dirty_sectors << fs_info->sectorsize_bits; if (only_release_metadata) { btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8c77b1d32d8e..0b0c751dac85 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2966,7 +2966,7 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, return 0; } - phy_offset >>= inode->i_sb->s_blocksize_bits; + phy_offset >>= root->fs_info->sectorsize_bits; return check_data_csum(inode, io_bio, phy_offset, page, offset); } @@ -7824,7 +7824,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, u64 csum_offset; csum_offset = file_offset - dip->logical_offset; - csum_offset >>= inode->i_sb->s_blocksize_bits; + csum_offset >>= fs_info->sectorsize_bits; csum_offset *= btrfs_super_csum_size(fs_info->super_copy); btrfs_io_bio(bio)->csum = 
dip->csums + csum_offset; } @@ -7853,7 +7853,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); size_t nblocks; - nblocks = dio_bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; + nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits; dip_size += csum_size * nblocks; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index c6534daa40cd..78c1f28f3bb6 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -868,7 +868,6 @@ int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; unsigned long num_sectors; unsigned long i; - const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits; const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; @@ -880,8 +879,9 @@ int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { if (disk_bytenr >= ordered_sum->bytenr && disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { - i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits; - num_sectors = ordered_sum->len >> blocksize_bits; + i = (disk_bytenr - ordered_sum->bytenr) >> + fs_info->sectorsize_bits; + num_sectors = ordered_sum->len >> fs_info->sectorsize_bits; num_sectors = min_t(int, len - index, num_sectors - i); memcpy(sum + index, ordered_sum->sums + i * csum_size, num_sectors * csum_size); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9f51b0a22b14..bdb0b0c5fd64 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2205,7 +2205,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 total_used = 0; u64 total_free_data = 0; u64 total_free_meta = 0; - int bits = dentry->d_sb->s_blocksize_bits; + u32 bits = fs_info->sectorsize_bits; __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid; unsigned factor = 1; struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; -- cgit v1.2.3 From 22b6331d961712ac2735423e5a6c04e9d0fd7897 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 Jul 2020 11:10:18 +0200 Subject: btrfs: store precalculated csum_size in fs_info In many places we need the checksum size and it is inefficient to read it from the raw superblock. Store the value into fs_info, actual use will be in followup patches. The size is u32 as it allows to generate better assembly than with u16. Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b94b4fe0fa96..3d06596b8746 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -937,6 +937,7 @@ struct btrfs_fs_info { u32 sectorsize; /* ilog2 of sectorsize, use to avoid 64bit division */ u32 sectorsize_bits; + u32 csum_size; u32 stripesize; /* Block groups and devices containing active swapfiles. 
*/ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 25889a2362e2..475bd197c482 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3080,6 +3080,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->csum_size = btrfs_super_csum_size(disk_super); fs_info->stripesize = stripesize; /* -- cgit v1.2.3 From fe5ecbe818de38774895305e1f2d48972f1b745f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 Jul 2020 10:54:11 +0200 Subject: btrfs: precalculate checksums per leaf once btrfs_csum_bytes_to_leaves shows up in system profiles, which makes it a candidate for optimizations. After the 64bit division has been replaced by shift, there's still a calculation done each time the function is called: checksums per leaf. As this is a constant value for the entire filesystem lifetime, we can calculate it once at mount time and reuse. This also allows to reduce the division to 64bit/32bit as we know the constant will always fit the 32bit type. Replace the open-coded rounding up with a macro that internally handles the 64bit division and as it's now a short function, make it static inline (slight code increase, slight stack usage reduction). Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 13 ++++++++++++- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 19 ------------------- 3 files changed, 13 insertions(+), 20 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3d06596b8746..f28960647c01 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -938,6 +938,7 @@ struct btrfs_fs_info { /* ilog2 of sectorsize, use to avoid 64bit division */ u32 sectorsize_bits; u32 csum_size; + u32 csums_per_leaf; u32 stripesize; /* Block groups and devices containing active swapfiles. */ @@ -2525,7 +2526,17 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); -u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes); +/* + * Take the number of bytes to be checksummmed and figure out how many leaves + * it would require to store the csums for that many bytes. + */ +static inline u64 btrfs_csum_bytes_to_leaves( + const struct btrfs_fs_info *fs_info, u64 csum_bytes) +{ + const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; + + return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); +} /* * Use this if we would be adding new items, as we could split nodes as we cow diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 475bd197c482..0e540ddd7d99 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3081,6 +3081,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); fs_info->csum_size = btrfs_super_csum_size(disk_super); + fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; /* diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 14d5aaa28399..a27caa47aa62 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2132,25 +2132,6 @@ static u64 find_middle(struct rb_root *root) } #endif -/* - * Takes the number of bytes to be csumm'ed and figures out how many leaves it - * would require to store the csums for that many bytes. 
- */ -u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) -{ - u64 csum_size; - u64 num_csums_per_leaf; - u64 num_csums; - - csum_size = BTRFS_MAX_ITEM_SIZE(fs_info); - num_csums_per_leaf = div64_u64(csum_size, - (u64)btrfs_super_csum_size(fs_info->super_copy)); - num_csums = csum_bytes >> fs_info->sectorsize_bits; - num_csums += num_csums_per_leaf - 1; - num_csums = div64_u64(num_csums, num_csums_per_leaf); - return num_csums; -} - /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be -- cgit v1.2.3 From 55fc29bed8ddb4c3848ecf8cf7133e34c946f223 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 30 Jun 2020 02:01:31 +0200 Subject: btrfs: use cached value of fs_info::csum_size everywhere btrfs_get_16 shows up in the system performance profiles (helper to read 16bit values from on-disk structures). This is partially because of the checksum size that's frequently read along with data reads/writes, other u16 uses are from item size or directory entries. Replace all calls to btrfs_super_csum_size by the cached value from fs_info. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 +-- fs/btrfs/check-integrity.c | 2 +- fs/btrfs/compression.c | 6 +++--- fs/btrfs/disk-io.c | 6 +++--- fs/btrfs/extent_io.c | 2 +- fs/btrfs/file-item.c | 14 +++++++------- fs/btrfs/inode.c | 6 +++--- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/ordered-data.h | 2 +- fs/btrfs/scrub.c | 2 +- fs/btrfs/tree-checker.c | 2 +- 11 files changed, 23 insertions(+), 24 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 16c50a438f70..a31b7e32dd60 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -331,8 +331,7 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) { struct btrfs_root *root = inode->root; - struct btrfs_super_block *sb = root->fs_info->super_copy; - const u16 csum_size = btrfs_super_csum_size(sb); + const u16 csum_size = root->fs_info->csum_size; /* Output minus objectid, which is more meaningful */ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 106874c959a0..288fd4bbe597 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -660,7 +660,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, return -1; } - state->csum_size = btrfs_super_csum_size(selected_super); + state->csum_size = state->fs_info->csum_size; for (pass = 0; pass < 3; pass++) { int num_copies; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 972fb68a85ac..00bbd859f31b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -131,7 +131,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb); static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, unsigned long disk_size) { - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; return sizeof(struct compressed_bio) + (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size; @@ -142,7 +142,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, { struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = 
fs_info->csum_size; struct page *page; unsigned long i; char *kaddr; @@ -628,7 +628,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map *em; blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; - const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; u8 *sums; em_tree = &BTRFS_I(inode)->extent_tree; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0e540ddd7d99..95ebe7c84dd7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -321,7 +321,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); - if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb))) + if (memcmp(disk_sb->csum, result, fs_info->csum_size)) return 1; return 0; @@ -454,7 +454,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) u64 start = page_offset(page); u64 found_start; u8 result[BTRFS_CSUM_SIZE]; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; struct extent_buffer *eb; int ret; @@ -543,7 +543,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, eb = (struct extent_buffer *)page->private; fs_info = eb->fs_info; - csum_size = btrfs_super_csum_size(fs_info->super_copy); + csum_size = fs_info->csum_size; /* the pending IO might have been the only thing that kept this buffer * in memory. Make sure we have a ref for all this other checks diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1eeba31b551d..d699f5d9b710 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2689,7 +2689,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, repair_bio->bi_private = failed_bio->bi_private; if (failed_io_bio->csum) { - const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; repair_io_bio->csum = repair_io_bio->csum_inline; memcpy(repair_io_bio->csum, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 5d79b1f79287..90e673847b47 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -181,7 +181,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_csum_item *item; struct extent_buffer *leaf; u64 csum_offset = 0; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -270,7 +270,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 diff; int nblocks; int count = 0; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) return BLK_STS_OK; @@ -409,7 +409,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int ret; size_t size; u64 csum_end; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); @@ -540,7 +540,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, int i; u64 offset; unsigned nofs_flag; - const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; nofs_flag = memalloc_nofs_save(); sums = 
kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), @@ -638,7 +638,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, u64 bytenr, u64 len) { struct extent_buffer *leaf; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; u64 csum_end; u64 end_byte = bytenr + len; u32 blocksize_bits = fs_info->sectorsize_bits; @@ -692,7 +692,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 csum_end; struct extent_buffer *leaf; int ret; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; u32 blocksize_bits = fs_info->sectorsize_bits; ASSERT(root == fs_info->csum_root || @@ -847,7 +847,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, int index = 0; int found_next; int ret; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; path = btrfs_alloc_path(); if (!path) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0b0c751dac85..358d900d0cf5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2906,7 +2906,7 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; u32 len = fs_info->sectorsize; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; @@ -7825,7 +7825,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, csum_offset = file_offset - dip->logical_offset; csum_offset >>= fs_info->sectorsize_bits; - csum_offset *= btrfs_super_csum_size(fs_info->super_copy); + csum_offset *= fs_info->csum_size; btrfs_io_bio(bio)->csum = dip->csums + csum_offset; } map: @@ -7850,7 +7850,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, dip_size = sizeof(*dip); if (!write && csum) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; size_t nblocks; nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 78c1f28f3bb6..6a63fb141f23 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -868,7 +868,7 @@ int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; unsigned long num_sectors; unsigned long i; - const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; int index = 0; ordered = btrfs_lookup_ordered_extent(inode, offset); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c3a2325e64a4..4662fd8ca546 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -137,7 +137,7 @@ static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) { int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); - int csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u16 csum_size = fs_info->csum_size; return sizeof(struct btrfs_ordered_sum) + num_sectors * csum_size; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c307f25c4846..51222534618b 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -610,7 +610,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( atomic_set(&sctx->bios_in_flight, 0); atomic_set(&sctx->workers_pending, 0); 
atomic_set(&sctx->cancel_req, 0); - sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); + sctx->csum_size = fs_info->csum_size; spin_lock_init(&sctx->list_lock); spin_lock_init(&sctx->stat_lock); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index cb9041da7d87..1fe5961a9f60 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -336,7 +336,7 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, { struct btrfs_fs_info *fs_info = leaf->fs_info; u32 sectorsize = fs_info->sectorsize; - u32 csumsize = btrfs_super_csum_size(fs_info->super_copy); + const u16 csumsize = fs_info->csum_size; if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { generic_err(leaf, slot, -- cgit v1.2.3 From 223486c27b369a10ceb6180c40d7aa354e903446 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 Jul 2020 11:27:30 +0200 Subject: btrfs: switch cached fs_info::csum_size from u16 to u32 The fs_info value is 32bit, switch also the local u16 variables. This leads to a better assembly code generated due to movzwl. This simple change will shave some bytes on x86_64 and release config: text data bss dec hex filename 1090000 17980 14912 1122892 11224c pre/btrfs.ko 1089794 17980 14912 1122686 11217e post/btrfs.ko DELTA: -206 Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/compression.c | 7 +++---- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/file-item.c | 14 +++++++------- fs/btrfs/inode.c | 5 ++--- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/ordered-data.h | 2 +- fs/btrfs/tree-checker.c | 2 +- 9 files changed, 18 insertions(+), 20 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a31b7e32dd60..8494f62f8aa4 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -331,7 +331,7 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) { struct btrfs_root *root = inode->root; - const u16 csum_size = root->fs_info->csum_size; + const u32 csum_size = root->fs_info->csum_size; /* Output minus objectid, which is more meaningful */ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 00bbd859f31b..b2bb4681975e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -131,8 +131,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb); static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, unsigned long disk_size) { - const u16 csum_size = fs_info->csum_size; - + const u32 csum_size = fs_info->csum_size; return sizeof(struct compressed_bio) + (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size; } @@ -142,7 +141,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, { struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - const u16 csum_size = fs_info->csum_size; + const u32 csum_size = fs_info->csum_size; struct page *page; unsigned long i; char *kaddr; @@ -628,7 +627,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map *em; blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; - const u16 csum_size = fs_info->csum_size; + const u32 csum_size = fs_info->csum_size; u8 *sums; em_tree = &BTRFS_I(inode)->extent_tree; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 95ebe7c84dd7..02b36908327a 100644 --- 
a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -454,7 +454,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) u64 start = page_offset(page); u64 found_start; u8 result[BTRFS_CSUM_SIZE]; - const u16 csum_size = fs_info->csum_size; + const u32 csum_size = fs_info->csum_size; struct extent_buffer *eb; int ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d699f5d9b710..f3515d3c1321 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2689,7 +2689,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, repair_bio->bi_private = failed_bio->bi_private; if (failed_io_bio->csum) { - const u16 csum_size = fs_info->csum_size; + const u32 csum_size = fs_info->csum_size; repair_io_bio->csum = repair_io_bio->csum_inline; memcpy(repair_io_bio->csum, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 90e673847b47..73ec6c5eadcb 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -181,7 +181,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_csum_item *item; struct extent_buffer *leaf; u64 csum_offset = 0; - const u16 csum_size = fs_info->csum_size; + const u32 csum_size = fs_info->csum_size; int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -270,7 +270,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 diff; int nblocks; int count = 0; - const u16 csum_size = fs_info->csum_size; + const u32 csum_size = fs_info->csum_size; if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) return BLK_STS_OK; @@ -409,7 +409,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int ret; size_t size; u64 csum_end; - const u16 csum_size = fs_info->csum_size; + const u32 csum_size = fs_info->csum_size; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); @@ -540,7 +540,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, int i; u64 offset; unsigned nofs_flag; - const u16 csum_size = fs_info->csum_size; + const u32 csum_size = fs_info->csum_size; nofs_flag = memalloc_nofs_save(); sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), @@ -638,7 +638,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, u64 bytenr, u64 len) { struct extent_buffer *leaf; - const u16 csum_size = fs_info->csum_size; + const u32 csum_size = fs_info->csum_size; u64 csum_end; u64 end_byte = bytenr + len; u32 blocksize_bits = fs_info->sectorsize_bits; @@ -692,7 +692,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 csum_end; struct extent_buffer *leaf; int ret; - const u16 csum_size = fs_info->csum_size; + const u32 csum_size = fs_info->csum_size; u32 blocksize_bits = fs_info->sectorsize_bits; ASSERT(root == fs_info->csum_root || @@ -847,7 +847,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, int index = 0; int found_next; int ret; - const u16 csum_size = fs_info->csum_size; + const u32 csum_size = fs_info->csum_size; path = btrfs_alloc_path(); if (!path) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 358d900d0cf5..d1915049d7e7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2906,7 +2906,7 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; u32 len = fs_info->sectorsize; - const u16 csum_size = fs_info->csum_size; + const u32 csum_size = fs_info->csum_size; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; @@ 
-7850,11 +7850,10 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, dip_size = sizeof(*dip); if (!write && csum) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u16 csum_size = fs_info->csum_size; size_t nblocks; nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits; - dip_size += csum_size * nblocks; + dip_size += fs_info->csum_size * nblocks; } dip = kzalloc(dip_size, GFP_NOFS); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6a63fb141f23..0d61f9fefc02 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -868,7 +868,7 @@ int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; unsigned long num_sectors; unsigned long i; - const u16 csum_size = fs_info->csum_size; + const u32 csum_size = fs_info->csum_size; int index = 0; ordered = btrfs_lookup_ordered_extent(inode, offset); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4662fd8ca546..e48810104e4a 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -137,7 +137,7 @@ static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) { int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); - const u16 csum_size = fs_info->csum_size; + const u32 csum_size = fs_info->csum_size; return sizeof(struct btrfs_ordered_sum) + num_sectors * csum_size; } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 1fe5961a9f60..f3f666b343ef 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -336,7 +336,7 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, { struct btrfs_fs_info *fs_info = leaf->fs_info; u32 sectorsize = fs_info->sectorsize; - const u16 csumsize = fs_info->csum_size; + const u32 csumsize = fs_info->csum_size; if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { generic_err(leaf, slot, -- cgit v1.2.3 From 713cebfb98915201a43ff4d01b0dbafecd50d8ae Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 30 Jun 2020 18:04:02 +0200 Subject: btrfs: remove unnecessary local variables for checksum size Remove local variable that is then used just once and replace it with fs_info::csum_size. 
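The shape of this cleanup, sketched on the csum_dirty_buffer hunk below (sketch only, surrounding code elided):

/* Before: a local copy that is read exactly once. */
const u32 csum_size = fs_info->csum_size;
write_extent_buffer(eb, result, 0, csum_size);

/* After: read the cached field directly at the call site. */
write_extent_buffer(eb, result, 0, fs_info->csum_size);
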
Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/compression.c | 6 ++---- fs/btrfs/disk-io.c | 3 +-- fs/btrfs/file-item.c | 3 +-- fs/btrfs/ordered-data.h | 3 +-- 4 files changed, 5 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b2bb4681975e..4e022ed72d2f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -131,9 +131,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb); static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, unsigned long disk_size) { - const u32 csum_size = fs_info->csum_size; return sizeof(struct compressed_bio) + - (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size; + (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size; } static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, @@ -627,7 +626,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map *em; blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; - const u32 csum_size = fs_info->csum_size; u8 *sums; em_tree = &BTRFS_I(inode)->extent_tree; @@ -727,7 +725,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, fs_info->sectorsize); - sums += csum_size * nr_sectors; + sums += fs_info->csum_size * nr_sectors; ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); if (ret) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 02b36908327a..c70a52b44ceb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -454,7 +454,6 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) u64 start = page_offset(page); u64 found_start; u8 result[BTRFS_CSUM_SIZE]; - const u32 csum_size = fs_info->csum_size; struct extent_buffer *eb; int ret; @@ -491,7 +490,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); return ret; } - write_extent_buffer(eb, result, 0, csum_size); + write_extent_buffer(eb, result, 0, fs_info->csum_size); return 0; } diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 73ec6c5eadcb..5f3096ea69af 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -540,7 +540,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, int i; u64 offset; unsigned nofs_flag; - const u32 csum_size = fs_info->csum_size; nofs_flag = memalloc_nofs_save(); sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), @@ -608,7 +607,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, fs_info->sectorsize, sums->sums + index); kunmap_atomic(data); - index += csum_size; + index += fs_info->csum_size; offset += fs_info->sectorsize; this_sum_bytes += fs_info->sectorsize; total_bytes += fs_info->sectorsize; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e48810104e4a..367269effd6a 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -137,9 +137,8 @@ static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) { int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); - const u32 csum_size = fs_info->csum_size; - return sizeof(struct btrfs_ordered_sum) + num_sectors * csum_size; + return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size; } static inline void -- cgit v1.2.3 From 419b791ce76090aeaa598d7879579c236736e4ae Mon Sep 17 00:00:00 2001 From: David 
Sterba Date: Tue, 30 Jun 2020 17:42:23 +0200 Subject: btrfs: check integrity: remove local copy of csum_size The state structure unnecessarily stores copy of the checksum size, that can be now easily obtained from fs_info. Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 288fd4bbe597..e90e92e4d0b8 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -233,7 +233,6 @@ struct btrfsic_stack_frame { struct btrfsic_state { u32 print_mask; int include_extent_data; - int csum_size; struct list_head all_blocks_list; struct btrfsic_block_hashtable block_hashtable; struct btrfsic_block_link_hashtable block_link_hashtable; @@ -660,8 +659,6 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, return -1; } - state->csum_size = state->fs_info->csum_size; - for (pass = 0; pass < 3; pass++) { int num_copies; int mirror_num; @@ -1723,7 +1720,7 @@ static noinline_for_stack int btrfsic_test_for_metadata( crypto_shash_update(shash, data, sublen); } crypto_shash_final(shash, csum); - if (memcmp(csum, h->csum, state->csum_size)) + if (memcmp(csum, h->csum, fs_info->csum_size)) return 1; return 0; /* is metadata */ @@ -2797,7 +2794,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, state->fs_info = fs_info; state->print_mask = print_mask; state->include_extent_data = including_extent_data; - state->csum_size = 0; state->metablock_size = fs_info->nodesize; state->datablock_size = fs_info->sectorsize; INIT_LIST_HEAD(&state->all_blocks_list); -- cgit v1.2.3 From 2ae0c2d80d25dae7658b64b93c271004bc8708e8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 30 Jun 2020 17:44:49 +0200 Subject: btrfs: scrub: remove local copy of csum_size from context The context structure unnecessarily stores copy of the checksum size, that can be now easily obtained from fs_info. 
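In outline, the change moves from a copy cached in btrfsic_state to reading fs_info at the point of use (a sketch of the before/after shape, with the surrounding code elided):

/* Before: the state object carried its own copy of the checksum size. */
state->csum_size = state->fs_info->csum_size;
if (memcmp(csum, h->csum, state->csum_size))
	return 1;	/* mismatch */

/* After: no cached copy, compare against fs_info->csum_size directly. */
if (memcmp(csum, h->csum, fs_info->csum_size))
	return 1;	/* mismatch */
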
Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 51222534618b..58cd3278fbfe 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -161,7 +161,6 @@ struct scrub_ctx { atomic_t workers_pending; spinlock_t list_lock; wait_queue_head_t list_wait; - u16 csum_size; struct list_head csum_list; atomic_t cancel_req; int readonly; @@ -610,7 +609,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( atomic_set(&sctx->bios_in_flight, 0); atomic_set(&sctx->workers_pending, 0); atomic_set(&sctx->cancel_req, 0); - sctx->csum_size = fs_info->csum_size; spin_lock_init(&sctx->list_lock); spin_lock_init(&sctx->stat_lock); @@ -1349,7 +1347,7 @@ leave_nomem: if (have_csum) memcpy(page->csum, original_sblock->pagev[0]->csum, - sctx->csum_size); + sctx->fs_info->csum_size); scrub_stripe_index_and_offset(logical, bbio->map_type, @@ -1800,7 +1798,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) crypto_shash_init(shash); crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum); - if (memcmp(csum, spage->csum, sctx->csum_size)) + if (memcmp(csum, spage->csum, sctx->fs_info->csum_size)) sblock->checksum_error = 1; return sblock->checksum_error; @@ -1823,7 +1821,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) spage = sblock->pagev[0]; kaddr = page_address(spage->page); h = (struct btrfs_header *)kaddr; - memcpy(on_disk_csum, h->csum, sctx->csum_size); + memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size); /* * we don't use the getter functions here, as we @@ -1856,7 +1854,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) } crypto_shash_final(shash, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) sblock->checksum_error = 1; return sblock->header_error || sblock->checksum_error; @@ -1893,7 +1891,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum); - if (memcmp(calculated_csum, s->csum, sctx->csum_size)) + if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size)) ++fail_cor; if (fail_cor + fail_gen) { @@ -2198,7 +2196,7 @@ leave_nomem: spage->mirror_num = mirror_num; if (csum) { spage->have_csum = 1; - memcpy(spage->csum, csum, sctx->csum_size); + memcpy(spage->csum, csum, sctx->fs_info->csum_size); } else { spage->have_csum = 0; } @@ -2390,7 +2388,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) ASSERT(index < UINT_MAX); num_sectors = sum->len >> sctx->fs_info->sectorsize_bits; - memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size); + memcpy(csum, sum->sums + index * sctx->fs_info->csum_size, + sctx->fs_info->csum_size); if (index == num_sectors - 1) { list_del(&sum->list); kfree(sum); @@ -2508,7 +2507,7 @@ leave_nomem: spage->mirror_num = mirror_num; if (csum) { spage->have_csum = 1; - memcpy(spage->csum, csum, sctx->csum_size); + memcpy(spage->csum, csum, sctx->fs_info->csum_size); } else { spage->have_csum = 0; } -- cgit v1.2.3 From ac5887c8e013d6754d36e6d51dc03448ee0b0065 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:10 -0400 Subject: btrfs: locking: remove all the blocking helpers Now that we're using a rw_semaphore we no longer need to indicate if a lock is blocking or 
not, nor do we need to flip the entire path from blocking to spinning. Remove these helpers and all the places they are called. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 10 ++---- fs/btrfs/ctree.c | 91 +++++++----------------------------------------- fs/btrfs/delayed-inode.c | 7 ---- fs/btrfs/disk-io.c | 8 ++--- fs/btrfs/extent-tree.c | 19 ++++------ fs/btrfs/file.c | 3 +- fs/btrfs/inode.c | 1 - fs/btrfs/locking.c | 74 --------------------------------------- fs/btrfs/locking.h | 11 +----- fs/btrfs/qgroup.c | 9 ++--- fs/btrfs/ref-verify.c | 6 ++-- fs/btrfs/relocation.c | 4 --- fs/btrfs/transaction.c | 2 -- fs/btrfs/tree-defrag.c | 1 - fs/btrfs/tree-log.c | 3 -- 15 files changed, 30 insertions(+), 219 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 771a036867dc..deef23b36c5b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1341,14 +1341,12 @@ again: goto out; } - if (!path->skip_locking) { + if (!path->skip_locking) btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - } ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie, ignore_offset); if (!path->skip_locking) - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); if (ret < 0) goto out; @@ -1685,7 +1683,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, name_off, name_len); if (eb != eb_in) { if (!path->skip_locking) - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); } ret = btrfs_find_item(fs_root, path, parent, 0, @@ -1705,8 +1703,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ if (eb != eb_in) { - if (!path->skip_locking) - btrfs_set_lock_blocking_read(eb); path->nodes[0] = NULL; path->locks[0] = 0; } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 113da62dc17f..ac8ebdf4d886 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1278,14 +1278,11 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, if (!tm) return eb; - btrfs_set_path_blocking(path); - btrfs_set_lock_blocking_read(eb); - if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); if (!eb_rewin) { - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); return NULL; } @@ -1297,13 +1294,13 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, } else { eb_rewin = btrfs_clone_extent_buffer(eb); if (!eb_rewin) { - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); return NULL; } } - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), @@ -1373,9 +1370,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(eb_root); eb = alloc_dummy_extent_buffer(fs_info, logical); } else { - btrfs_set_lock_blocking_read(eb_root); eb = btrfs_clone_extent_buffer(eb_root); - btrfs_tree_read_unlock_blocking(eb_root); + btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); } @@ -1483,10 +1479,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, search_start = buf->start & ~((u64)SZ_1G - 1); - if (parent) - btrfs_set_lock_blocking_write(parent); - btrfs_set_lock_blocking_write(buf); - /* * Before 
CoWing this block for later modification, check if it's * the subtree root and do the delayed subtree trace if needed. @@ -1604,8 +1596,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (parent_nritems <= 1) return 0; - btrfs_set_lock_blocking_write(parent); - for (i = start_slot; i <= end_slot; i++) { struct btrfs_key first_key; int close = 1; @@ -1663,7 +1653,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, search_start = last_block; btrfs_tree_lock(cur); - btrfs_set_lock_blocking_write(cur); err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, @@ -1835,8 +1824,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, mid = path->nodes[level]; - WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK && - path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING); + WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK); WARN_ON(btrfs_header_generation(mid) != trans->transid); orig_ptr = btrfs_node_blockptr(mid, orig_slot); @@ -1865,7 +1853,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } btrfs_tree_lock(child); - btrfs_set_lock_blocking_write(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child, BTRFS_NESTING_COW); if (ret) { @@ -1904,7 +1891,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (left) { __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); - btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, BTRFS_NESTING_LEFT_COW); @@ -1920,7 +1906,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (right) { __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); - btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, BTRFS_NESTING_RIGHT_COW); @@ -2084,7 +2069,6 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, u32 left_nr; __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); - btrfs_set_lock_blocking_write(left); left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -2139,7 +2123,6 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, u32 right_nr; __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); - btrfs_set_lock_blocking_write(right); right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -2399,14 +2382,6 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, return 0; } - /* the pages were up to date, but we failed - * the generation number check. Do a full - * read for the generation number that is correct. - * We must do this without dropping locks so - * we can trust our generation number - */ - btrfs_set_path_blocking(p); - /* now we're allowed to do a blocking uptodate check */ ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); if (!ret) { @@ -2426,7 +2401,6 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * out which blocks to read. 
*/ btrfs_unlock_up_safe(p, level + 1); - btrfs_set_path_blocking(p); if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); @@ -2480,7 +2454,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, goto again; } - btrfs_set_path_blocking(p); reada_for_balance(fs_info, p, level); sret = split_node(trans, root, p, level); @@ -2500,7 +2473,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, goto again; } - btrfs_set_path_blocking(p); reada_for_balance(fs_info, p, level); sret = balance_level(trans, root, p, level); @@ -2752,7 +2724,6 @@ again: goto again; } - btrfs_set_path_blocking(p); if (last_level) err = btrfs_cow_block(trans, root, b, NULL, 0, &b, @@ -2822,7 +2793,6 @@ cow_done: goto again; } - btrfs_set_path_blocking(p); err = split_leaf(trans, root, key, p, ins_len, ret == 0); @@ -2884,17 +2854,11 @@ cow_done: if (!p->skip_locking) { level = btrfs_header_level(b); if (level <= write_lock_level) { - if (!btrfs_try_tree_write_lock(b)) { - btrfs_set_path_blocking(p); - btrfs_tree_lock(b); - } + btrfs_tree_lock(b); p->locks[level] = BTRFS_WRITE_LOCK; } else { - if (!btrfs_tree_read_lock_atomic(b)) { - btrfs_set_path_blocking(p); - __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL, - p->recurse); - } + __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL, + p->recurse); p->locks[level] = BTRFS_READ_LOCK; } p->nodes[level] = b; @@ -2902,12 +2866,6 @@ cow_done: } ret = 1; done: - /* - * we don't really know what they plan on doing with the path - * from here on, so for now just mark it as blocking - */ - if (!p->leave_spinning) - btrfs_set_path_blocking(p); if (ret < 0 && !p->skip_release_on_error) btrfs_release_path(p); return ret; @@ -2999,10 +2957,7 @@ again: } level = btrfs_header_level(b); - if (!btrfs_tree_read_lock_atomic(b)) { - btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); - } + btrfs_tree_read_lock(b); b = tree_mod_log_rewind(fs_info, p, b, time_seq); if (!b) { ret = -ENOMEM; @@ -3013,8 +2968,6 @@ again: } ret = 1; done: - if (!p->leave_spinning) - btrfs_set_path_blocking(p); if (ret < 0) btrfs_release_path(p); @@ -3441,7 +3394,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); atomic_inc(&c->refs); path->nodes[level] = c; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; path->slots[level] = 0; return 0; } @@ -3814,7 +3767,6 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 1; __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); - btrfs_set_lock_blocking_write(right); free_space = btrfs_leaf_free_space(right); if (free_space < data_size) @@ -4053,7 +4005,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root return 1; __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); - btrfs_set_lock_blocking_write(left); free_space = btrfs_leaf_free_space(left); if (free_space < data_size) { @@ -4448,7 +4399,6 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, goto err; } - btrfs_set_path_blocking(path); ret = split_leaf(trans, root, &key, path, ins_len, 1); if (ret) goto err; @@ -4478,8 +4428,6 @@ static noinline int split_item(struct btrfs_path *path, leaf = path->nodes[0]; BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); - btrfs_set_path_blocking(path); - item = btrfs_item_nr(path->slots[0]); orig_offset = btrfs_item_offset(leaf, item); item_size = btrfs_item_size(leaf, item); @@ -5055,7 +5003,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct 
btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { - btrfs_set_path_blocking(path); btrfs_clean_tree_block(leaf); btrfs_del_leaf(trans, root, path, leaf); } @@ -5077,7 +5024,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, slot = path->slots[1]; atomic_inc(&leaf->refs); - btrfs_set_path_blocking(path); wret = push_leaf_left(trans, root, path, 1, 1, 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) @@ -5248,7 +5194,6 @@ find_next_key: */ if (slot >= nritems) { path->slots[level] = slot; - btrfs_set_path_blocking(path); sret = btrfs_find_next_key(root, path, min_key, level, min_trans); if (sret == 0) { @@ -5265,7 +5210,6 @@ find_next_key: ret = 0; goto out; } - btrfs_set_path_blocking(path); cur = btrfs_read_node_slot(cur, slot); if (IS_ERR(cur)) { ret = PTR_ERR(cur); @@ -5282,7 +5226,6 @@ out: path->keep_locks = keep_locks; if (ret == 0) { btrfs_unlock_up_safe(path, path->lowest_level + 1); - btrfs_set_path_blocking(path); memcpy(min_key, &found_key, sizeof(found_key)); } return ret; @@ -5492,7 +5435,6 @@ again: goto again; } if (!ret) { - btrfs_set_path_blocking(path); __btrfs_tree_read_lock(next, BTRFS_NESTING_RIGHT, path->recurse); @@ -5527,13 +5469,8 @@ again: } if (!path->skip_locking) { - ret = btrfs_try_tree_read_lock(next); - if (!ret) { - btrfs_set_path_blocking(path); - __btrfs_tree_read_lock(next, - BTRFS_NESTING_RIGHT, - path->recurse); - } + __btrfs_tree_read_lock(next, BTRFS_NESTING_RIGHT, + path->recurse); next_rw_lock = BTRFS_READ_LOCK; } } @@ -5541,8 +5478,6 @@ again: done: unlock_up(path, 0, 1, 0, NULL); path->leave_spinning = old_spinning; - if (!old_spinning) - btrfs_set_path_blocking(path); return ret; } @@ -5564,7 +5499,6 @@ int btrfs_previous_item(struct btrfs_root *root, while (1) { if (path->slots[0] == 0) { - btrfs_set_path_blocking(path); ret = btrfs_prev_leaf(root, path); if (ret != 0) return ret; @@ -5606,7 +5540,6 @@ int btrfs_previous_extent_item(struct btrfs_root *root, while (1) { if (path->slots[0] == 0) { - btrfs_set_path_blocking(path); ret = btrfs_prev_leaf(root, path); if (ret != 0) return ret; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 5aba81e16113..4b1cb4e17c03 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -740,13 +740,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, goto out; } - /* - * we need allocate some memory space, but it might cause the task - * to sleep, so we set all locked nodes in the path to blocking locks - * first. 
- */ - btrfs_set_path_blocking(path); - keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS); if (!keys) { ret = -ENOMEM; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c70a52b44ceb..212806d59012 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -250,10 +250,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (atomic) return -EAGAIN; - if (need_lock) { + if (need_lock) btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - } lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); @@ -282,7 +280,7 @@ out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); if (need_lock) - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); return ret; } @@ -1013,8 +1011,6 @@ void btrfs_clean_tree_block(struct extent_buffer *buf) percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -buf->len, fs_info->dirty_metadata_batch); - /* ugh, clear_extent_buffer_dirty needs to lock the page */ - btrfs_set_lock_blocking_write(buf); clear_extent_buffer_dirty(buf); } } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a27caa47aa62..24f3bde8ce5d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4665,7 +4665,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); - btrfs_set_lock_blocking_write(buf); set_extent_buffer_uptodate(buf); memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); @@ -5054,7 +5053,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, reada = 1; } btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], @@ -5114,7 +5112,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, return -EIO; } btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); } level--; @@ -5126,7 +5123,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } path->nodes[level] = next; path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; wc->level = level; if (wc->level == 1) wc->reada_slot = 0; @@ -5254,8 +5251,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (!path->locks[level]) { BUG_ON(level == 0); btrfs_tree_lock(eb); - btrfs_set_lock_blocking_write(eb); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, @@ -5298,8 +5294,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (!path->locks[level] && btrfs_header_generation(eb) == trans->transid) { btrfs_tree_lock(eb); - btrfs_set_lock_blocking_write(eb); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; } btrfs_clean_tree_block(eb); } @@ -5467,9 +5462,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); - btrfs_set_lock_blocking_write(path->nodes[level]); path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; memset(&wc->update_progress, 0, sizeof(wc->update_progress)); } else { @@ -5497,8 +5491,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int 
update_ref, int for_reloc) level = btrfs_header_level(root->node); while (1) { btrfs_tree_lock(path->nodes[level]); - btrfs_set_lock_blocking_write(path->nodes[level]); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, @@ -5685,7 +5678,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, level = btrfs_header_level(node); path->nodes[level] = node; path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; wc->refs[parent_level] = 1; wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5abdc7332dfc..97b5b183272f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -991,8 +991,7 @@ delete_extent_item: * write lock. */ if (!ret && replace_extent && leafs_visited == 1 && - (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || - path->locks[0] == BTRFS_WRITE_LOCK) && + path->locks[0] == BTRFS_WRITE_LOCK && btrfs_leaf_free_space(leaf) >= sizeof(struct btrfs_item) + extent_item_size) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d1915049d7e7..205ea4e6f1ec 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6789,7 +6789,6 @@ next: em->orig_start = em->start; ptr = btrfs_file_extent_inline_start(item) + extent_offset; - btrfs_set_path_blocking(path); if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 60e0f00b9b8f..5260660b655a 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -50,31 +50,6 @@ * */ -/* - * Mark already held read lock as blocking. Can be nested in write lock by the - * same thread. - * - * Use when there are potentially long operations ahead so other thread waiting - * on the lock will not actively spin but sleep instead. - * - * The rwlock is released and blocking reader counter is increased. - */ -void btrfs_set_lock_blocking_read(struct extent_buffer *eb) -{ -} - -/* - * Mark already held write lock as blocking. - * - * Use when there are potentially long operations ahead so other threads - * waiting on the lock will not actively spin but sleep instead. - * - * The rwlock is released and blocking writers is set. - */ -void btrfs_set_lock_blocking_write(struct extent_buffer *eb) -{ -} - /* * __btrfs_tree_read_lock - lock extent buffer for read * @eb: the eb to be locked @@ -130,17 +105,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false); } -/* - * Lock extent buffer for read, optimistically expecting that there are no - * contending blocking writers. If there are, don't wait. - * - * Return 1 if the rwlock has been taken, 0 otherwise - */ -int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) -{ - return btrfs_try_tree_read_lock(eb); -} - /* * Try-lock for read. * @@ -192,18 +156,6 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) up_read(&eb->lock); } -/* - * Release read lock, previously set to blocking by a pairing call to - * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same - * thread. - * - * State of rwlock is unchanged, last reader wakes waiting threads. 
- */ -void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) -{ - btrfs_tree_read_unlock(eb); -} - /* * __btrfs_tree_lock - lock eb for write * @eb: the eb to lock @@ -239,32 +191,6 @@ void btrfs_tree_unlock(struct extent_buffer *eb) up_write(&eb->lock); } -/* - * Set all locked nodes in the path to blocking locks. This should be done - * before scheduling - */ -void btrfs_set_path_blocking(struct btrfs_path *p) -{ - int i; - - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - if (!p->nodes[i] || !p->locks[i]) - continue; - /* - * If we currently have a spinning reader or writer lock this - * will bump the count of blocking holders and drop the - * spinlock. - */ - if (p->locks[i] == BTRFS_READ_LOCK) { - btrfs_set_lock_blocking_read(p->nodes[i]); - p->locks[i] = BTRFS_READ_LOCK_BLOCKING; - } else if (p->locks[i] == BTRFS_WRITE_LOCK) { - btrfs_set_lock_blocking_write(p->nodes[i]); - p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; - } - } -} - /* * This releases any locks held in the path starting at level and going all the * way up to the root. diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 7c27f142f7d2..f8f2fd835582 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -13,8 +13,6 @@ #define BTRFS_WRITE_LOCK 1 #define BTRFS_READ_LOCK 2 -#define BTRFS_WRITE_LOCK_BLOCKING 3 -#define BTRFS_READ_LOCK_BLOCKING 4 /* * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at @@ -93,12 +91,8 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne bool recurse); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); -void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); -void btrfs_set_lock_blocking_read(struct extent_buffer *eb); -void btrfs_set_lock_blocking_write(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); -int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, bool recurse); @@ -116,15 +110,12 @@ static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { } #endif -void btrfs_set_path_blocking(struct btrfs_path *p); void btrfs_unlock_up_safe(struct btrfs_path *path, int level); static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) { - if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING) + if (rw == BTRFS_WRITE_LOCK) btrfs_tree_unlock(eb); - else if (rw == BTRFS_READ_LOCK_BLOCKING) - btrfs_tree_read_unlock_blocking(eb); else if (rw == BTRFS_READ_LOCK) btrfs_tree_read_unlock(eb); else diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 87bd37b70738..9e97d3172a24 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1970,8 +1970,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, src_path->nodes[cur_level] = eb; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; + src_path->locks[cur_level] = BTRFS_READ_LOCK; } src_path->slots[cur_level] = dst_path->slots[cur_level]; @@ -2111,8 +2110,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, dst_path->slots[cur_level] = 0; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; + 
dst_path->locks[cur_level] = BTRFS_READ_LOCK; need_cleanup = true; } @@ -2286,8 +2284,7 @@ walk_down: path->slots[level] = 0; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + path->locks[level] = BTRFS_READ_LOCK; ret = btrfs_qgroup_trace_extent(trans, child_bytenr, fs_info->nodesize, diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 14b0c0dcb08c..488bc3dd3c2b 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -575,10 +575,9 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, return -EIO; } btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); path->nodes[level-1] = eb; path->slots[level-1] = 0; - path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING; + path->locks[level-1] = BTRFS_READ_LOCK; } else { ret = process_leaf(root, path, bytenr, num_bytes); if (ret) @@ -1000,11 +999,10 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) return -ENOMEM; eb = btrfs_read_lock_root_node(fs_info->extent_root); - btrfs_set_lock_blocking_read(eb); level = btrfs_header_level(eb); path->nodes[level] = eb; path->slots[level] = 0; - path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + path->locks[level] = BTRFS_READ_LOCK; while (1) { /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3d4618a01ef3..0b3ccf464c3d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1196,7 +1196,6 @@ again: btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); eb = btrfs_lock_root_node(dest); - btrfs_set_lock_blocking_write(eb); level = btrfs_header_level(eb); if (level < lowest_level) { @@ -1210,7 +1209,6 @@ again: BTRFS_NESTING_COW); BUG_ON(ret); } - btrfs_set_lock_blocking_write(eb); if (next_key) { next_key->objectid = (u64)-1; @@ -1279,7 +1277,6 @@ again: BTRFS_NESTING_COW); BUG_ON(ret); } - btrfs_set_lock_blocking_write(eb); btrfs_tree_unlock(parent); free_extent_buffer(parent); @@ -2309,7 +2306,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, goto next; } btrfs_tree_lock(eb); - btrfs_set_lock_blocking_write(eb); if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8f70d7135497..931bef8cdd4c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1598,8 +1598,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - btrfs_set_lock_blocking_write(old); - ret = btrfs_copy_root(trans, root, old, &tmp, objectid); /* clean up in any case */ btrfs_tree_unlock(old); diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index d3f28b8f4ff9..7c45d960b53c 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -52,7 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, u32 nritems; root_node = btrfs_lock_root_node(root); - btrfs_set_lock_blocking_write(root_node); nritems = btrfs_header_nritems(root_node); root->defrag_max.objectid = 0; /* from above we know this is not a leaf */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 135cb40295c1..eb86c632535a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2722,7 +2722,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -2791,7 +2790,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, if (trans) { 
btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -2873,7 +2871,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); -- cgit v1.2.3 From b9729ce014f6c22d4ca7fda97a7d8ea432af5f91 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:11 -0400 Subject: btrfs: locking: rip out path->leave_spinning We no longer distinguish between blocking and spinning, so rip out all this code. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 3 --- fs/btrfs/ctree.c | 3 --- fs/btrfs/ctree.h | 1 - fs/btrfs/delayed-inode.c | 4 ---- fs/btrfs/dir-item.c | 1 - fs/btrfs/export.c | 1 - fs/btrfs/extent-tree.c | 8 -------- fs/btrfs/extent_io.c | 1 - fs/btrfs/file-item.c | 4 ---- fs/btrfs/free-space-tree.c | 2 -- fs/btrfs/inode-item.c | 6 ------ fs/btrfs/inode.c | 12 ------------ fs/btrfs/ioctl.c | 1 - fs/btrfs/qgroup.c | 2 -- fs/btrfs/reflink.c | 3 --- fs/btrfs/super.c | 2 -- fs/btrfs/tests/qgroup-tests.c | 4 ---- 17 files changed, 58 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index deef23b36c5b..86abe34e9444 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1669,13 +1669,11 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, s64 bytes_left = ((s64)size) - 1; struct extent_buffer *eb = eb_in; struct btrfs_key found_key; - int leave_spinning = path->leave_spinning; struct btrfs_inode_ref *iref; if (bytes_left >= 0) dest[bytes_left] = '\0'; - path->leave_spinning = 1; while (1) { bytes_left -= name_len; if (bytes_left >= 0) @@ -1719,7 +1717,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, } btrfs_release_path(path); - path->leave_spinning = leave_spinning; if (ret) return ERR_PTR(ret); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ac8ebdf4d886..d2d5854d51a7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5327,7 +5327,6 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key key; u32 nritems; int ret; - int old_spinning = path->leave_spinning; int next_rw_lock = 0; nritems = btrfs_header_nritems(path->nodes[0]); @@ -5342,7 +5341,6 @@ again: btrfs_release_path(path); path->keep_locks = 1; - path->leave_spinning = 1; if (time_seq) ret = btrfs_search_old_slot(root, &key, path, time_seq); @@ -5477,7 +5475,6 @@ again: ret = 0; done: unlock_up(path, 0, 1, 0, NULL); - path->leave_spinning = old_spinning; return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f28960647c01..5e628dbce111 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -371,7 +371,6 @@ struct btrfs_path { unsigned int search_for_split:1; unsigned int keep_locks:1; unsigned int skip_locking:1; - unsigned int leave_spinning:1; unsigned int search_commit_root:1; unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 4b1cb4e17c03..b2a883ec8e7e 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1147,7 +1147,6 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; block_rsv = trans->block_rsv; trans->block_rsv 
= &fs_info->delayed_block_rsv; @@ -1212,7 +1211,6 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, btrfs_release_delayed_node(delayed_node); return -ENOMEM; } - path->leave_spinning = 1; block_rsv = trans->block_rsv; trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; @@ -1257,7 +1255,6 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode) ret = -ENOMEM; goto trans_out; } - path->leave_spinning = 1; block_rsv = trans->block_rsv; trans->block_rsv = &fs_info->delayed_block_rsv; @@ -1326,7 +1323,6 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) if (!delayed_node) break; - path->leave_spinning = 1; root = delayed_node->root; trans = btrfs_join_transaction(root); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 863367c2c620..98b63ebed539 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -127,7 +127,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; btrfs_cpu_key_to_disk(&disk_key, location); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1a8d419d9e1f..1d4c2397d0d6 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -222,7 +222,6 @@ static int btrfs_get_name(struct dentry *parent, char *name, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; if (ino == BTRFS_FIRST_FREE_OBJECTID) { key.objectid = BTRFS_I(inode)->root->root_key.objectid; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 24f3bde8ce5d..5bb0c56f7294 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1465,7 +1465,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, parent, root_objectid, owner, @@ -1489,7 +1488,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - path->leave_spinning = 1; /* now insert the actual backref */ if (owner < BTRFS_FIRST_FREE_OBJECTID) { BUG_ON(refs_to_add != 1); @@ -1605,7 +1603,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, } again: - path->leave_spinning = 1; ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); if (ret < 0) { err = ret; @@ -3021,8 +3018,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; - is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; if (!is_data && refs_to_drop != 1) { @@ -3087,7 +3082,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } btrfs_release_path(path); - path->leave_spinning = 1; /* Slow path to locate EXTENT/METADATA_ITEM */ key.objectid = bytenr; @@ -4429,7 +4423,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); if (ret) { @@ -4514,7 +4507,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, &extent_key, size); if (ret) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f3515d3c1321..119ced4a501b 100644 --- a/fs/btrfs/extent_io.c +++ 
b/fs/btrfs/extent_io.c @@ -4696,7 +4696,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; roots = ulist_alloc(GFP_KERNEL); tmp_ulist = ulist_alloc(GFP_KERNEL); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 5f3096ea69af..40daf1a4b46c 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -142,7 +142,6 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, file_key.offset = pos; file_key.type = BTRFS_EXTENT_DATA_KEY; - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, sizeof(*item)); if (ret < 0) @@ -706,7 +705,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, key.offset = end_byte - 1; key.type = BTRFS_EXTENT_CSUM_KEY; - path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { if (path->slots[0] == 0) @@ -990,10 +988,8 @@ insert: } else { ins_size = csum_size; } - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, ins_size); - path->leave_spinning = 0; if (ret < 0) goto out; if (WARN_ON(ret != 0)) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 37d210148c1d..e33a65bd9a0c 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1193,8 +1193,6 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; - key.objectid = 0; key.type = 0; key.offset = 0; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 668701832845..37f36ffdaf6b 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -119,8 +119,6 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) ret = -ENOENT; @@ -193,8 +191,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = -ENOENT; @@ -270,7 +266,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { @@ -327,7 +322,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; path->skip_release_on_error = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 205ea4e6f1ec..d3b2740f4d70 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -234,7 +234,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(cur_size); - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (ret) @@ -2596,7 +2595,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); if (ret) @@ -3588,7 +3586,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 1); if (ret) { @@ 
-3677,7 +3674,6 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto out; } - path->leave_spinning = 1; di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); if (IS_ERR_OR_NULL(di)) { @@ -6129,7 +6125,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, goto fail; } - path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); if (ret != 0) goto fail_unlock; @@ -6680,13 +6675,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, /* Chances are we'll be called again, so go ahead and do readahead */ path->reada = READA_FORWARD; - - /* - * Unless we're going to uncompress the inline extent, no sleep would - * happen. - */ - path->leave_spinning = 1; - path->recurse = btrfs_is_free_space_inode(inode); ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 69a384145dc6..ea40a19cc4cb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3393,7 +3393,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = -ENOMEM; goto out_free; } - path->leave_spinning = 1; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9e97d3172a24..78ffbad9b03d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -894,8 +894,6 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; - key.objectid = 0; key.offset = 0; key.type = 0; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 99aa87c08912..5d0bb7c3dc33 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -347,7 +347,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 drop_start; /* Note the key will change type as we walk through the tree */ - path->leave_spinning = 1; ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 0, 0); if (ret < 0) @@ -417,7 +416,6 @@ process_slot: size); btrfs_release_path(path); - path->leave_spinning = 0; memcpy(&new_key, &key, sizeof(new_key)); new_key.objectid = btrfs_ino(BTRFS_I(inode)); @@ -533,7 +531,6 @@ process_slot: * mixing buffered and direct IO writes against this file. 
*/ btrfs_release_path(path); - path->leave_spinning = 0; ret = btrfs_replace_file_extents(inode, path, last_dest_end, destoff + len - 1, NULL, &trans); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index bdb0b0c5fd64..6693cfc14dfd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1163,7 +1163,6 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, ret = -ENOMEM; goto err; } - path->leave_spinning = 1; name = kmalloc(PATH_MAX, GFP_KERNEL); if (!name) { @@ -1292,7 +1291,6 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; /* * Find the "default" dir item which points to the root item that we diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index ce1ca8e73c2d..f3137285a9e2 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -36,7 +36,6 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, return -ENOMEM; } - path->leave_spinning = 1; ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); if (ret) { test_err("couldn't insert ref %d", ret); @@ -86,7 +85,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, return -ENOMEM; } - path->leave_spinning = 1; ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); if (ret) { test_err("couldn't find extent ref"); @@ -135,7 +133,6 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, test_std_err(TEST_ALLOC_ROOT); return -ENOMEM; } - path->leave_spinning = 1; ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); if (ret) { @@ -170,7 +167,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, return -ENOMEM; } - path->leave_spinning = 1; ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); if (ret) { test_err("couldn't find extent ref"); -- cgit v1.2.3 From dc516164869300efd0bdbf6f894defc306588b75 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 29 Oct 2020 15:33:45 +0100 Subject: btrfs: reorder extent buffer members for better packing After the rwsem replaced the tree lock implementation, the extent buffer got smaller but leaving some holes behind. By changing log_index type and reordering, we can squeeze the size further to 240 bytes, measured on release config on x86_64. Log_index spans only 3 values and needs to be signed. 
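The effect is easiest to see with a stand-alone model before looking at the pahole dumps below. The sketch uses simplified stand-in types (a 40-byte 'sem' in place of the real rw_semaphore, and only the members involved), so the absolute sizes differ from extent_buffer, but it reproduces the same 8-byte saving: a 2-byte member squeezed between 8-byte-aligned fields wastes a whole alignment slot, while an s8 tucked into the existing hole after lock_recursed costs nothing. This is an illustration only, not the kernel structure.

    #include <stdio.h>

    struct sem { long word[5]; };   /* stand-in for the 40-byte rw_semaphore */

    struct eb_before {
        int lock_owner;
        char lock_recursed;
        /* 3 byte hole */
        struct sem lock;
        short log_index;
        /* 6 byte hole before the pointer array */
        void *pages[2];
    };

    struct eb_after {
        int lock_owner;
        char lock_recursed;
        signed char log_index;      /* s8 fits into the existing hole */
        /* 2 byte hole */
        struct sem lock;
        void *pages[2];
    };

    int main(void)
    {
        printf("before: %zu bytes\n", sizeof(struct eb_before)); /* 72 */
        printf("after:  %zu bytes\n", sizeof(struct eb_after));  /* 64 */
        return 0;
    }
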
Before: struct extent_buffer { u64 start; /* 0 8 */ long unsigned int len; /* 8 8 */ long unsigned int bflags; /* 16 8 */ struct btrfs_fs_info * fs_info; /* 24 8 */ spinlock_t refs_lock; /* 32 4 */ atomic_t refs; /* 36 4 */ atomic_t io_pages; /* 40 4 */ int read_mirror; /* 44 4 */ struct callback_head callback_head __attribute__((__aligned__(8))); /* 48 16 */ /* --- cacheline 1 boundary (64 bytes) --- */ pid_t lock_owner; /* 64 4 */ bool lock_recursed; /* 68 1 */ /* XXX 3 bytes hole, try to pack */ struct rw_semaphore lock; /* 72 40 */ short int log_index; /* 112 2 */ /* XXX 6 bytes hole, try to pack */ struct page * pages[16]; /* 120 128 */ /* size: 248, cachelines: 4, members: 14 */ /* sum members: 239, holes: 2, sum holes: 9 */ /* forced alignments: 1 */ /* last cacheline: 56 bytes */ } __attribute__((__aligned__(8))); After: struct extent_buffer { u64 start; /* 0 8 */ long unsigned int len; /* 8 8 */ long unsigned int bflags; /* 16 8 */ struct btrfs_fs_info * fs_info; /* 24 8 */ spinlock_t refs_lock; /* 32 4 */ atomic_t refs; /* 36 4 */ atomic_t io_pages; /* 40 4 */ int read_mirror; /* 44 4 */ struct callback_head callback_head __attribute__((__aligned__(8))); /* 48 16 */ /* --- cacheline 1 boundary (64 bytes) --- */ pid_t lock_owner; /* 64 4 */ bool lock_recursed; /* 68 1 */ s8 log_index; /* 69 1 */ /* XXX 2 bytes hole, try to pack */ struct rw_semaphore lock; /* 72 40 */ struct page * pages[16]; /* 112 128 */ /* size: 240, cachelines: 4, members: 14 */ /* sum members: 238, holes: 1, sum holes: 2 */ /* forced alignments: 1 */ /* last cacheline: 48 bytes */ } __attribute__((__aligned__(8))); Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent_io.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5403354de0e1..3c2bf21c54eb 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -88,10 +88,10 @@ struct extent_buffer { struct rcu_head rcu_head; pid_t lock_owner; bool lock_recursed; - struct rw_semaphore lock; - /* >= 0 if eb belongs to a log tree, -1 otherwise */ - short log_index; + s8 log_index; + + struct rw_semaphore lock; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG -- cgit v1.2.3 From 9076dbd5ee837c3882fc42891c14cecd0354a849 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 23 Oct 2020 09:58:04 -0400 Subject: btrfs: do not shorten unpin len for caching block groups While fixing up our ->last_byte_to_unpin locking I noticed that we will shorten len based on ->last_byte_to_unpin if we're caching when we're adding back the free space. This is correct for the free space, as we cannot unpin more than ->last_byte_to_unpin, however we use len to adjust the ->bytes_pinned counters and such, which need to track the actual pinned usage. This could result in WARN_ON(space_info->bytes_pinned) triggering at unmount time. Fix this by using a local variable for the amount to add to free space cache, and leave len untouched in this case. 
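To make the counter skew concrete, here is a minimal user-space model (plain C, the byte values are invented for illustration and none of this is the kernel code): shortening the shared 'len' leaves bytes_pinned non-zero even though the whole range was unpinned, while clamping only the amount handed to the free space cache keeps the counter correct.

    #include <stdio.h>

    typedef unsigned long long u64;

    int main(void)
    {
        u64 len = 8;           /* bytes actually being unpinned in this range */
        u64 unpinnable = 4;    /* portion that lies below ->last_byte_to_unpin */
        u64 bytes_pinned = 8;  /* pinned counter before unpinning */

        /* old behaviour: 'len' itself was shortened, the pinned counter is
         * only decremented by 4 and never reaches zero -> WARN at unmount */
        printf("buggy: bytes_pinned left = %llu\n", bytes_pinned - unpinnable);

        /* fixed behaviour: only the free-space addition uses the clamped
         * value, the counters are still adjusted by the full 'len' */
        u64 add_len = unpinnable;
        printf("fixed: added %llu to free space, bytes_pinned left = %llu\n",
               add_len, bytes_pinned - len);
        return 0;
    }
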
CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5bb0c56f7294..0345f9d97964 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2794,10 +2794,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, len = cache->start + cache->length - start; len = min(len, end + 1 - start); - if (start < cache->last_byte_to_unpin) { - len = min(len, cache->last_byte_to_unpin - start); - if (return_free_space) - btrfs_add_free_space(cache, start, len); + if (start < cache->last_byte_to_unpin && return_free_space) { + u64 add_len = min(len, cache->last_byte_to_unpin - start); + + btrfs_add_free_space(cache, start, add_len); } start += len; -- cgit v1.2.3 From 27d56e62e4748c2135650c260024e9904b8c1a0a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 23 Oct 2020 09:58:05 -0400 Subject: btrfs: update last_byte_to_unpin in switch_commit_roots While writing an explanation for the need of the commit_root_sem for btrfs_prepare_extent_commit, I realized we have a slight hole that could result in leaked space if we have to do the old style caching. Consider the following scenario commit root +----+----+----+----+----+----+----+ |\\\\| |\\\\|\\\\| |\\\\|\\\\| +----+----+----+----+----+----+----+ 0 1 2 3 4 5 6 7 new commit root +----+----+----+----+----+----+----+ | | | |\\\\| | |\\\\| +----+----+----+----+----+----+----+ 0 1 2 3 4 5 6 7 Prior to this patch, we run btrfs_prepare_extent_commit, which updates the last_byte_to_unpin, and then we subsequently run switch_commit_roots. In this example lets assume that caching_ctl->progress == 1 at btrfs_prepare_extent_commit() time, which means that cache->last_byte_to_unpin == 1. Then we go and do the switch_commit_roots(), but in the meantime the caching thread has made some more progress, because we drop the commit_root_sem and re-acquired it. Now caching_ctl->progress == 3. We swap out the commit root and carry on to unpin. The race can happen like: 1) The caching thread was running using the old commit root when it found the extent for [2, 3); 2) Then it released the commit_root_sem because it was in the last item of a leaf and the semaphore was contended, and set ->progress to 3 (value of 'last'), as the last extent item in the current leaf was for the extent for range [2, 3); 3) Next time it gets the commit_root_sem, will start using the new commit root and search for a key with offset 3, so it never finds the hole for [2, 3). So the caching thread never saw [2, 3) as free space in any of the commit roots, and by the time finish_extent_commit() was called for the range [0, 3), ->last_byte_to_unpin was 1, so it only returned the subrange [0, 1) to the free space cache, skipping [2, 3). In the unpin code we have last_byte_to_unpin == 1, so we unpin [0,1), but do not unpin [2,3). However because caching_ctl->progress == 3 we do not see the newly freed section of [2,3), and thus do not add it to our free space cache. This results in us missing a chunk of free space in memory (on disk too, unless we have a power failure before writing the free space cache to disk). Fix this by making sure the ->last_byte_to_unpin is set at the same time that we swap the commit roots, this ensures that we will always be consistent. 
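The window can be modelled in a few lines of stand-alone C, using the byte ranges from the diagram above. This is only a sketch of the bookkeeping: the second loop stands in for the caching thread re-scanning the new commit root from its saved progress, not the real caching code.

    #include <stdio.h>
    #include <stdbool.h>

    #define N 8

    int main(void)
    {
        /* bytes freed by this commit, per the diagram: [0,1), [2,3), [5,6) */
        bool freed[N] = { true, false, true, false, false, true, false, false };
        bool reaches_cache[N] = { false };
        unsigned int last_byte_to_unpin = 1; /* progress at prepare time (old code) */
        unsigned int caching_progress = 3;   /* where the caching thread resumes */
        unsigned int i;

        /* unpin path: only the part below last_byte_to_unpin is returned */
        for (i = 0; i < last_byte_to_unpin; i++)
            if (freed[i])
                reaches_cache[i] = true;

        /* caching thread: scans the new commit root from its progress onward */
        for (i = caching_progress; i < N; i++)
            if (freed[i])
                reaches_cache[i] = true;

        for (i = 0; i < N; i++)
            if (freed[i] && !reaches_cache[i])
                printf("byte %u never reaches the free space cache\n", i); /* byte 2 */

        /* with the fix, last_byte_to_unpin is set to caching_progress (3)
         * under the commit_root_sem, so the first loop covers byte 2 too */
        return 0;
    }
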
CC: stable@vger.kernel.org # 5.8+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik [ update changelog with Filipe's review comments ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/extent-tree.c | 25 ------------------------- fs/btrfs/transaction.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 40 insertions(+), 28 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5e628dbce111..2aa3d882aede 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2610,7 +2610,6 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); -void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0345f9d97964..27523e6b6c2d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2708,31 +2708,6 @@ btrfs_inc_block_group_reservations(struct btrfs_block_group *bg) atomic_inc(&bg->reservations); } -void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) -{ - struct btrfs_caching_control *next; - struct btrfs_caching_control *caching_ctl; - struct btrfs_block_group *cache; - - down_write(&fs_info->commit_root_sem); - - list_for_each_entry_safe(caching_ctl, next, - &fs_info->caching_block_groups, list) { - cache = caching_ctl->block_group; - if (btrfs_block_group_done(cache)) { - cache->last_byte_to_unpin = (u64)-1; - list_del_init(&caching_ctl->list); - btrfs_put_caching_control(caching_ctl); - } else { - cache->last_byte_to_unpin = caching_ctl->progress; - } - } - - up_write(&fs_info->commit_root_sem); - - btrfs_update_global_block_rsv(fs_info); -} - /* * Returns the free cluster for the given space info and sets empty_cluster to * what it should be based on the mount options. diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 931bef8cdd4c..89597ea57115 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -155,6 +155,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root, *tmp; + struct btrfs_caching_control *caching_ctl, *next; down_write(&fs_info->commit_root_sem); list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits, @@ -180,6 +181,45 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) spin_lock(&cur_trans->dropped_roots_lock); } spin_unlock(&cur_trans->dropped_roots_lock); + + /* + * We have to update the last_byte_to_unpin under the commit_root_sem, + * at the same time we swap out the commit roots. + * + * This is because we must have a real view of the last spot the caching + * kthreads were while caching. 
Consider the following views of the + * extent tree for a block group + * + * commit root + * +----+----+----+----+----+----+----+ + * |\\\\| |\\\\|\\\\| |\\\\|\\\\| + * +----+----+----+----+----+----+----+ + * 0 1 2 3 4 5 6 7 + * + * new commit root + * +----+----+----+----+----+----+----+ + * | | | |\\\\| | |\\\\| + * +----+----+----+----+----+----+----+ + * 0 1 2 3 4 5 6 7 + * + * If the cache_ctl->progress was at 3, then we are only allowed to + * unpin [0,1) and [2,3], because the caching thread has already + * processed those extents. We are not allowed to unpin [5,6), because + * the caching thread will re-start it's search from 3, and thus find + * the hole from [4,6) to add to the free space cache. + */ + list_for_each_entry_safe(caching_ctl, next, + &fs_info->caching_block_groups, list) { + struct btrfs_block_group *cache = caching_ctl->block_group; + + if (btrfs_block_group_done(cache)) { + cache->last_byte_to_unpin = (u64)-1; + list_del_init(&caching_ctl->list); + btrfs_put_caching_control(caching_ctl); + } else { + cache->last_byte_to_unpin = caching_ctl->progress; + } + } up_write(&fs_info->commit_root_sem); } @@ -2254,8 +2294,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto unlock_tree_log; } - btrfs_prepare_extent_commit(fs_info); - cur_trans = fs_info->running_transaction; btrfs_set_root_node(&fs_info->tree_root->root_item, -- cgit v1.2.3 From 2ca08c56e813323ee470f7fd8d836f30600e3960 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 23 Oct 2020 09:58:06 -0400 Subject: btrfs: explicitly protect ->last_byte_to_unpin in unpin_extent_range Currently unpin_extent_range happens in the transaction commit context, so we are protected from ->last_byte_to_unpin changing while we're unpinning, because any new transactions would have to wait for us to complete before modifying ->last_byte_to_unpin. However in the future we may want to change how this works, for instance with async unpinning or other such TODO items. To prepare for that future explicitly protect ->last_byte_to_unpin with the commit_root_sem so we are sure it won't change while we're doing our work. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 27523e6b6c2d..d7a68203cda0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2769,11 +2769,13 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, len = cache->start + cache->length - start; len = min(len, end + 1 - start); + down_read(&fs_info->commit_root_sem); if (start < cache->last_byte_to_unpin && return_free_space) { u64 add_len = min(len, cache->last_byte_to_unpin - start); btrfs_add_free_space(cache, start, add_len); } + up_read(&fs_info->commit_root_sem); start += len; total_unpinned += len; -- cgit v1.2.3 From 66b53bae46c84e00e276ee6e539eedfbcfe78573 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 23 Oct 2020 09:58:07 -0400 Subject: btrfs: cleanup btrfs_discard_update_discardable usage This passes in the block_group and the free_space_ctl, but we can get this from the block group itself. Part of this is because we call it from __load_free_space_cache, which can be called for the inode cache as well. 
Move that call into the block group specific load section, wrap it in the right lock that we need for the assertion (but otherwise this is safe without the lock because this happens in single-thread context). Fix up the arguments to only take the block group. Add a lockdep_assert as well for good measure to make sure we don't mess up the locking again. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/discard.c | 7 ++++--- fs/btrfs/discard.h | 3 +-- fs/btrfs/free-space-cache.c | 14 ++++++++------ 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 741c7e19c32f..5a88b584276f 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -563,15 +563,14 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) /** * btrfs_discard_update_discardable - propagate discard counters * @block_group: block_group of interest - * @ctl: free_space_ctl of @block_group * * This propagates deltas of counters up to the discard_ctl. It maintains a * current counter and a previous counter passing the delta up to the global * stat. Then the current counter value becomes the previous counter value. */ -void btrfs_discard_update_discardable(struct btrfs_block_group *block_group, - struct btrfs_free_space_ctl *ctl) +void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) { + struct btrfs_free_space_ctl *ctl; struct btrfs_discard_ctl *discard_ctl; s32 extents_delta; s64 bytes_delta; @@ -581,8 +580,10 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group, !btrfs_is_block_group_data_only(block_group)) return; + ctl = block_group->free_space_ctl; discard_ctl = &block_group->fs_info->discard_ctl; + lockdep_assert_held(&ctl->tree_lock); extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] - ctl->discardable_extents[BTRFS_STAT_PREV]; if (extents_delta) { diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h index 353228d62f5a..57b9202f427f 100644 --- a/fs/btrfs/discard.h +++ b/fs/btrfs/discard.h @@ -28,8 +28,7 @@ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl); /* Update operations */ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl); -void btrfs_discard_update_discardable(struct btrfs_block_group *block_group, - struct btrfs_free_space_ctl *ctl); +void btrfs_discard_update_discardable(struct btrfs_block_group *block_group); /* Setup/cleanup operations */ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5ea36a06e514..0787339c7b93 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -828,7 +828,6 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, merge_space_tree(ctl); ret = 1; out: - btrfs_discard_update_discardable(ctl->private, ctl); io_ctl_free(&io_ctl); return ret; free_cache: @@ -929,6 +928,9 @@ out: block_group->start); } + spin_lock(&ctl->tree_lock); + btrfs_discard_update_discardable(block_group); + spin_unlock(&ctl->tree_lock); iput(inode); return ret; } @@ -2508,7 +2510,7 @@ link: if (ret) kmem_cache_free(btrfs_free_space_cachep, info); out: - btrfs_discard_update_discardable(block_group, ctl); + btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); if (ret) { @@ -2643,7 +2645,7 @@ again: goto again; } out_lock: - btrfs_discard_update_discardable(block_group, ctl); + 
btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); out: return ret; @@ -2779,7 +2781,7 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) spin_lock(&ctl->tree_lock); __btrfs_remove_free_space_cache_locked(ctl); if (ctl->private) - btrfs_discard_update_discardable(ctl->private, ctl); + btrfs_discard_update_discardable(ctl->private); spin_unlock(&ctl->tree_lock); } @@ -2801,7 +2803,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) cond_resched_lock(&ctl->tree_lock); } __btrfs_remove_free_space_cache_locked(ctl); - btrfs_discard_update_discardable(block_group, ctl); + btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); } @@ -2885,7 +2887,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, link_free_space(ctl, entry); } out: - btrfs_discard_update_discardable(block_group, ctl); + btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); if (align_gap_len) -- cgit v1.2.3 From cd79909bc7cdd8043a22d699aae1e8435792c824 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 23 Oct 2020 09:58:08 -0400 Subject: btrfs: load free space cache into a temporary ctl The free space cache has been special in that we would load it right away instead of farming the work off to a worker thread. This resulted in some weirdness that had to be taken into account for this fact, namely that if we every found a block group being cached the fast way we had to wait for it to finish, because we could get the cache before it had been validated and we may throw the cache away. To handle this particular case instead create a temporary btrfs_free_space_ctl to load the free space cache into. Then once we've validated that it makes sense, copy it's contents into the actual block_group->free_space_ctl. This allows us to avoid the problems of needing to wait for the caching to complete, we can clean up the discard extent handling stuff in __load_free_space_cache, and we no longer need to do the merge_space_tree() because the space is added one by one into the real free_space_ctl. This will allow further reworks of how we handle loading the free space cache. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 29 +------- fs/btrfs/free-space-cache.c | 155 ++++++++++++++++++------------------------- fs/btrfs/free-space-cache.h | 3 +- fs/btrfs/tests/btrfs-tests.c | 2 +- 4 files changed, 70 insertions(+), 119 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index bb6685711824..adbd18dc08a1 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -695,33 +695,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); spin_lock(&cache->lock); - /* - * This should be a rare occasion, but this could happen I think in the - * case where one thread starts to load the space cache info, and then - * some other thread starts a transaction commit which tries to do an - * allocation while the other thread is still loading the space cache - * info. 
The previous loop should have kept us from choosing this block - * group, but if we've moved to the state where we will wait on caching - * block groups we need to first check if we're doing a fast load here, - * so we can wait for it to finish, otherwise we could end up allocating - * from a block group who's cache gets evicted for one reason or - * another. - */ - while (cache->cached == BTRFS_CACHE_FAST) { - struct btrfs_caching_control *ctl; - - ctl = cache->caching_ctl; - refcount_inc(&ctl->count); - prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&cache->lock); - - schedule(); - - finish_wait(&ctl->wait, &wait); - btrfs_put_caching_control(ctl); - spin_lock(&cache->lock); - } - if (cache->cached != BTRFS_CACHE_NO) { spin_unlock(&cache->lock); kfree(caching_ctl); @@ -1805,7 +1778,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( INIT_LIST_HEAD(&cache->discard_list); INIT_LIST_HEAD(&cache->dirty_list); INIT_LIST_HEAD(&cache->io_list); - btrfs_init_free_space_ctl(cache); + btrfs_init_free_space_ctl(cache, cache->free_space_ctl); atomic_set(&cache->frozen, 0); mutex_init(&cache->free_space_lock); btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0787339c7b93..58bd2d3e54db 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -33,8 +33,6 @@ struct btrfs_trim_range { struct list_head list; }; -static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *bitmap_info); static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, @@ -43,6 +41,14 @@ static int btrfs_wait_cache_io_root(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_io_ctl *io_ctl, struct btrfs_path *path); +static int search_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info, u64 *offset, + u64 *bytes, bool for_alloc); +static void free_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info); +static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes); static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, @@ -625,44 +631,6 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, return 0; } -/* - * Since we attach pinned extents after the fact we can have contiguous sections - * of free space that are split up in entries. This poses a problem with the - * tree logging stuff since it could have allocated across what appears to be 2 - * entries since we would have merged the entries when adding the pinned extents - * back to the free space cache. So run through the space cache that we just - * loaded and merge contiguous entries. This will make the log replay stuff not - * blow up and it will make for nicer allocator behavior. 
- */ -static void merge_space_tree(struct btrfs_free_space_ctl *ctl) -{ - struct btrfs_free_space *e, *prev = NULL; - struct rb_node *n; - -again: - spin_lock(&ctl->tree_lock); - for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { - e = rb_entry(n, struct btrfs_free_space, offset_index); - if (!prev) - goto next; - if (e->bitmap || prev->bitmap) - goto next; - if (prev->offset + prev->bytes == e->offset) { - unlink_free_space(ctl, prev); - unlink_free_space(ctl, e); - prev->bytes += e->bytes; - kmem_cache_free(btrfs_free_space_cachep, e); - link_free_space(ctl, prev); - prev = NULL; - spin_unlock(&ctl->tree_lock); - goto again; - } -next: - prev = e; - } - spin_unlock(&ctl->tree_lock); -} - static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_path *path, u64 offset) @@ -753,16 +721,6 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, goto free_cache; } - /* - * Sync discard ensures that the free space cache is always - * trimmed. So when reading this in, the state should reflect - * that. We also do this for async as a stop gap for lack of - * persistence. - */ - if (btrfs_test_opt(fs_info, DISCARD_SYNC) || - btrfs_test_opt(fs_info, DISCARD_ASYNC)) - e->trim_state = BTRFS_TRIM_STATE_TRIMMED; - if (!e->bytes) { kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; @@ -816,16 +774,9 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ret = io_ctl_read_bitmap(&io_ctl, e); if (ret) goto free_cache; - e->bitmap_extents = count_bitmap_extents(ctl, e); - if (!btrfs_free_space_trimmed(e)) { - ctl->discardable_extents[BTRFS_STAT_CURR] += - e->bitmap_extents; - ctl->discardable_bytes[BTRFS_STAT_CURR] += e->bytes; - } } io_ctl_drop_pages(&io_ctl); - merge_space_tree(ctl); ret = 1; out: io_ctl_free(&io_ctl); @@ -836,16 +787,59 @@ free_cache: goto out; } +static int copy_free_space_cache(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int ret = 0; + + while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (!info->bitmap) { + unlink_free_space(ctl, info); + ret = btrfs_add_free_space(block_group, info->offset, + info->bytes); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + u64 offset = info->offset; + u64 bytes = ctl->unit; + + while (search_bitmap(ctl, info, &offset, &bytes, + false) == 0) { + ret = btrfs_add_free_space(block_group, offset, + bytes); + if (ret) + break; + bitmap_clear_bits(ctl, info, offset, bytes); + offset = info->offset; + bytes = ctl->unit; + } + free_bitmap(ctl, info); + } + cond_resched(); + } + return ret; +} + int load_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space_ctl tmp_ctl = {}; struct inode *inode; struct btrfs_path *path; int ret = 0; bool matched; u64 used = block_group->used; + /* + * Because we could potentially discard our loaded free space, we want + * to load everything into a temporary structure first, and then if it's + * valid copy it all into the actual free space ctl. + */ + btrfs_init_free_space_ctl(block_group, &tmp_ctl); + /* * If this block group has been marked to be cleared for one reason or * another then we can't trust the on disk cache, so just return. 
@@ -897,19 +891,25 @@ int load_free_space_cache(struct btrfs_block_group *block_group) } spin_unlock(&block_group->lock); - ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, + ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, path, block_group->start); btrfs_free_path(path); if (ret <= 0) goto out; - spin_lock(&ctl->tree_lock); - matched = (ctl->free_space == (block_group->length - used - - block_group->bytes_super)); - spin_unlock(&ctl->tree_lock); + matched = (tmp_ctl.free_space == (block_group->length - used - + block_group->bytes_super)); - if (!matched) { - __btrfs_remove_free_space_cache(ctl); + if (matched) { + ret = copy_free_space_cache(block_group, &tmp_ctl); + /* + * ret == 1 means we successfully loaded the free space cache, + * so we need to re-set it here. + */ + if (ret == 0) + ret = 1; + } else { + __btrfs_remove_free_space_cache(&tmp_ctl); btrfs_warn(fs_info, "block group %llu has wrong amount of free space", block_group->start); @@ -1914,29 +1914,6 @@ out: return NULL; } -static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *bitmap_info) -{ - struct btrfs_block_group *block_group = ctl->private; - u64 bytes = bitmap_info->bytes; - unsigned int rs, re; - int count = 0; - - if (!block_group || !bytes) - return count; - - bitmap_for_each_set_region(bitmap_info->bitmap, rs, re, 0, - BITS_PER_BITMAP) { - bytes -= (rs - re) * ctl->unit; - count++; - - if (!bytes) - break; - } - - return count; -} - static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset) { @@ -2676,10 +2653,10 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, "%d blocks of free space at or bigger than bytes is", count); } -void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group) +void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; spin_lock_init(&ctl->tree_lock); ctl->unit = fs_info->sectorsize; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index e3d5e0ad8f8e..bf8d127d2407 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -109,7 +109,8 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_path *path, struct inode *inode); -void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group); +void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl); int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, struct btrfs_free_space_ctl *ctl, u64 bytenr, u64 size, diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index b36e88753f1f..8ca334d554af 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -225,7 +225,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->bg_list); - btrfs_init_free_space_ctl(cache); + btrfs_init_free_space_ctl(cache, cache->free_space_ctl); mutex_init(&cache->free_space_lock); return cache; -- cgit v1.2.3 From 4d7240f0abda6a75ce54e8d488db2e4ca4460185 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 23 Oct 2020 09:58:09 -0400 Subject: btrfs: load the free space cache inode extents from commit root Historically we've allowed recursive locking specifically for the free space inode. 
This is because we are only doing reads and know that it's safe. However we don't actually need this feature, we can get away with reading the commit root for the extents. In fact if we want to allow asynchronous loading of the free space cache we have to use the commit root, otherwise we will deadlock. Switch to using the commit root for the file extents. These are only read at load time, and are replaced as soon as we start writing the cache out to disk. The cache is never read again, so this is legitimate. This matches what we do for the inode itself, as we read that from the commit root as well. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3b2740f4d70..d2ef8308ad0a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6675,7 +6675,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, /* Chances are we'll be called again, so go ahead and do readahead */ path->reada = READA_FORWARD; - path->recurse = btrfs_is_free_space_inode(inode); + + /* + * The same explanation in load_free_space_cache applies here as well, + * we only read when we're loading the free space cache, and at that + * point the commit_root has everything we need. + */ + if (btrfs_is_free_space_inode(inode)) { + path->search_commit_root = 1; + path->skip_locking = 1; + } ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { -- cgit v1.2.3 From e747853cae3ae332ce81cf1c12d8b3df45041949 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 23 Oct 2020 09:58:10 -0400 Subject: btrfs: load free space cache asynchronously While documenting the usage of the commit_root_sem, I noticed that we do not actually take the commit_root_sem in the case of the free space cache. This is problematic because we're supposed to hold that sem while we're reading the commit roots, which is what we do for the free space cache. The reason I did it inline when I originally wrote the code was because there's the case of unpinning where we need to make sure that the free space cache is loaded if we're going to use the free space cache. But we can accomplish the same thing by simply waiting for the cache to be loaded. Rework this code to load the free space cache asynchronously. This allows us to greatly cleanup the caching code because now it's all shared by the various caching methods. We also are now in a position to have the commit_root semaphore held while we're loading the free space cache. And finally our modification of ->last_byte_to_unpin is removed because it can be handled in the proper way on commit. Some care must be taken when replaying the log, when we expect that the free space cache will be read entirely before we start excluding space to replay. This could lead to overwriting space during replay. 
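For readers who want the shape of the change without walking the whole diff, the waiting side reduces to a small helper pair, shown here as a condensed restatement of what the patch adds below (see the block-group.c hunks for the exact code):

	static bool space_cache_v1_done(struct btrfs_block_group *cache)
	{
		bool ret;

		/* The caching thread moves us past CACHE_FAST once the v1 cache is loaded */
		spin_lock(&cache->lock);
		ret = cache->cached != BTRFS_CACHE_FAST;
		spin_unlock(&cache->lock);

		return ret;
	}

	void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
						struct btrfs_caching_control *caching_ctl)
	{
		/* Sleep until the async caching work has loaded (or failed to load) the cache */
		wait_event(caching_ctl->wait, space_cache_v1_done(cache));
	}

Callers that must have the cache loaded, such as the load_cache_only path in btrfs_cache_block_group(), wait on this helper instead of doing the load inline.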
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 123 +++++++++++++++++++++---------------------------- fs/btrfs/block-group.h | 2 + fs/btrfs/extent-tree.c | 5 ++ 3 files changed, 60 insertions(+), 70 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index adbd18dc08a1..b922bebb29cc 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -424,6 +424,23 @@ int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache) return ret; } +static bool space_cache_v1_done(struct btrfs_block_group *cache) +{ + bool ret; + + spin_lock(&cache->lock); + ret = cache->cached != BTRFS_CACHE_FAST; + spin_unlock(&cache->lock); + + return ret; +} + +void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache, + struct btrfs_caching_control *caching_ctl) +{ + wait_event(caching_ctl->wait, space_cache_v1_done(cache)); +} + #ifdef CONFIG_BTRFS_DEBUG static void fragment_free_space(struct btrfs_block_group *block_group) { @@ -639,11 +656,28 @@ static noinline void caching_thread(struct btrfs_work *work) mutex_lock(&caching_ctl->mutex); down_read(&fs_info->commit_root_sem); + if (btrfs_test_opt(fs_info, SPACE_CACHE)) { + ret = load_free_space_cache(block_group); + if (ret == 1) { + ret = 0; + goto done; + } + + /* + * We failed to load the space cache, set ourselves to + * CACHE_STARTED and carry on. + */ + spin_lock(&block_group->lock); + block_group->cached = BTRFS_CACHE_STARTED; + spin_unlock(&block_group->lock); + wake_up(&caching_ctl->wait); + } + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) ret = load_free_space_tree(caching_ctl); else ret = load_extent_tree_free(caching_ctl); - +done: spin_lock(&block_group->lock); block_group->caching_ctl = NULL; block_group->cached = ret ? 
BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; @@ -679,7 +713,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only { DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; - struct btrfs_caching_control *caching_ctl; + struct btrfs_caching_control *caching_ctl = NULL; int ret = 0; caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); @@ -691,84 +725,28 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only init_waitqueue_head(&caching_ctl->wait); caching_ctl->block_group = cache; caching_ctl->progress = cache->start; - refcount_set(&caching_ctl->count, 1); + refcount_set(&caching_ctl->count, 2); btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); kfree(caching_ctl); - return 0; + + caching_ctl = cache->caching_ctl; + if (caching_ctl) + refcount_inc(&caching_ctl->count); + spin_unlock(&cache->lock); + goto out; } WARN_ON(cache->caching_ctl); cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_FAST; + if (btrfs_test_opt(fs_info, SPACE_CACHE)) + cache->cached = BTRFS_CACHE_FAST; + else + cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; spin_unlock(&cache->lock); - if (btrfs_test_opt(fs_info, SPACE_CACHE)) { - mutex_lock(&caching_ctl->mutex); - ret = load_free_space_cache(cache); - - spin_lock(&cache->lock); - if (ret == 1) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_FINISHED; - cache->last_byte_to_unpin = (u64)-1; - caching_ctl->progress = (u64)-1; - } else { - if (load_cache_only) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_NO; - } else { - cache->cached = BTRFS_CACHE_STARTED; - cache->has_caching_ctl = 1; - } - } - spin_unlock(&cache->lock); -#ifdef CONFIG_BTRFS_DEBUG - if (ret == 1 && - btrfs_should_fragment_free_space(cache)) { - u64 bytes_used; - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - bytes_used = cache->length - cache->used; - cache->space_info->bytes_used += bytes_used >> 1; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - fragment_free_space(cache); - } -#endif - mutex_unlock(&caching_ctl->mutex); - - wake_up(&caching_ctl->wait); - if (ret == 1) { - btrfs_put_caching_control(caching_ctl); - btrfs_free_excluded_extents(cache); - return 0; - } - } else { - /* - * We're either using the free space tree or no caching at all. - * Set cached to the appropriate value and wakeup any waiters. 
- */ - spin_lock(&cache->lock); - if (load_cache_only) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_NO; - } else { - cache->cached = BTRFS_CACHE_STARTED; - cache->has_caching_ctl = 1; - } - spin_unlock(&cache->lock); - wake_up(&caching_ctl->wait); - } - - if (load_cache_only) { - btrfs_put_caching_control(caching_ctl); - return 0; - } - down_write(&fs_info->commit_root_sem); refcount_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); @@ -777,6 +755,11 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only btrfs_get_block_group(cache); btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); +out: + if (load_cache_only && caching_ctl) + btrfs_wait_space_cache_v1_finished(cache, caching_ctl); + if (caching_ctl) + btrfs_put_caching_control(caching_ctl); return ret; } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index adfd7583a17b..8f74a96074f7 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -268,6 +268,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); +void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache, + struct btrfs_caching_control *caching_ctl); static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d7a68203cda0..5c82cfdb0944 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2641,6 +2641,11 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, BUG_ON(!btrfs_block_group_done(block_group)); ret = btrfs_remove_free_space(block_group, start, num_bytes); } else { + /* + * We must wait for v1 caching to finish, otherwise we may not + * remove our space. + */ + btrfs_wait_space_cache_v1_finished(block_group, caching_ctl); mutex_lock(&caching_ctl->mutex); if (start >= caching_ctl->progress) { -- cgit v1.2.3 From bbb86a3717917c7b16da545f9c421ab6a3448306 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 23 Oct 2020 09:58:11 -0400 Subject: btrfs: protect fs_info->caching_block_groups by block_group_cache_lock I got the following lockdep splat ====================================================== WARNING: possible circular locking dependency detected 5.9.0+ #101 Not tainted ------------------------------------------------------ btrfs-cleaner/3445 is trying to acquire lock: ffff89dbec39ab48 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x32/0x170 but task is already holding lock: ffff89dbeaf28a88 (&fs_info->commit_root_sem){++++}-{3:3}, at: btrfs_find_all_roots+0x41/0x80 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (&fs_info->commit_root_sem){++++}-{3:3}: down_write+0x3d/0x70 btrfs_cache_block_group+0x2d5/0x510 find_free_extent+0xb6e/0x12f0 btrfs_reserve_extent+0xb3/0x1b0 btrfs_alloc_tree_block+0xb1/0x330 alloc_tree_block_no_bg_flush+0x4f/0x60 __btrfs_cow_block+0x11d/0x580 btrfs_cow_block+0x10c/0x220 commit_cowonly_roots+0x47/0x2e0 btrfs_commit_transaction+0x595/0xbd0 sync_filesystem+0x74/0x90 generic_shutdown_super+0x22/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 deactivate_locked_super+0x36/0xa0 cleanup_mnt+0x12d/0x190 task_work_run+0x5c/0xa0 exit_to_user_mode_prepare+0x1df/0x200 syscall_exit_to_user_mode+0x54/0x280 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&space_info->groups_sem){++++}-{3:3}: down_read+0x40/0x130 find_free_extent+0x2ed/0x12f0 btrfs_reserve_extent+0xb3/0x1b0 btrfs_alloc_tree_block+0xb1/0x330 alloc_tree_block_no_bg_flush+0x4f/0x60 __btrfs_cow_block+0x11d/0x580 btrfs_cow_block+0x10c/0x220 commit_cowonly_roots+0x47/0x2e0 btrfs_commit_transaction+0x595/0xbd0 sync_filesystem+0x74/0x90 generic_shutdown_super+0x22/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 deactivate_locked_super+0x36/0xa0 cleanup_mnt+0x12d/0x190 task_work_run+0x5c/0xa0 exit_to_user_mode_prepare+0x1df/0x200 syscall_exit_to_user_mode+0x54/0x280 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (btrfs-root-00){++++}-{3:3}: __lock_acquire+0x1167/0x2150 lock_acquire+0xb9/0x3d0 down_read_nested+0x43/0x130 __btrfs_tree_read_lock+0x32/0x170 __btrfs_read_lock_root_node+0x3a/0x50 btrfs_search_slot+0x614/0x9d0 btrfs_find_root+0x35/0x1b0 btrfs_read_tree_root+0x61/0x120 btrfs_get_root_ref+0x14b/0x600 find_parent_nodes+0x3e6/0x1b30 btrfs_find_all_roots_safe+0xb4/0x130 btrfs_find_all_roots+0x60/0x80 btrfs_qgroup_trace_extent_post+0x27/0x40 btrfs_add_delayed_data_ref+0x3fd/0x460 btrfs_free_extent+0x42/0x100 __btrfs_mod_ref+0x1d7/0x2f0 walk_up_proc+0x11c/0x400 walk_up_tree+0xf0/0x180 btrfs_drop_snapshot+0x1c7/0x780 btrfs_clean_one_deleted_snapshot+0xfb/0x110 cleaner_kthread+0xd4/0x140 kthread+0x13a/0x150 ret_from_fork+0x1f/0x30 other info that might help us debug this: Chain exists of: btrfs-root-00 --> &space_info->groups_sem --> &fs_info->commit_root_sem Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fs_info->commit_root_sem); lock(&space_info->groups_sem); lock(&fs_info->commit_root_sem); lock(btrfs-root-00); *** DEADLOCK *** 3 locks held by btrfs-cleaner/3445: #0: ffff89dbeaf28838 (&fs_info->cleaner_mutex){+.+.}-{3:3}, at: cleaner_kthread+0x6e/0x140 #1: ffff89dbeb6c7640 (sb_internal){.+.+}-{0:0}, at: start_transaction+0x40b/0x5c0 #2: ffff89dbeaf28a88 (&fs_info->commit_root_sem){++++}-{3:3}, at: btrfs_find_all_roots+0x41/0x80 stack backtrace: CPU: 0 PID: 3445 Comm: btrfs-cleaner Not tainted 5.9.0+ #101 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack+0x8b/0xb0 check_noncircular+0xcf/0xf0 __lock_acquire+0x1167/0x2150 ? __bfs+0x42/0x210 lock_acquire+0xb9/0x3d0 ? __btrfs_tree_read_lock+0x32/0x170 down_read_nested+0x43/0x130 ? __btrfs_tree_read_lock+0x32/0x170 __btrfs_tree_read_lock+0x32/0x170 __btrfs_read_lock_root_node+0x3a/0x50 btrfs_search_slot+0x614/0x9d0 ? find_held_lock+0x2b/0x80 btrfs_find_root+0x35/0x1b0 ? 
do_raw_spin_unlock+0x4b/0xa0 btrfs_read_tree_root+0x61/0x120 btrfs_get_root_ref+0x14b/0x600 find_parent_nodes+0x3e6/0x1b30 btrfs_find_all_roots_safe+0xb4/0x130 btrfs_find_all_roots+0x60/0x80 btrfs_qgroup_trace_extent_post+0x27/0x40 btrfs_add_delayed_data_ref+0x3fd/0x460 btrfs_free_extent+0x42/0x100 __btrfs_mod_ref+0x1d7/0x2f0 walk_up_proc+0x11c/0x400 walk_up_tree+0xf0/0x180 btrfs_drop_snapshot+0x1c7/0x780 ? btrfs_clean_one_deleted_snapshot+0x73/0x110 btrfs_clean_one_deleted_snapshot+0xfb/0x110 cleaner_kthread+0xd4/0x140 ? btrfs_alloc_root+0x50/0x50 kthread+0x13a/0x150 ? kthread_create_worker_on_cpu+0x40/0x40 ret_from_fork+0x1f/0x30 while testing another lockdep fix. This happens because we're using the commit_root_sem to protect fs_info->caching_block_groups, which creates a dependency on the groups_sem -> commit_root_sem, which is problematic because we will allocate blocks while holding tree roots. Fix this by making the list itself protected by the fs_info->block_group_cache_lock. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 12 ++++++------ fs/btrfs/transaction.c | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index b922bebb29cc..35be6dbca5e8 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -747,10 +747,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only cache->has_caching_ctl = 1; spin_unlock(&cache->lock); - down_write(&fs_info->commit_root_sem); + spin_lock(&fs_info->block_group_cache_lock); refcount_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - up_write(&fs_info->commit_root_sem); + spin_unlock(&fs_info->block_group_cache_lock); btrfs_get_block_group(cache); @@ -999,7 +999,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (block_group->cached == BTRFS_CACHE_STARTED) btrfs_wait_block_group_cache_done(block_group); if (block_group->has_caching_ctl) { - down_write(&fs_info->commit_root_sem); + spin_lock(&fs_info->block_group_cache_lock); if (!caching_ctl) { struct btrfs_caching_control *ctl; @@ -1013,7 +1013,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, } if (caching_ctl) list_del_init(&caching_ctl->list); - up_write(&fs_info->commit_root_sem); + spin_unlock(&fs_info->block_group_cache_lock); if (caching_ctl) { /* Once for the caching bgs list and once for us. */ btrfs_put_caching_control(caching_ctl); @@ -3311,14 +3311,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; - down_write(&info->commit_root_sem); + spin_lock(&info->block_group_cache_lock); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, struct btrfs_caching_control, list); list_del(&caching_ctl->list); btrfs_put_caching_control(caching_ctl); } - up_write(&info->commit_root_sem); + spin_unlock(&info->block_group_cache_lock); spin_lock(&info->unused_bgs_lock); while (!list_empty(&info->unused_bgs)) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 89597ea57115..b671ea4d80e1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -208,6 +208,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) * the caching thread will re-start it's search from 3, and thus find * the hole from [4,6) to add to the free space cache. 
*/ + spin_lock(&fs_info->block_group_cache_lock); list_for_each_entry_safe(caching_ctl, next, &fs_info->caching_block_groups, list) { struct btrfs_block_group *cache = caching_ctl->block_group; @@ -220,6 +221,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) cache->last_byte_to_unpin = caching_ctl->progress; } } + spin_unlock(&fs_info->block_group_cache_lock); up_write(&fs_info->commit_root_sem); } -- cgit v1.2.3 From 0d01e247a06b9f36f685edf6c2e74f79f60df9cd Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 21 Oct 2020 14:25:02 +0800 Subject: btrfs: assert page mapping lock in attach_extent_buffer_page When calling attach_extent_buffer_page(), either we're attaching anonymous pages, called from btrfs_clone_extent_buffer(), or we're attaching btree inode pages, called from alloc_extent_buffer(). For the latter case, we should hold page->mapping->private_lock to avoid parallel changes to page->private. Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 119ced4a501b..092ccb811eac 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3102,6 +3102,15 @@ static int submit_extent_page(unsigned int opf, static void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) { + /* + * If the page is mapped to btree inode, we should hold the private + * lock to prevent race. + * For cloned or dummy extent buffers, their pages are not mapped and + * will not race with any other ebs. + */ + if (page->mapping) + lockdep_assert_held(&page->mapping->private_lock); + if (!PagePrivate(page)) attach_page_private(page, eb); else -- cgit v1.2.3 From 478ef8868ff80372e29d1c5283f360cf49ab0a8b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 21 Oct 2020 14:25:05 +0800 Subject: btrfs: make buffer_radix take sector size units For subpage sector size support, one page can contain multiple tree blocks. The entries cannot be based on page size and index must be derived from the sectorsize. No change for page size == sector size. 
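As a hedged numeric illustration (the page and sector sizes here are picked for the example, they are not taken from the patch): with 64K pages and 4K sectors, two 16K tree blocks at 0x10000 and 0x14000 live in the same page, so indexing by page would collide while indexing by sector keeps them distinct:

	/* Illustrative values only: PAGE_SHIFT == 16, fs_info->sectorsize_bits == 12 */
	0x10000 >> PAGE_SHIFT;			/* 1 */
	0x14000 >> PAGE_SHIFT;			/* 1, same key, would collide */
	0x10000 >> fs_info->sectorsize_bits;	/* 16 */
	0x14000 >> fs_info->sectorsize_bits;	/* 20, distinct keys */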
Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent_io.c | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2aa3d882aede..591151266372 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -912,6 +912,7 @@ struct btrfs_fs_info { /* Extent buffer radix tree */ spinlock_t buffer_lock; + /* Entries are eb->start / sectorsize */ struct radix_tree_root buffer_radix; /* next backup root to be overwritten */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 092ccb811eac..1dfbe859c650 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5107,7 +5107,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, rcu_read_lock(); eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> PAGE_SHIFT); + start >> fs_info->sectorsize_bits); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); /* @@ -5159,7 +5159,7 @@ again: } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, - start >> PAGE_SHIFT, eb); + start >> fs_info->sectorsize_bits, eb); spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { @@ -5267,7 +5267,7 @@ again: spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, - start >> PAGE_SHIFT, eb); + start >> fs_info->sectorsize_bits, eb); spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { @@ -5323,7 +5323,7 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_lock(&fs_info->buffer_lock); radix_tree_delete(&fs_info->buffer_radix, - eb->start >> PAGE_SHIFT); + eb->start >> fs_info->sectorsize_bits); spin_unlock(&fs_info->buffer_lock); } else { spin_unlock(&eb->refs_lock); -- cgit v1.2.3 From 2f4d60dfae0ee4ad1c8d57e102c3b032b8f9d4d5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 3 Nov 2020 21:30:46 +0800 Subject: btrfs: grab fs_info from extent_buffer in btrfs_mark_buffer_dirty Since commit f28491e0a6c4 ("Btrfs: move the extent buffer radix tree into the fs_info"), fs_info can be grabbed from extent_buffer directly. 
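In other words, as the diff below shows, any function that already holds the extent buffer can simply do:

	struct btrfs_fs_info *fs_info = buf->fs_info;

instead of going through buf->pages[0]->mapping->host to reach the root and only then the fs_info.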
Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 212806d59012..abc929a47d4b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4187,8 +4187,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { - struct btrfs_fs_info *fs_info; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info = buf->fs_info; u64 transid = btrfs_header_generation(buf); int was_dirty; @@ -4201,8 +4200,6 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) return; #endif - root = BTRFS_I(buf->pages[0]->mapping->host)->root; - fs_info = root->fs_info; btrfs_assert_tree_locked(buf); if (transid != fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", -- cgit v1.2.3 From a26663e7a2f456b8111de0135394c04c72831930 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 3 Nov 2020 21:30:47 +0800 Subject: btrfs: make csum_tree_block() handle node smaller than page For subpage size support, metadata blocks of nodesize are smaller than one page and this needs to be handled when calculating the checksum. The checksummed start and length need to be adjusted but only for the first page: - start is simply offset in the page - length is nodesize (subpage) or PAGE_SIZE for all other cases Reviewed-by: Nikolay Borisov Signed-off-by: Goldwyn Rodrigues Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index abc929a47d4b..1c8c94dbe984 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -212,15 +212,16 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) { struct btrfs_fs_info *fs_info = buf->fs_info; const int num_pages = fs_info->nodesize >> PAGE_SHIFT; + const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize); SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; int i; shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - kaddr = page_address(buf->pages[0]); + kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start); crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, - PAGE_SIZE - BTRFS_CSUM_SIZE); + first_page_part - BTRFS_CSUM_SIZE); for (i = 1; i < num_pages; i++) { kaddr = page_address(buf->pages[i]); -- cgit v1.2.3 From 77bf40a2ba2e6eded53a74f94d1be448bd8db030 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 3 Nov 2020 21:30:48 +0800 Subject: btrfs: extract extent buffer verification from btrfs_validate_metadata_buffer() Currently btrfs_validate_metadata_buffer() only needs to handle one extent buffer as currently one page maps to at most one extent buffer. For incoming subpage support, we need to extend the support where one page could contain multiple extent buffers. Split the function so we can call validate_extent_buffer on extent buffers independently. 
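A hedged sketch of where this is heading (the iteration below is hypothetical and not part of this patch): once one page can back several extent buffers, the read completion path can validate each one independently with the helper factored out here:

	/*
	 * Hypothetical subpage caller: walk every extent buffer attached to
	 * the page (however the page ends up tracking them) and run the
	 * per-eb checks factored out by this patch.
	 */
	for (i = 0; i < nr_ebs_in_page; i++) {
		ret = validate_extent_buffer(ebs[i]);
		if (ret)
			break;
	}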
Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 78 +++++++++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 36 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1c8c94dbe984..de18055bede4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -523,60 +523,35 @@ static int check_tree_block_fsid(struct extent_buffer *eb) return 1; } -int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int mirror) +/* Do basic extent buffer checks at read time */ +static int validate_extent_buffer(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; - int found_level; - struct extent_buffer *eb; - struct btrfs_fs_info *fs_info; - u16 csum_size; - int ret = 0; + const u32 csum_size = fs_info->csum_size; + u8 found_level; u8 result[BTRFS_CSUM_SIZE]; - int reads_done; - - if (!page->private) - goto out; - - eb = (struct extent_buffer *)page->private; - fs_info = eb->fs_info; - csum_size = fs_info->csum_size; - - /* the pending IO might have been the only thing that kept this buffer - * in memory. Make sure we have a ref for all this other checks - */ - atomic_inc(&eb->refs); - - reads_done = atomic_dec_and_test(&eb->io_pages); - if (!reads_done) - goto err; - - eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { - ret = -EIO; - goto err; - } + int ret = 0; found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu", eb->start, found_start); ret = -EIO; - goto err; + goto out; } if (check_tree_block_fsid(eb)) { btrfs_err_rl(fs_info, "bad fsid on block %llu", eb->start); ret = -EIO; - goto err; + goto out; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "bad tree block level %d on %llu", (int)btrfs_header_level(eb), eb->start); ret = -EIO; - goto err; + goto out; } btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), @@ -595,7 +570,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb)); ret = -EUCLEAN; - goto err; + goto out; } /* @@ -617,6 +592,37 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, btrfs_err(fs_info, "block=%llu read time tree block corruption detected", eb->start); +out: + return ret; +} + +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror) +{ + struct extent_buffer *eb; + int ret = 0; + int reads_done; + + ASSERT(page->private); + eb = (struct extent_buffer *)page->private; + + /* + * The pending IO might have been the only thing that kept this buffer + * in memory. 
Make sure we have a ref for all this other checks + */ + atomic_inc(&eb->refs); + + reads_done = atomic_dec_and_test(&eb->io_pages); + if (!reads_done) + goto err; + + eb->read_mirror = mirror; + if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { + ret = -EIO; + goto err; + } + ret = validate_extent_buffer(eb); err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) @@ -632,7 +638,7 @@ err: clear_extent_buffer_uptodate(eb); } free_extent_buffer(eb); -out: + return ret; } -- cgit v1.2.3 From ac303b6987a9633ef11447a861d24752387dbdfc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 3 Nov 2020 21:30:49 +0800 Subject: btrfs: pass bvec to csum_dirty_buffer instead of page Currently csum_dirty_buffer() uses page to grab extent buffer, but that only works for sector size == PAGE_SIZE case. For subpage we need page + page_offset to grab extent buffer. Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index de18055bede4..3edbbbcbe75a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -444,12 +444,13 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb, } /* - * checksum a dirty tree block before IO. This has extra checks to make sure - * we only fill in the checksum field in the first page of a multi-page block + * Checksum a dirty tree block before IO. This has extra checks to make sure + * we only fill in the checksum field in the first page of a multi-page block. + * For subpage extent buffers we need bvec to also read the offset in the page. */ - -static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) +static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec) { + struct page *page = bvec->bv_page; u64 start = page_offset(page); u64 found_start; u8 result[BTRFS_CSUM_SIZE]; @@ -791,7 +792,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; - ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); + ret = csum_dirty_buffer(root->fs_info, bvec); if (ret) break; } -- cgit v1.2.3 From 261d2dcb24302b220281f989d978268310a31bed Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 3 Nov 2020 21:31:01 +0800 Subject: btrfs: scrub: distinguish scrub page from regular page There are several call sites where we declare something like "struct scrub_page *page". This is confusing as we also use regular page in this code, rename it to 'spage' where applicable. 
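To make the distinction concrete, after the rename the two kinds of page read apart at a glance (pattern taken from the diff below):

	struct scrub_page *spage = sblock->pagev[page_num];	/* scrub bookkeeping */
	struct page *page = spage->page;			/* the backing memory page */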
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 102 +++++++++++++++++++++++++++---------------------------- 1 file changed, 51 insertions(+), 51 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 58cd3278fbfe..42d1d5258e83 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -255,10 +255,10 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_put_ctx(struct scrub_ctx *sctx); -static inline int scrub_is_page_on_raid56(struct scrub_page *page) +static inline int scrub_is_page_on_raid56(struct scrub_page *spage) { - return page->recover && - (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); + return spage->recover && + (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); } static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -1090,11 +1090,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) success = 1; for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { - struct scrub_page *page_bad = sblock_bad->pagev[page_num]; + struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; struct scrub_block *sblock_other = NULL; /* skip no-io-error page in scrub */ - if (!page_bad->io_error && !sctx->is_dev_replace) + if (!spage_bad->io_error && !sctx->is_dev_replace) continue; if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) { @@ -1106,7 +1106,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * sblock_for_recheck array to target device. */ sblock_other = NULL; - } else if (page_bad->io_error) { + } else if (spage_bad->io_error) { /* try to find no-io-error page in mirrors */ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS && @@ -1145,7 +1145,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sblock_other, page_num, 0); if (0 == ret) - page_bad->io_error = 0; + spage_bad->io_error = 0; else success = 0; } @@ -1323,13 +1323,13 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) { struct scrub_block *sblock; - struct scrub_page *page; + struct scrub_page *spage; sblock = sblocks_for_recheck + mirror_index; sblock->sctx = sctx; - page = kzalloc(sizeof(*page), GFP_NOFS); - if (!page) { + spage = kzalloc(sizeof(*spage), GFP_NOFS); + if (!spage) { leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -1337,15 +1337,15 @@ leave_nomem: scrub_put_recover(fs_info, recover); return -ENOMEM; } - scrub_page_get(page); - sblock->pagev[page_index] = page; - page->sblock = sblock; - page->flags = flags; - page->generation = generation; - page->logical = logical; - page->have_csum = have_csum; + scrub_page_get(spage); + sblock->pagev[page_index] = spage; + spage->sblock = sblock; + spage->flags = flags; + spage->generation = generation; + spage->logical = logical; + spage->have_csum = have_csum; if (have_csum) - memcpy(page->csum, + memcpy(spage->csum, original_sblock->pagev[0]->csum, sctx->fs_info->csum_size); @@ -1358,23 +1358,23 @@ leave_nomem: mirror_index, &stripe_index, &stripe_offset); - page->physical = bbio->stripes[stripe_index].physical + + spage->physical = bbio->stripes[stripe_index].physical + stripe_offset; - page->dev = bbio->stripes[stripe_index].dev; + spage->dev = bbio->stripes[stripe_index].dev; BUG_ON(page_index >= original_sblock->page_count); - 
page->physical_for_dev_replace = + spage->physical_for_dev_replace = original_sblock->pagev[page_index]-> physical_for_dev_replace; /* for missing devices, dev->bdev is NULL */ - page->mirror_num = mirror_index + 1; + spage->mirror_num = mirror_index + 1; sblock->page_count++; - page->page = alloc_page(GFP_NOFS); - if (!page->page) + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) goto leave_nomem; scrub_get_recover(recover); - page->recover = recover; + spage->recover = recover; } scrub_put_recover(fs_info, recover); length -= sublen; @@ -1392,19 +1392,19 @@ static void scrub_bio_wait_endio(struct bio *bio) static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, struct bio *bio, - struct scrub_page *page) + struct scrub_page *spage) { DECLARE_COMPLETION_ONSTACK(done); int ret; int mirror_num; - bio->bi_iter.bi_sector = page->logical >> 9; + bio->bi_iter.bi_sector = spage->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; - mirror_num = page->sblock->pagev[0]->mirror_num; - ret = raid56_parity_recover(fs_info, bio, page->recover->bbio, - page->recover->map_length, + mirror_num = spage->sblock->pagev[0]->mirror_num; + ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio, + spage->recover->map_length, mirror_num, 0); if (ret) return ret; @@ -1429,10 +1429,10 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, bio_set_dev(bio, first_page->dev->bdev); for (page_num = 0; page_num < sblock->page_count; page_num++) { - struct scrub_page *page = sblock->pagev[page_num]; + struct scrub_page *spage = sblock->pagev[page_num]; - WARN_ON(!page->page); - bio_add_page(bio, page->page, PAGE_SIZE, 0); + WARN_ON(!spage->page); + bio_add_page(bio, spage->page, PAGE_SIZE, 0); } if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) { @@ -1473,24 +1473,24 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; - struct scrub_page *page = sblock->pagev[page_num]; + struct scrub_page *spage = sblock->pagev[page_num]; - if (page->dev->bdev == NULL) { - page->io_error = 1; + if (spage->dev->bdev == NULL) { + spage->io_error = 1; sblock->no_io_error_seen = 0; continue; } - WARN_ON(!page->page); + WARN_ON(!spage->page); bio = btrfs_io_bio_alloc(1); - bio_set_dev(bio, page->dev->bdev); + bio_set_dev(bio, spage->dev->bdev); - bio_add_page(bio, page->page, PAGE_SIZE, 0); - bio->bi_iter.bi_sector = page->physical >> 9; + bio_add_page(bio, spage->page, PAGE_SIZE, 0); + bio->bi_iter.bi_sector = spage->physical >> 9; bio->bi_opf = REQ_OP_READ; if (btrfsic_submit_bio_wait(bio)) { - page->io_error = 1; + spage->io_error = 1; sblock->no_io_error_seen = 0; } @@ -1546,36 +1546,36 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int page_num, int force_write) { - struct scrub_page *page_bad = sblock_bad->pagev[page_num]; - struct scrub_page *page_good = sblock_good->pagev[page_num]; + struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; + struct scrub_page *spage_good = sblock_good->pagev[page_num]; struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; - BUG_ON(page_bad->page == NULL); - BUG_ON(page_good->page == NULL); + BUG_ON(spage_bad->page == NULL); + BUG_ON(spage_good->page == NULL); if (force_write || sblock_bad->header_error || - sblock_bad->checksum_error || page_bad->io_error) { + sblock_bad->checksum_error || spage_bad->io_error) { struct bio *bio; int ret; - if 
(!page_bad->dev->bdev) { + if (!spage_bad->dev->bdev) { btrfs_warn_rl(fs_info, "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); return -EIO; } bio = btrfs_io_bio_alloc(1); - bio_set_dev(bio, page_bad->dev->bdev); - bio->bi_iter.bi_sector = page_bad->physical >> 9; + bio_set_dev(bio, spage_bad->dev->bdev); + bio->bi_iter.bi_sector = spage_bad->physical >> 9; bio->bi_opf = REQ_OP_WRITE; - ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); + ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { bio_put(bio); return -EIO; } if (btrfsic_submit_bio_wait(bio)) { - btrfs_dev_stat_inc_and_print(page_bad->dev, + btrfs_dev_stat_inc_and_print(spage_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); atomic64_inc(&fs_info->dev_replace.num_write_errors); bio_put(bio); -- cgit v1.2.3 From 96e63a45fb9a40ba49813c1e538358f3cedbedba Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 3 Nov 2020 21:31:02 +0800 Subject: btrfs: scrub: remove the force parameter from scrub_pages The @force parameter for scrub_pages() is to indicate whether we want to force bio submission. Currently it's only used for the super block, and it can be easily determined by the @flags, so we can remove the parameter. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 42d1d5258e83..7ca11ea001a1 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -236,7 +236,7 @@ static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, int force, + u64 gen, int mirror_num, u8 *csum, u64 physical_for_dev_replace); static void scrub_bio_end_io(struct bio *bio); static void scrub_bio_end_io_worker(struct btrfs_work *work); @@ -2150,7 +2150,7 @@ bbio_out: static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, int force, + u64 gen, int mirror_num, u8 *csum, u64 physical_for_dev_replace) { struct scrub_block *sblock; @@ -2229,7 +2229,7 @@ leave_nomem: } } - if (force) + if (flags & BTRFS_EXTENT_FLAG_SUPER) scrub_submit(sctx); } @@ -2441,7 +2441,7 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, ++sctx->stat.no_csum; } ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, - mirror_num, have_csum ? csum : NULL, 0, + mirror_num, have_csum ? csum : NULL, physical_for_dev_replace); if (ret) return ret; @@ -3710,7 +3710,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, - NULL, 1, bytenr); + NULL, bytenr); if (ret) return ret; } -- cgit v1.2.3 From 480a8ec83b179da1c484133b0f687090e89b00c5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 3 Nov 2020 21:31:04 +0800 Subject: btrfs: scrub: refactor scrub_find_csum() Function scrub_find_csum() is to locate the csum for bytenr @logical from sctx->csum_list. However it lacks a lot of comments to explain things like how the csum_list is organized and why we need to drop csum range which is before us. 
Refactor the function by: - Add more comments explaining the behavior - Add comment explaining why we need to drop the csum range - Put the csum copy in the main loop This is mostly for the incoming patches to make scrub_find_csum() able to find multiple checksums. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 67 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 20 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 7ca11ea001a1..57ee06d92150 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2362,38 +2362,65 @@ static void scrub_block_complete(struct scrub_block *sblock) } } +static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum) +{ + sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits; + list_del(&sum->list); + kfree(sum); +} + +/* + * Find the desired csum for range [logical, logical + sectorsize), and store + * the csum into @csum. + * + * The search source is sctx->csum_list, which is a pre-populated list + * storing bytenr ordered csum ranges. We're reponsible to cleanup any range + * that is before @logical. + * + * Return 0 if there is no csum for the range. + * Return 1 if there is csum for the range and copied to @csum. + */ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) { - struct btrfs_ordered_sum *sum = NULL; - unsigned long index; - unsigned long num_sectors; + bool found = false; while (!list_empty(&sctx->csum_list)) { + struct btrfs_ordered_sum *sum = NULL; + unsigned long index; + unsigned long num_sectors; + sum = list_first_entry(&sctx->csum_list, struct btrfs_ordered_sum, list); + /* The current csum range is beyond our range, no csum found */ if (sum->bytenr > logical) - return 0; - if (sum->bytenr + sum->len > logical) break; - ++sctx->stat.csum_discards; - list_del(&sum->list); - kfree(sum); - sum = NULL; - } - if (!sum) - return 0; + /* + * The current sum is before our bytenr, since scrub is always + * done in bytenr order, the csum will never be used anymore, + * clean it up so that later calls won't bother with the range, + * and continue search the next range. + */ + if (sum->bytenr + sum->len <= logical) { + drop_csum_range(sctx, sum); + continue; + } - index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits; - ASSERT(index < UINT_MAX); + /* Now the csum range covers our bytenr, copy the csum */ + found = true; + index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits; + num_sectors = sum->len >> sctx->fs_info->sectorsize_bits; - num_sectors = sum->len >> sctx->fs_info->sectorsize_bits; - memcpy(csum, sum->sums + index * sctx->fs_info->csum_size, - sctx->fs_info->csum_size); - if (index == num_sectors - 1) { - list_del(&sum->list); - kfree(sum); + memcpy(csum, sum->sums + index * sctx->fs_info->csum_size, + sctx->fs_info->csum_size); + + /* Cleanup the range if we're at the end of the csum range */ + if (index == num_sectors - 1) + drop_csum_range(sctx, sum); + break; } + if (!found) + return 0; return 1; } -- cgit v1.2.3 From e50404a8a6997f9c0d412fa21f07a0da8e3891a2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 4 Nov 2020 09:45:51 +0000 Subject: btrfs: discard: speed up async discard up to iops_limit Instead of using iops_limit only for cutting off extremes, calculate the discard delay directly from it, so it closely follows iops_limit and doesn't under-discard even though quotas are not saturated. 
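For example (numbers chosen only to illustrate the new formula from the diff below), with iops_limit set to 100 the per-iteration delay becomes MSEC_PER_SEC / 100 = 10ms, still subject to the clamp:

	if (iops_limit)
		delay = MSEC_PER_SEC / iops_limit;	/* e.g. 1000 / 100 = 10ms */
	else
		delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;

	delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC,
		      BTRFS_DISCARD_MAX_DELAY_MSEC);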
The iops limit could be hit more often in some cases and could increase the discard rate. Reviewed-by: Josef Bacik Signed-off-by: Pavel Begunkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/discard.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 5a88b584276f..91afa43520dc 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -519,7 +519,6 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) s64 discardable_bytes; u32 iops_limit; unsigned long delay; - unsigned long lower_limit = BTRFS_DISCARD_MIN_DELAY_MSEC; discardable_extents = atomic_read(&discard_ctl->discardable_extents); if (!discardable_extents) @@ -550,11 +549,12 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) iops_limit = READ_ONCE(discard_ctl->iops_limit); if (iops_limit) - lower_limit = max_t(unsigned long, lower_limit, - MSEC_PER_SEC / iops_limit); + delay = MSEC_PER_SEC / iops_limit; + else + delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents; - delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents; - delay = clamp(delay, lower_limit, BTRFS_DISCARD_MAX_DELAY_MSEC); + delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC, + BTRFS_DISCARD_MAX_DELAY_MSEC); discard_ctl->delay = msecs_to_jiffies(delay); spin_unlock(&discard_ctl->lock); -- cgit v1.2.3 From 6e88f116bd4cf34406fc70a6d6bf5b4d49e1ab2d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 4 Nov 2020 09:45:52 +0000 Subject: btrfs: discard: store async discard delay as ns not as jiffies Most delay calculations are done in ns or ms, so store discard_ctl->delay in ms and convert the final delay to jiffies only at the end. Reviewed-by: Josef Bacik Signed-off-by: Pavel Begunkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/discard.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 591151266372..fbbfe8afa980 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -472,7 +472,7 @@ struct btrfs_discard_ctl { atomic_t discardable_extents; atomic64_t discardable_bytes; u64 max_discard_size; - unsigned long delay; + u64 delay_ms; u32 iops_limit; u32 kbps_limit; u64 discard_extent_bytes; diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 91afa43520dc..819eaf506fcc 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -355,7 +355,7 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, block_group = find_next_block_group(discard_ctl, now); if (block_group) { - unsigned long delay = discard_ctl->delay; + u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC; u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit); /* @@ -366,9 +366,9 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, if (kbps_limit && discard_ctl->prev_discard) { u64 bps_limit = ((u64)kbps_limit) * SZ_1K; u64 bps_delay = div64_u64(discard_ctl->prev_discard * - MSEC_PER_SEC, bps_limit); + NSEC_PER_SEC, bps_limit); - delay = max(delay, msecs_to_jiffies(bps_delay)); + delay = max(delay, bps_delay); } /* @@ -378,11 +378,11 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, if (now < block_group->discard_eligible_time) { u64 bg_timeout = block_group->discard_eligible_time - now; - delay = max(delay, nsecs_to_jiffies(bg_timeout)); + delay = max(delay, bg_timeout); } mod_delayed_work(discard_ctl->discard_workers, - &discard_ctl->work, 
delay); + &discard_ctl->work, nsecs_to_jiffies(delay)); } out: spin_unlock(&discard_ctl->lock); @@ -555,7 +555,7 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC, BTRFS_DISCARD_MAX_DELAY_MSEC); - discard_ctl->delay = msecs_to_jiffies(delay); + discard_ctl->delay_ms = delay; spin_unlock(&discard_ctl->lock); } @@ -688,7 +688,7 @@ void btrfs_discard_init(struct btrfs_fs_info *fs_info) atomic_set(&discard_ctl->discardable_extents, 0); atomic64_set(&discard_ctl->discardable_bytes, 0); discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE; - discard_ctl->delay = BTRFS_DISCARD_MAX_DELAY_MSEC; + discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC; discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS; discard_ctl->kbps_limit = 0; discard_ctl->discard_extent_bytes = 0; -- cgit v1.2.3 From df903e5d294f62e07280566e0afe9403a92879f0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 4 Nov 2020 09:45:53 +0000 Subject: btrfs: don't miss async discards after scheduled work override If btrfs_discard_schedule_work() is called with override=true, it sets delay anew regardless how much time is left until the timer should have fired. If delays are long (that can happen, for example, with low kbps_limit), they might get constantly overridden without having a chance to run the discard work. Reviewed-by: Josef Bacik Signed-off-by: Pavel Begunkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/discard.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fbbfe8afa980..461d1d52aaba 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -469,6 +469,7 @@ struct btrfs_discard_ctl { struct btrfs_block_group *block_group; struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; u64 prev_discard; + u64 prev_discard_time; atomic_t discardable_extents; atomic64_t discardable_bytes; u64 max_discard_size; diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 819eaf506fcc..1db966bf85b2 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -381,6 +381,15 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, delay = max(delay, bg_timeout); } + if (override && discard_ctl->prev_discard) { + u64 elapsed = now - discard_ctl->prev_discard_time; + + if (delay > elapsed) + delay -= elapsed; + else + delay = 0; + } + mod_delayed_work(discard_ctl->discard_workers, &discard_ctl->work, nsecs_to_jiffies(delay)); } @@ -465,7 +474,12 @@ static void btrfs_discard_workfn(struct work_struct *work) discard_ctl->discard_extent_bytes += trimmed; } + /* + * Updated without locks as this is inside the workfn and nothing else + * is reading the values + */ discard_ctl->prev_discard = trimmed; + discard_ctl->prev_discard_time = ktime_get_ns(); /* Determine next steps for a block_group */ if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) { @@ -685,6 +699,7 @@ void btrfs_discard_init(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&discard_ctl->discard_list[i]); discard_ctl->prev_discard = 0; + discard_ctl->prev_discard_time = 0; atomic_set(&discard_ctl->discardable_extents, 0); atomic64_set(&discard_ctl->discardable_bytes, 0); discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE; -- cgit v1.2.3 From 3e48d8d2540d4c63461ec4cafb8f65355b6f7b57 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 4 Nov 2020 09:45:54 +0000 Subject: btrfs: discard: reschedule work after sysfs 
param update After sysfs updates discard's iops_limit or kbps_limit it also needs to adjust current timer through rescheduling, otherwise the discard work may wait for a long time for the previous timer to expire or bumped by someone else. Reviewed-by: Josef Bacik Signed-off-by: Pavel Begunkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 620fddfd7096..640d17f6ace1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -458,7 +458,8 @@ static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj, return -EINVAL; WRITE_ONCE(discard_ctl->iops_limit, iops_limit); - + btrfs_discard_calc_delay(discard_ctl); + btrfs_discard_schedule_work(discard_ctl, true); return len; } BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show, @@ -488,7 +489,7 @@ static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj, return -EINVAL; WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit); - + btrfs_discard_schedule_work(discard_ctl, true); return len; } BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show, -- cgit v1.2.3 From 416e3445ef8087ff28792f366af9726cc225fb0c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:08 -0500 Subject: btrfs: remove lockdep classes for the fs tree We have this weird problem where our lockdep class is set after we read a tree block, which can race with concurrent readers and result in erroneous lockdep errors. We want to set the lockdep class at allocation time if possible, but in certain cases we may not have the actual root owner, such as with relocation or any backref lookups. This is only really a problem for reference counted trees, because all other trees have their root reference set in their extent reference. Remove the fs tree specific lock class. We need to still keep the reloc tree one, it's still reference counted, because replace_path will lock the reloc tree and the destination tree, and if they're both set to tree- we'll have issues. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3edbbbcbe75a..9f0d30ea535d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -173,7 +173,6 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") }, { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") }, { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") }, - { .id = BTRFS_FS_TREE_OBJECTID, DEFINE_NAME("fs") }, { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") }, { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") }, { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") }, -- cgit v1.2.3 From bfb484d922a317183d77b3b6db77a2ff659384cc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:09 -0500 Subject: btrfs: cleanup extent buffer readahead We're going to pass around more information when we allocate extent buffers, in order to make that cleaner how we do readahead. Most of the callers have the parent node that we're getting our blockptr from, with the sole exception of relocation which simply has the bytenr it wants to read. Add a helper that takes the current arguments that we need (bytenr and gen), and add another helper for simply reading the slot out of a node. 
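Their signatures, as added to extent_io.h further down in this patch, look like:

	void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
					u64 bytenr, u64 gen);
	void btrfs_readahead_node_child(struct extent_buffer *node, int slot);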
In followup patches the helper that takes all the extra arguments will be expanded, and the simpler helper won't need to have it's arguments adjusted. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 45 +++++++++------------------------------------ fs/btrfs/disk-io.c | 16 ---------------- fs/btrfs/disk-io.h | 1 - fs/btrfs/extent-tree.c | 2 +- fs/btrfs/extent_io.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/extent_io.h | 3 +++ fs/btrfs/relocation.c | 2 +- fs/btrfs/volumes.c | 8 ++------ 8 files changed, 63 insertions(+), 61 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d2d5854d51a7..0ff866328a4f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2226,7 +2226,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, search = btrfs_node_blockptr(node, nr); if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { - readahead_tree_block(fs_info, search); + btrfs_readahead_node_child(node, nr); nread += blocksize; } nscan++; @@ -2235,16 +2235,11 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, } } -static noinline void reada_for_balance(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, int level) +static noinline void reada_for_balance(struct btrfs_path *path, int level) { + struct extent_buffer *parent; int slot; int nritems; - struct extent_buffer *parent; - struct extent_buffer *eb; - u64 gen; - u64 block1 = 0; - u64 block2 = 0; parent = path->nodes[level + 1]; if (!parent) @@ -2253,32 +2248,10 @@ static noinline void reada_for_balance(struct btrfs_fs_info *fs_info, nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; - if (slot > 0) { - block1 = btrfs_node_blockptr(parent, slot - 1); - gen = btrfs_node_ptr_generation(parent, slot - 1); - eb = find_extent_buffer(fs_info, block1); - /* - * if we get -eagain from btrfs_buffer_uptodate, we - * don't want to return eagain here. 
That will loop - * forever - */ - if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) - block1 = 0; - free_extent_buffer(eb); - } - if (slot + 1 < nritems) { - block2 = btrfs_node_blockptr(parent, slot + 1); - gen = btrfs_node_ptr_generation(parent, slot + 1); - eb = find_extent_buffer(fs_info, block2); - if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) - block2 = 0; - free_extent_buffer(eb); - } - - if (block1) - readahead_tree_block(fs_info, block1); - if (block2) - readahead_tree_block(fs_info, block2); + if (slot > 0) + btrfs_readahead_node_child(parent, slot - 1); + if (slot + 1 < nritems) + btrfs_readahead_node_child(parent, slot + 1); } @@ -2454,7 +2427,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, goto again; } - reada_for_balance(fs_info, p, level); + reada_for_balance(p, level); sret = split_node(trans, root, p, level); BUG_ON(sret > 0); @@ -2473,7 +2446,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, goto again; } - reada_for_balance(fs_info, p, level); + reada_for_balance(p, level); sret = balance_level(trans, root, p, level); if (sret) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9f0d30ea535d..012e66601270 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -953,22 +953,6 @@ static const struct address_space_operations btree_aops = { .set_page_dirty = btree_set_page_dirty, }; -void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) -{ - struct extent_buffer *buf = NULL; - int ret; - - buf = btrfs_find_create_tree_block(fs_info, bytenr); - if (IS_ERR(buf)) - return; - - ret = read_extent_buffer_pages(buf, WAIT_NONE, 0); - if (ret < 0) - free_extent_buffer_stale(buf); - else - free_extent_buffer(buf); -} - struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 238b45223f2e..009f505d6c97 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -45,7 +45,6 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 parent_transid, int level, struct btrfs_key *first_key); -void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5c82cfdb0944..5d54819aada7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4859,7 +4859,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; } reada: - readahead_tree_block(fs_info, bytenr); + btrfs_readahead_node_child(eb, slot); nread++; } wc->reada_slot = slot; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1dfbe859c650..973254b1f216 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -6123,3 +6123,50 @@ int try_release_extent_buffer(struct page *page) return release_extent_buffer(eb); } + +/* + * btrfs_readahead_tree_block - attempt to readahead a child block + * @fs_info: the fs_info + * @bytenr: bytenr to read + * @gen: generation for the uptodate check, can be 0 + * + * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a + * normal uptodate check of the eb, without checking the generation. If we have + * to read the block we will not block on anything. 
+ */ +void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 gen) +{ + struct extent_buffer *eb; + int ret; + + eb = btrfs_find_create_tree_block(fs_info, bytenr); + if (IS_ERR(eb)) + return; + + if (btrfs_buffer_uptodate(eb, gen, 1)) { + free_extent_buffer(eb); + return; + } + + ret = read_extent_buffer_pages(eb, WAIT_NONE, 0); + if (ret < 0) + free_extent_buffer_stale(eb); + else + free_extent_buffer(eb); +} + +/* + * btrfs_readahead_node_child - readahead a node's child block + * @node: parent node we're reading from + * @slot: slot in the parent node for the child we want to read + * + * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at + * the slot in the node provided. + */ +void btrfs_readahead_node_child(struct extent_buffer *node, int slot) +{ + btrfs_readahead_tree_block(node->fs_info, + btrfs_node_blockptr(node, slot), + btrfs_node_ptr_generation(node, slot)); +} diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3c2bf21c54eb..a211e90292f8 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -198,6 +198,9 @@ void free_extent_buffer_stale(struct extent_buffer *eb); int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); +void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 gen); +void btrfs_readahead_node_child(struct extent_buffer *node, int slot); static inline int num_extent_pages(const struct extent_buffer *eb) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0b3ccf464c3d..415332a303ec 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2542,7 +2542,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Kick in readahead for tree blocks with missing keys */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) - readahead_tree_block(fs_info, block->bytenr); + btrfs_readahead_tree_block(fs_info, block->bytenr, 0); } /* Get first keys */ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ed4163b54f75..814abc0b3ff3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7071,12 +7071,8 @@ static void readahead_tree_node_children(struct extent_buffer *node) int i; const int nr_items = btrfs_header_nritems(node); - for (i = 0; i < nr_items; i++) { - u64 start; - - start = btrfs_node_blockptr(node, i); - readahead_tree_block(node->fs_info, start); - } + for (i = 0; i < nr_items; i++) + btrfs_readahead_node_child(node, i); } int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) -- cgit v1.2.3 From 206983b72a369c8fdc4fd55b3f46ec16f3c024ea Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:10 -0500 Subject: btrfs: use btrfs_read_node_slot in btrfs_realloc_node We have this open-coded nightmare in btrfs_realloc_node that does the same thing that the normal read path does, which is to see if we have the eb in memory already, and if not read it, and verify the eb is uptodate. Delete this open coding and simply use btrfs_read_node_slot. 
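For reference, btrfs_read_node_slot() already wraps up everything the open-coded block was doing; roughly (a sketch omitting the slot bounds checking, see ctree.c for the exact function):

  struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
                                             int slot)
  {
          struct btrfs_key first_key;
          struct extent_buffer *eb;
          int level = btrfs_header_level(parent);

          btrfs_node_key_to_cpu(parent, &first_key, slot);
          /* Read the child with the expected generation, level and first
           * key all taken from the parent. */
          eb = read_tree_block(parent->fs_info,
                               btrfs_node_blockptr(parent, slot),
                               btrfs_node_ptr_generation(parent, slot),
                               level - 1, &first_key);
          /* A block that was read but failed verification becomes -EIO. */
          if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
                  free_extent_buffer(eb);
                  eb = ERR_PTR(-EIO);
          }
          return eb;
  }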
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 36 +++--------------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0ff866328a4f..467f90af5563 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1570,7 +1570,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *cur; u64 blocknr; - u64 gen; u64 search_start = *last_ret; u64 last_block = 0; u64 other; @@ -1578,14 +1577,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, int end_slot; int i; int err = 0; - int parent_level; - int uptodate; u32 blocksize; int progress_passed = 0; struct btrfs_disk_key disk_key; - parent_level = btrfs_header_level(parent); - WARN_ON(trans->transaction != fs_info->running_transaction); WARN_ON(trans->transid != fs_info->generation); @@ -1597,7 +1592,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, return 0; for (i = start_slot; i <= end_slot; i++) { - struct btrfs_key first_key; int close = 1; btrfs_node_key(parent, &disk_key, i); @@ -1606,8 +1600,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, progress_passed = 1; blocknr = btrfs_node_blockptr(parent, i); - gen = btrfs_node_ptr_generation(parent, i); - btrfs_node_key_to_cpu(parent, &first_key, i); if (last_block == 0) last_block = blocknr; @@ -1624,31 +1616,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, continue; } - cur = find_extent_buffer(fs_info, blocknr); - if (cur) - uptodate = btrfs_buffer_uptodate(cur, gen, 0); - else - uptodate = 0; - if (!cur || !uptodate) { - if (!cur) { - cur = read_tree_block(fs_info, blocknr, gen, - parent_level - 1, - &first_key); - if (IS_ERR(cur)) { - return PTR_ERR(cur); - } else if (!extent_buffer_uptodate(cur)) { - free_extent_buffer(cur); - return -EIO; - } - } else if (!uptodate) { - err = btrfs_read_buffer(cur, gen, - parent_level - 1,&first_key); - if (err) { - free_extent_buffer(cur); - return err; - } - } - } + cur = btrfs_read_node_slot(parent, i); + if (IS_ERR(cur)) + return PTR_ERR(cur); if (search_start == 0) search_start = last_block; -- cgit v1.2.3 From 8ef385bbf0994ecf658e050ccb58d7fc18920935 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:11 -0500 Subject: btrfs: use btrfs_read_node_slot in walk_down_reloc_tree We do not need to call read_tree_block() here, simply use the btrfs_read_node_slot helper. 
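The call sites converted in this and the following patches all end up with the same shape; because the helper folds the uptodate check into the returned error pointer, the separate -EIO branch disappears:

  eb = btrfs_read_node_slot(eb, path->slots[i]);
  if (IS_ERR(eb))
          return PTR_ERR(eb);
  /* No extent_buffer_uptodate() check needed here, a block that fails
   * verification already comes back as ERR_PTR(-EIO). */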
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 415332a303ec..91ed88ef0931 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1415,10 +1415,8 @@ static noinline_for_stack int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, int *level) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *eb = NULL; int i; - u64 bytenr; u64 ptr_gen = 0; u64 last_snapshot; u32 nritems; @@ -1426,8 +1424,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, last_snapshot = btrfs_root_last_snapshot(&root->root_item); for (i = *level; i > 0; i--) { - struct btrfs_key first_key; - eb = path->nodes[i]; nritems = btrfs_header_nritems(eb); while (path->slots[i] < nritems) { @@ -1447,16 +1443,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, return 0; } - bytenr = btrfs_node_blockptr(eb, path->slots[i]); - btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]); - eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1, - &first_key); - if (IS_ERR(eb)) { + eb = btrfs_read_node_slot(eb, path->slots[i]); + if (IS_ERR(eb)) return PTR_ERR(eb); - } else if (!extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); - return -EIO; - } BUG_ON(btrfs_header_level(eb) != i - 1); path->nodes[i - 1] = eb; path->slots[i - 1] = 0; -- cgit v1.2.3 From c975253682e049a3a98e9ce27d19c923f1d1d776 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:12 -0500 Subject: btrfs: use btrfs_read_node_slot in do_relocation We're open coding btrfs_read_node_slot in do_relocation, replace this with the proper helper. 
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 91ed88ef0931..57efb903016f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2191,7 +2191,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, struct btrfs_key *key, struct btrfs_path *path, int lowest) { - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_backref_node *upper; struct btrfs_backref_edge *edge; struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; @@ -2199,7 +2198,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, struct extent_buffer *eb; u32 blocksize; u64 bytenr; - u64 generation; int slot; int ret; int err = 0; @@ -2209,7 +2207,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, path->lowest_level = node->level + 1; rc->backref_cache.path[node->level] = node; list_for_each_entry(edge, &node->upper, list[LOWER]) { - struct btrfs_key first_key; struct btrfs_ref ref = { 0 }; cond_resched(); @@ -2282,17 +2279,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, } blocksize = root->fs_info->nodesize; - generation = btrfs_node_ptr_generation(upper->eb, slot); - btrfs_node_key_to_cpu(upper->eb, &first_key, slot); - eb = read_tree_block(fs_info, bytenr, generation, - upper->level - 1, &first_key); + eb = btrfs_read_node_slot(upper->eb, slot); if (IS_ERR(eb)) { err = PTR_ERR(eb); goto next; - } else if (!extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); - err = -EIO; - goto next; } btrfs_tree_lock(eb); -- cgit v1.2.3 From 6b3426be27de80ed213e6c901ae566f478aeadaa Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:13 -0500 Subject: btrfs: use btrfs_read_node_slot in replace_path We're open-coding btrfs_read_node_slot() here, replace with the helper. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 57efb903016f..9f80f5352dfd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1218,8 +1218,6 @@ again: parent = eb; while (1) { - struct btrfs_key first_key; - level = btrfs_header_level(parent); BUG_ON(level < lowest_level); @@ -1235,7 +1233,6 @@ again: old_bytenr = btrfs_node_blockptr(parent, slot); blocksize = fs_info->nodesize; old_ptr_gen = btrfs_node_ptr_generation(parent, slot); - btrfs_node_key_to_cpu(parent, &first_key, slot); if (level <= max_level) { eb = path->nodes[level]; @@ -1260,15 +1257,10 @@ again: break; } - eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen, - level - 1, &first_key); + eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; - } else if (!extent_buffer_uptodate(eb)) { - ret = -EIO; - free_extent_buffer(eb); - break; } btrfs_tree_lock(eb); if (cow) { -- cgit v1.2.3 From c990ada2a0bb780ba21f18abac5a1e766e40de0b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:14 -0500 Subject: btrfs: use btrfs_read_node_slot in walk_down_tree We're open-coding btrfs_read_node_slot() here, replace with the helper. 
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 488bc3dd3c2b..4b9b6c52a83b 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -551,29 +551,15 @@ static int process_leaf(struct btrfs_root *root, static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, int level, u64 *bytenr, u64 *num_bytes) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *eb; - u64 block_bytenr, gen; int ret = 0; while (level >= 0) { if (level) { - struct btrfs_key first_key; - - block_bytenr = btrfs_node_blockptr(path->nodes[level], - path->slots[level]); - gen = btrfs_node_ptr_generation(path->nodes[level], - path->slots[level]); - btrfs_node_key_to_cpu(path->nodes[level], &first_key, - path->slots[level]); - eb = read_tree_block(fs_info, block_bytenr, gen, - level - 1, &first_key); + eb = btrfs_read_node_slot(path->nodes[level], + path->slots[level]); if (IS_ERR(eb)) return PTR_ERR(eb); - if (!extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); - return -EIO; - } btrfs_tree_read_lock(eb); path->nodes[level-1] = eb; path->slots[level-1] = 0; -- cgit v1.2.3 From 6b2cb7cb959a72670973d70c1f36352f6f60042a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:15 -0500 Subject: btrfs: use btrfs_read_node_slot in qgroup_trace_extent_swap We're open-coding btrfs_read_node_slot() here, replace with the helper. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 78ffbad9b03d..db6165e130c5 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1942,27 +1942,16 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, struct btrfs_key dst_key; if (src_path->nodes[cur_level] == NULL) { - struct btrfs_key first_key; struct extent_buffer *eb; int parent_slot; - u64 child_gen; - u64 child_bytenr; eb = src_path->nodes[cur_level + 1]; parent_slot = src_path->slots[cur_level + 1]; - child_bytenr = btrfs_node_blockptr(eb, parent_slot); - child_gen = btrfs_node_ptr_generation(eb, parent_slot); - btrfs_node_key_to_cpu(eb, &first_key, parent_slot); - eb = read_tree_block(fs_info, child_bytenr, child_gen, - cur_level, &first_key); + eb = btrfs_read_node_slot(eb, parent_slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; - } else if (!extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); - ret = -EIO; - goto out; } src_path->nodes[cur_level] = eb; -- cgit v1.2.3 From 3acfbd6a990c6c78e333dd3b37bbe20da289a382 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:16 -0500 Subject: btrfs: use btrfs_read_node_slot in qgroup_trace_new_subtree_blocks We're open-coding btrfs_read_node_slot() here, replace with the helper. 
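One small difference from the previous conversions: the child generation is still read from the parent slot first, because the last_snapshot cutoff has to be checked before deciding to read the block at all; only the read itself is delegated to the helper. The resulting shape, per the hunk below:

  parent_slot = dst_path->slots[cur_level + 1];
  child_gen = btrfs_node_ptr_generation(eb, parent_slot);
  /* This node is old, no need to trace */
  if (child_gen < last_snapshot)
          goto out;
  eb = btrfs_read_node_slot(eb, parent_slot);
  if (IS_ERR(eb)) {
          ret = PTR_ERR(eb);
          goto out;
  }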
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index db6165e130c5..1f706035d7b9 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2052,10 +2052,8 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, /* Read the tree block if needed */ if (dst_path->nodes[cur_level] == NULL) { - struct btrfs_key first_key; int parent_slot; u64 child_gen; - u64 child_bytenr; /* * dst_path->nodes[root_level] must be initialized before @@ -2074,23 +2072,16 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, */ eb = dst_path->nodes[cur_level + 1]; parent_slot = dst_path->slots[cur_level + 1]; - child_bytenr = btrfs_node_blockptr(eb, parent_slot); child_gen = btrfs_node_ptr_generation(eb, parent_slot); - btrfs_node_key_to_cpu(eb, &first_key, parent_slot); /* This node is old, no need to trace */ if (child_gen < last_snapshot) goto out; - eb = read_tree_block(fs_info, child_bytenr, child_gen, - cur_level, &first_key); + eb = btrfs_read_node_slot(eb, parent_slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; - } else if (!extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); - ret = -EIO; - goto out; } dst_path->nodes[cur_level] = eb; -- cgit v1.2.3 From 182c79fcb8576548515250b99defce7505c71a49 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:17 -0500 Subject: btrfs: use btrfs_read_node_slot in btrfs_qgroup_trace_subtree We're open-coding btrfs_read_node_slot() here, replace with the helper. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1f706035d7b9..9ba1e62a6533 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2232,30 +2232,21 @@ walk_down: level = root_level; while (level >= 0) { if (path->nodes[level] == NULL) { - struct btrfs_key first_key; int parent_slot; - u64 child_gen; u64 child_bytenr; /* - * We need to get child blockptr/gen from parent before - * we can read it. + * We need to get child blockptr from parent before we + * can read it. */ eb = path->nodes[level + 1]; parent_slot = path->slots[level + 1]; child_bytenr = btrfs_node_blockptr(eb, parent_slot); - child_gen = btrfs_node_ptr_generation(eb, parent_slot); - btrfs_node_key_to_cpu(eb, &first_key, parent_slot); - eb = read_tree_block(fs_info, child_bytenr, child_gen, - level, &first_key); + eb = btrfs_read_node_slot(eb, parent_slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; - } else if (!extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); - ret = -EIO; - goto out; } path->nodes[level] = eb; -- cgit v1.2.3 From 1b7ec85ef49057898a48b2ca1a1e33bf7c27abbe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:18 -0500 Subject: btrfs: pass root owner to read_tree_block In order to properly set the lockdep class of a newly allocated block we need to know the owner of the block. For non-refcounted trees this is straightforward, we always know in advance what tree we're reading from. For refcounted trees we don't necessarily know, however all refcounted trees share the same lockdep class name, tree-. Fix all the callers of read_tree_block() to pass in the root objectid we're using. 
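The prototype grows an owner_root parameter between the bytenr and the expected transid; a caller that knows its tree passes the objectid directly (both taken from the hunks below):

  struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info,
                                        u64 bytenr, u64 owner_root,
                                        u64 parent_transid, int level,
                                        struct btrfs_key *first_key);

  /* e.g. reading the root tree node, where the owner is known: */
  tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
                                    BTRFS_ROOT_TREE_OBJECTID,
                                    generation, level, NULL);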
In places like relocation and backref we could probably unconditionally use 0, but just in case use the root when we have it, otherwise use 0 in the cases we don't have the root as it's going to be a refcounted tree anyway. This is a preparation patch for further changes. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 6 +++--- fs/btrfs/ctree.c | 8 +++++--- fs/btrfs/disk-io.c | 14 +++++++++----- fs/btrfs/disk-io.h | 4 ++-- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/print-tree.c | 1 + fs/btrfs/qgroup.c | 2 +- fs/btrfs/relocation.c | 4 ++-- 8 files changed, 25 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 86abe34e9444..02d7d7b2563b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -783,8 +783,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(ref->key_for_search.type); BUG_ON(!ref->wanted_disk_byte); - eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0, - ref->level - 1, NULL); + eb = read_tree_block(fs_info, ref->wanted_disk_byte, + ref->root_id, 0, ref->level - 1, NULL); if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); @@ -1331,7 +1331,7 @@ again: struct extent_buffer *eb; eb = read_tree_block(fs_info, ref->parent, 0, - ref->level, NULL); + 0, ref->level, NULL); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 467f90af5563..e2673c29e733 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1353,7 +1353,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - old = read_tree_block(fs_info, logical, 0, level, NULL); + old = read_tree_block(fs_info, logical, root->root_key.objectid, + 0, level, NULL); if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { if (!IS_ERR(old)) free_extent_buffer(old); @@ -1760,6 +1761,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, btrfs_node_key_to_cpu(parent, &first_key, slot); eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), + btrfs_header_owner(parent), btrfs_node_ptr_generation(parent, slot), level - 1, &first_key); if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { @@ -2349,8 +2351,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, reada_for_search(fs_info, p, level, slot, key->objectid); ret = -EAGAIN; - tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1, - &first_key); + tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, + gen, parent_level - 1, &first_key); if (!IS_ERR(tmp)) { /* * If the read above didn't mark this buffer up to date, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 012e66601270..763121380942 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -966,13 +966,14 @@ struct extent_buffer *btrfs_find_create_tree_block( * Read tree block at logical address @bytenr and do variant basic but critical * verification. * + * @owner_root: the objectid of the root owner for this block. 
* @parent_transid: expected transid of this tree block, skip check if 0 * @level: expected level, mandatory check * @first_key: expected key in slot 0, skip check if NULL */ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 parent_transid, int level, - struct btrfs_key *first_key) + u64 owner_root, u64 parent_transid, + int level, struct btrfs_key *first_key) { struct extent_buffer *buf = NULL; int ret; @@ -1295,7 +1296,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, level = btrfs_root_level(&root->root_item); root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), - generation, level, NULL); + key->objectid, generation, level, NULL); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; @@ -2249,8 +2250,9 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, return -ENOMEM; log_tree_root->node = read_tree_block(fs_info, bytenr, - fs_info->generation + 1, - level, NULL); + BTRFS_TREE_LOG_OBJECTID, + fs_info->generation + 1, level, + NULL); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); @@ -2636,6 +2638,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) generation = btrfs_super_generation(sb); level = btrfs_super_root_level(sb); tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), + BTRFS_ROOT_TREE_OBJECTID, generation, level, NULL); if (IS_ERR(tree_root->node)) { handle_error = true; @@ -3124,6 +3127,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device chunk_root->node = read_tree_block(fs_info, btrfs_super_chunk_root(disk_super), + BTRFS_CHUNK_TREE_OBJECTID, generation, level, NULL); if (IS_ERR(chunk_root->node) || !extent_buffer_uptodate(chunk_root->node)) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 009f505d6c97..41588babf2ed 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -43,8 +43,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); int btrfs_verify_level_key(struct extent_buffer *eb, int level, struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 parent_transid, int level, - struct btrfs_key *first_key); + u64 owner_root, u64 parent_transid, + int level, struct btrfs_key *first_key); struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5d54819aada7..90f1ec0802bd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5077,8 +5077,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, generation, level - 1, - &first_key); + next = read_tree_block(fs_info, bytenr, root->root_key.objectid, + generation, level - 1, &first_key); if (IS_ERR(next)) { return PTR_ERR(next); } else if (!extent_buffer_uptodate(next)) { diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 8db99117531d..fe5e0026129d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -390,6 +390,7 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow) btrfs_node_key_to_cpu(c, &first_key, i); next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), + btrfs_header_owner(c), btrfs_node_ptr_generation(c, i), level - 1, &first_key); if (IS_ERR(next)) { diff --git 
a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9ba1e62a6533..fe3046007f52 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4208,7 +4208,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, spin_unlock(&blocks->lock); /* Read out reloc subtree root */ - reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, + reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0, block->reloc_generation, block->level, &block->first_key); if (IS_ERR(reloc_eb)) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9f80f5352dfd..7ebada33502f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2413,7 +2413,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, { struct extent_buffer *eb; - eb = read_tree_block(fs_info, block->bytenr, block->key.offset, + eb = read_tree_block(fs_info, block->bytenr, 0, block->key.offset, block->level, NULL); if (IS_ERR(eb)) { return PTR_ERR(eb); @@ -3038,7 +3038,7 @@ int add_data_references(struct reloc_control *rc, while ((ref_node = ulist_next(leaves, &leaf_uiter))) { struct extent_buffer *eb; - eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL); + eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; -- cgit v1.2.3 From 5d81230baa9096bd5a7ad40822505b89ca7f9dfe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:19 -0500 Subject: btrfs: pass the root owner and level around for readahead The readahead infrastructure does raw reads of extent buffers, but we're going to need to know their owner and level in order to set the lockdep key properly, so plumb in the infrastructure that we'll need to have this information when we start allocating extent buffers. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/reada.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 6e33cb755fa5..b1bea0d59e13 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -52,6 +52,7 @@ struct reada_extctl { struct reada_extent { u64 logical; + u64 owner_root; struct btrfs_key top; struct list_head extctl; int refcnt; @@ -59,6 +60,7 @@ struct reada_extent { struct reada_zone *zones[BTRFS_MAX_MIRRORS]; int nzones; int scheduled; + int level; }; struct reada_zone { @@ -87,7 +89,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info); static void __reada_start_machine(struct btrfs_fs_info *fs_info); static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, u64 generation); + struct btrfs_key *top, u64 owner_root, + u64 generation, int level); /* recurses */ /* in case of err, eb might be NULL */ @@ -165,7 +168,9 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, if (rec->generation == generation && btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) - reada_add_block(rc, bytenr, &next_key, n_gen); + reada_add_block(rc, bytenr, &next_key, + btrfs_header_owner(eb), n_gen, + btrfs_header_level(eb) - 1); } } @@ -298,7 +303,8 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, u64 logical, - struct btrfs_key *top) + struct btrfs_key *top, + u64 owner_root, int level) { int ret; struct reada_extent *re = NULL; @@ -331,6 +337,8 @@ static struct reada_extent 
*reada_find_extent(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&re->extctl); spin_lock_init(&re->lock); re->refcnt = 1; + re->owner_root = owner_root; + re->level = level; /* * map block @@ -548,14 +556,15 @@ static void reada_control_release(struct kref *kref) } static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, u64 generation) + struct btrfs_key *top, u64 owner_root, + u64 generation, int level) { struct btrfs_fs_info *fs_info = rc->fs_info; struct reada_extent *re; struct reada_extctl *rec; /* takes one ref */ - re = reada_find_extent(fs_info, logical, top); + re = reada_find_extent(fs_info, logical, top, owner_root, level); if (!re) return -1; @@ -947,6 +956,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, u64 start; u64 generation; int ret; + int level; struct extent_buffer *node; static struct btrfs_key max_key = { .objectid = (u64)-1, @@ -969,9 +979,11 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, node = btrfs_root_node(root); start = node->start; generation = btrfs_header_generation(node); + level = btrfs_header_level(node); free_extent_buffer(node); - ret = reada_add_block(rc, start, &max_key, generation); + ret = reada_add_block(rc, start, &max_key, root->root_key.objectid, + generation, level); if (ret) { kfree(rc); return ERR_PTR(ret); -- cgit v1.2.3 From 3fbaf25817f7013fad3ccf76279f0bd5719a5205 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:20 -0500 Subject: btrfs: pass the owner_root and level to alloc_extent_buffer Now that we've plumbed all of the callers to have the owner root and the level, plumb it down into alloc_extent_buffer(). Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 ++++--- fs/btrfs/disk-io.h | 3 ++- fs/btrfs/extent-tree.c | 5 +++-- fs/btrfs/extent_io.c | 12 ++++++++---- fs/btrfs/extent_io.h | 4 ++-- fs/btrfs/reada.c | 8 +++++--- fs/btrfs/relocation.c | 3 ++- fs/btrfs/tree-log.c | 4 +++- fs/btrfs/volumes.c | 3 ++- 9 files changed, 31 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 763121380942..6abe3c8f025d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -955,11 +955,12 @@ static const struct address_space_operations btree_aops = { struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, - u64 bytenr) + u64 bytenr, u64 owner_root, + int level) { if (btrfs_is_testing(fs_info)) return alloc_test_extent_buffer(fs_info, bytenr); - return alloc_extent_buffer(fs_info, bytenr); + return alloc_extent_buffer(fs_info, bytenr, owner_root, level); } /* @@ -978,7 +979,7 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(fs_info, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); if (IS_ERR(buf)) return buf; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 41588babf2ed..e75ea6092942 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -47,7 +47,8 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, int level, struct btrfs_key *first_key); struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, - u64 bytenr); + u64 bytenr, u64 owner_root, + int level); void btrfs_clean_tree_block(struct extent_buffer *buf); int __cold open_ctree(struct super_block *sb, struct 
btrfs_fs_devices *fs_devices, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 90f1ec0802bd..0d1b12e5b166 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4617,7 +4617,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; - buf = btrfs_find_create_tree_block(fs_info, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level); if (IS_ERR(buf)) return buf; @@ -5018,7 +5018,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = find_extent_buffer(fs_info, bytenr); if (!next) { - next = btrfs_find_create_tree_block(fs_info, bytenr); + next = btrfs_find_create_tree_block(fs_info, bytenr, + root->root_key.objectid, level - 1); if (IS_ERR(next)) return PTR_ERR(next); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 973254b1f216..bd0d4d195d76 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5180,7 +5180,7 @@ free_eb: #endif struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) + u64 start, u64 owner_root, int level) { unsigned long len = fs_info->nodesize; int num_pages; @@ -6128,19 +6128,21 @@ int try_release_extent_buffer(struct page *page) * btrfs_readahead_tree_block - attempt to readahead a child block * @fs_info: the fs_info * @bytenr: bytenr to read + * @owner_root: objectid of the root that owns this eb * @gen: generation for the uptodate check, can be 0 + * @level: level for the eb * * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a * normal uptodate check of the eb, without checking the generation. If we have * to read the block we will not block on anything. */ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 gen) + u64 bytenr, u64 owner_root, u64 gen, int level) { struct extent_buffer *eb; int ret; - eb = btrfs_find_create_tree_block(fs_info, bytenr); + eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); if (IS_ERR(eb)) return; @@ -6168,5 +6170,7 @@ void btrfs_readahead_node_child(struct extent_buffer *node, int slot) { btrfs_readahead_tree_block(node->fs_info, btrfs_node_blockptr(node, slot), - btrfs_node_ptr_generation(node, slot)); + btrfs_header_owner(node), + btrfs_node_ptr_generation(node, slot), + btrfs_header_level(node) - 1); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a211e90292f8..c76697fc3120 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -182,7 +182,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start, u64 owner_root, int level); struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, @@ -199,7 +199,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 gen); + u64 bytenr, u64 owner_root, u64 gen, int level); void btrfs_readahead_node_child(struct extent_buffer *node, int slot); static inline int num_extent_pages(const struct extent_buffer *eb) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index b1bea0d59e13..20fd4aa48a8c 100644 --- 
a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -656,12 +656,13 @@ static int reada_pick_zone(struct btrfs_device *dev) } static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, - int mirror_num, struct extent_buffer **eb) + u64 owner_root, int level, int mirror_num, + struct extent_buffer **eb) { struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(fs_info, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); if (IS_ERR(buf)) return 0; @@ -749,7 +750,8 @@ static int reada_start_machine_dev(struct btrfs_device *dev) logical = re->logical; atomic_inc(&dev->reada_in_flight); - ret = reada_tree_block_flagged(fs_info, logical, mirror_num, &eb); + ret = reada_tree_block_flagged(fs_info, logical, re->owner_root, + re->level, mirror_num, &eb); if (ret) __readahead_hook(fs_info, re, NULL, ret); else if (eb) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7ebada33502f..c5774a8e6ff7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2513,7 +2513,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Kick in readahead for tree blocks with missing keys */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) - btrfs_readahead_tree_block(fs_info, block->bytenr, 0); + btrfs_readahead_tree_block(fs_info, block->bytenr, 0, 0, + block->level); } /* Get first keys */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index eb86c632535a..b1f97be57bb3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2699,7 +2699,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); blocksize = fs_info->nodesize; - next = btrfs_find_create_tree_block(fs_info, bytenr); + next = btrfs_find_create_tree_block(fs_info, bytenr, + btrfs_header_owner(cur), + *level - 1); if (IS_ERR(next)) return PTR_ERR(next); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 814abc0b3ff3..cf9a48e27ab0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6903,7 +6903,8 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will * overallocate but we can keep it as-is, only the first page is used. 
*/ - sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET); + sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, + root->root_key.objectid, 0); if (IS_ERR(sb)) return PTR_ERR(sb); set_extent_buffer_uptodate(sb); -- cgit v1.2.3 From e114c545bb699b2e97e8661d41f34a1651b43f50 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:21 -0500 Subject: btrfs: set the lockdep class for extent buffers on creation Both Filipe and Fedora QA recently hit the following lockdep splat: WARNING: possible recursive locking detected 5.10.0-0.rc1.20201028gited8780e3f2ec.57.fc34.x86_64 #1 Not tainted -------------------------------------------- rsync/2610 is trying to acquire lock: ffff89617ed48f20 (&eb->lock){++++}-{2:2}, at: btrfs_tree_read_lock_atomic+0x34/0x140 but task is already holding lock: ffff8961757b1130 (&eb->lock){++++}-{2:2}, at: btrfs_tree_read_lock_atomic+0x34/0x140 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&eb->lock); lock(&eb->lock); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by rsync/2610: #0: ffff896107212b90 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: walk_component+0x10c/0x190 #1: ffff8961757b1130 (&eb->lock){++++}-{2:2}, at: btrfs_tree_read_lock_atomic+0x34/0x140 stack backtrace: CPU: 1 PID: 2610 Comm: rsync Not tainted 5.10.0-0.rc1.20201028gited8780e3f2ec.57.fc34.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack+0x8b/0xb0 __lock_acquire.cold+0x12d/0x2a4 ? kvm_sched_clock_read+0x14/0x30 ? sched_clock+0x5/0x10 lock_acquire+0xc8/0x400 ? btrfs_tree_read_lock_atomic+0x34/0x140 ? read_block_for_search.isra.0+0xdd/0x320 _raw_read_lock+0x3d/0xa0 ? btrfs_tree_read_lock_atomic+0x34/0x140 btrfs_tree_read_lock_atomic+0x34/0x140 btrfs_search_slot+0x616/0x9a0 btrfs_lookup_dir_item+0x6c/0xb0 btrfs_lookup_dentry+0xa8/0x520 ? lockdep_init_map_waits+0x4c/0x210 btrfs_lookup+0xe/0x30 __lookup_slow+0x10f/0x1e0 walk_component+0x11b/0x190 path_lookupat+0x72/0x1c0 filename_lookup+0x97/0x180 ? strncpy_from_user+0x96/0x1e0 ? getname_flags.part.0+0x45/0x1a0 vfs_statx+0x64/0x100 ? lockdep_hardirqs_on_prepare+0xff/0x180 ? _raw_spin_unlock_irqrestore+0x41/0x50 __do_sys_newlstat+0x26/0x40 ? lockdep_hardirqs_on_prepare+0xff/0x180 ? syscall_enter_from_user_mode+0x27/0x80 ? syscall_enter_from_user_mode+0x27/0x80 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 I have also seen a report of lockdep complaining about the lock class that was looked up being the same as the lock class on the lock we were using, but I can't find the report. These are problems that occur because we do not have the lockdep class set on the extent buffer until _after_ we read the eb in properly. This is problematic for concurrent readers, because we will create the extent buffer, lock it, and then attempt to read the extent buffer. If a second thread comes in and tries to do a search down the same path they'll get the above lockdep splat because the class isn't set properly on the extent buffer. There was a good reason for this, we generally didn't know the real owner of the eb until we read it, specifically in refcounted roots. However now all refcounted roots have the same class name, so we no longer need to worry about this. For non-refcounted trees we know which root we're on based on the parent. Fix this by setting the lockdep class on the eb at creation time instead of read time. 
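Concretely, the class is now set in alloc_extent_buffer() right after the eb is allocated, before it is published and can be found by concurrent readers (sketch matching the extent_io.c hunk below):

  eb = __alloc_extent_buffer(fs_info, start, len);
  if (!eb)
          return ERR_PTR(-ENOMEM);
  /* owner_root and level are passed in by the caller now, so the lockdep
   * class is correct before any other thread can look this eb up. */
  btrfs_set_buffer_lockdep_class(owner_root, eb, level);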
This will fix the splat and the weirdness where the class changes in the middle of locking the block. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 --- fs/btrfs/extent-tree.c | 8 +++++--- fs/btrfs/extent_io.c | 1 + fs/btrfs/volumes.c | 1 - 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6abe3c8f025d..f0161d2ae2a4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -554,9 +554,6 @@ static int validate_extent_buffer(struct extent_buffer *eb) goto out; } - btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), - eb, found_level); - csum_tree_block(eb, result); if (memcmp_extent_buffer(eb, result, 0, csum_size)) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0d1b12e5b166..56ea380f5a17 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4634,6 +4634,11 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ERR_PTR(-EUCLEAN); } + /* + * This needs to stay, because we could allocate a freed block from an + * old tree into a new tree, so we need to make sure this new block is + * set to the appropriate level and owner. + */ btrfs_set_buffer_lockdep_class(owner, buf, level); __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); @@ -5022,9 +5027,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, root->root_key.objectid, level - 1); if (IS_ERR(next)) return PTR_ERR(next); - - btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, - level - 1); reada = 1; } btrfs_tree_lock(next); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bd0d4d195d76..ecca6d6ec90a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5205,6 +5205,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return ERR_PTR(-ENOMEM); + btrfs_set_buffer_lockdep_class(owner_root, eb, level); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++, index++) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cf9a48e27ab0..998f25cddf2b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6908,7 +6908,6 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) if (IS_ERR(sb)) return PTR_ERR(sb); set_extent_buffer_uptodate(sb); - btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); /* * The sb extent buffer is artificial and just used to read the system array. * set_extent_buffer_uptodate() call does not properly mark all it's -- cgit v1.2.3 From 5893dfb98f257805b26e499a2d5d9190f2db7484 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 4 Nov 2020 11:07:32 +0000 Subject: btrfs: refactor btrfs_drop_extents() to make it easier to extend There are many arguments for __btrfs_drop_extents() and its wrapper btrfs_drop_extents(), which makes it hard to add more arguments to it and requires changing every caller. I have added a couple myself back in 2014 commit 1acae57b161e ("Btrfs: faster file extent item replace operations") and therefore know firsthand that it is a bit cumbersome to add additional arguments to these functions. 
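For a sense of scale, this is the prototype being replaced, next to the calling convention after the change (a sketch; the exact structure is defined in the ctree.h hunk below):

  /* Before: every caller passes all of these positionally. */
  int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, struct btrfs_inode *inode,
                           struct btrfs_path *path, u64 start, u64 end,
                           u64 *drop_end, int drop_cache, int replace_extent,
                           u32 extent_item_size, int *key_inserted);

  /* After: zero the args struct and fill only what the caller needs,
   * e.g. the plain "drop this range" case. */
  struct btrfs_drop_extents_args drop_args = { 0 };

  drop_args.start = start;
  drop_args.end = end;
  drop_args.drop_cache = true;
  ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);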
Since I will need to add more arguments in a subsequent bug fix, this change is preparatory work and adds a data structure that holds all the arguments, for both input and output, that are passed to this function, with some comments in the structure's definition mentioning what each field is and how it relates to other fields. Callers of this function need only to zero out the content of the structure and setup only the fields they need. This also removes the need to have both __btrfs_drop_extents() and btrfs_drop_extents(), so now we have a single function named btrfs_drop_extents() that takes a pointer to this new data structure (struct btrfs_drop_extents_args). Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 63 ++++++++++++++++++--- fs/btrfs/file.c | 154 +++++++++++++++++++++++++++------------------------- fs/btrfs/inode.c | 41 +++++++++----- fs/btrfs/reflink.c | 7 ++- fs/btrfs/tree-log.c | 28 +++++++--- 5 files changed, 186 insertions(+), 107 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 461d1d52aaba..3071b0eccd82 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1233,6 +1233,58 @@ struct btrfs_replace_extent_info { int insertions; }; +/* Arguments for btrfs_drop_extents() */ +struct btrfs_drop_extents_args { + /* Input parameters */ + + /* + * If NULL, btrfs_drop_extents() will allocate and free its own path. + * If 'replace_extent' is true, this must not be NULL. Also the path + * is always released except if 'replace_extent' is true and + * btrfs_drop_extents() sets 'extent_inserted' to true, in which case + * the path is kept locked. + */ + struct btrfs_path *path; + /* Start offset of the range to drop extents from */ + u64 start; + /* End (exclusive, last byte + 1) of the range to drop extents from */ + u64 end; + /* If true drop all the extent maps in the range */ + bool drop_cache; + /* + * If true it means we want to insert a new extent after dropping all + * the extents in the range. If this is true, the 'extent_item_size' + * parameter must be set as well and the 'extent_inserted' field will + * be set to true by btrfs_drop_extents() if it could insert the new + * extent. + * Note: when this is set to true the path must not be NULL. + */ + bool replace_extent; + /* + * Used if 'replace_extent' is true. Size of the file extent item to + * insert after dropping all existing extents in the range + */ + u32 extent_item_size; + + /* Output parameters */ + + /* + * Set to the minimum between the input parameter 'end' and the end + * (exclusive, last byte + 1) of the last dropped extent. This is always + * set even if btrfs_drop_extents() returns an error. + */ + u64 drop_end; + /* + * Only set if 'replace_extent' is true. Set to true if we were able + * to insert a replacement extent after dropping all extents in the + * range, otherwise set to false by btrfs_drop_extents(). + * Also, if btrfs_drop_extents() has set this to true it means it + * returned with the path locked, otherwise if it has set this to + * false it has returned with the path released. 
+ */ + bool extent_inserted; +}; + struct btrfs_file_private { void *filldir_buf; }; @@ -3119,16 +3171,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, int skip_pinned); extern const struct file_operations btrfs_file_operations; -int __btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, - struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache, - int replace_extent, - u32 extent_item_size, - int *key_inserted); int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, u64 start, - u64 end, int drop_cache); + struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_drop_extents_args *args); int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, struct btrfs_replace_extent_info *extent_info, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 97b5b183272f..1648b6bfa2e7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -678,13 +678,9 @@ next: * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. */ -int __btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, - struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache, - int replace_extent, - u32 extent_item_size, - int *key_inserted) +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_drop_extents_args *args) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; @@ -694,12 +690,12 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_key new_key; struct inode *vfs_inode = &inode->vfs_inode; u64 ino = btrfs_ino(inode); - u64 search_start = start; + u64 search_start = args->start; u64 disk_bytenr = 0; u64 num_bytes = 0; u64 extent_offset = 0; u64 extent_end = 0; - u64 last_end = start; + u64 last_end = args->start; int del_nr = 0; int del_slot = 0; int extent_type; @@ -709,11 +705,25 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int update_refs; int found = 0; int leafs_visited = 0; + struct btrfs_path *path = args->path; + + args->extent_inserted = false; + + /* Must always have a path if ->replace_extent is true */ + ASSERT(!(args->replace_extent && !args->path)); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + } - if (drop_cache) - btrfs_drop_extent_cache(inode, start, end - 1, 0); + if (args->drop_cache) + btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0); - if (start >= inode->disk_i_size && !replace_extent) + if (args->start >= inode->disk_i_size && !args->replace_extent) modify_tree = 0; update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || @@ -724,7 +734,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, search_start, modify_tree); if (ret < 0) break; - if (ret > 0 && path->slots[0] > 0 && search_start == start) { + if (ret > 0 && path->slots[0] > 0 && search_start == args->start) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); if (key.objectid == ino && @@ -759,7 +769,7 @@ next_slot: path->slots[0]++; goto next_slot; } - if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end) break; fi = btrfs_item_ptr(leaf, 
path->slots[0], @@ -801,7 +811,7 @@ next_slot: } found = 1; - search_start = max(key.offset, start); + search_start = max(key.offset, args->start); if (recow || !modify_tree) { modify_tree = -1; btrfs_release_path(path); @@ -812,7 +822,7 @@ next_slot: * | - range to drop - | * | -------- extent -------- | */ - if (start > key.offset && end < extent_end) { + if (args->start > key.offset && args->end < extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; @@ -820,7 +830,7 @@ next_slot: } memcpy(&new_key, &key, sizeof(new_key)); - new_key.offset = start; + new_key.offset = args->start; ret = btrfs_duplicate_item(trans, root, path, &new_key); if (ret == -EAGAIN) { @@ -834,15 +844,15 @@ next_slot: fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_num_bytes(leaf, fi, - start - key.offset); + args->start - key.offset); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - extent_offset += start - key.offset; + extent_offset += args->start - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, - extent_end - start); + extent_end - args->start); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) { @@ -852,11 +862,11 @@ next_slot: btrfs_init_data_ref(&ref, root->root_key.objectid, new_key.objectid, - start - extent_offset); + args->start - extent_offset); ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); /* -ENOMEM */ } - key.offset = start; + key.offset = args->start; } /* * From here on out we will have actually dropped something, so @@ -868,23 +878,24 @@ next_slot: * | ---- range to drop ----- | * | -------- extent -------- | */ - if (start <= key.offset && end < extent_end) { + if (args->start <= key.offset && args->end < extent_end) { if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; } memcpy(&new_key, &key, sizeof(new_key)); - new_key.offset = end; + new_key.offset = args->end; btrfs_set_item_key_safe(fs_info, path, &new_key); - extent_offset += end - key.offset; + extent_offset += args->end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, - extent_end - end); + extent_end - args->end); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) - inode_sub_bytes(vfs_inode, end - key.offset); + inode_sub_bytes(vfs_inode, + args->end - key.offset); break; } @@ -893,7 +904,7 @@ next_slot: * | ---- range to drop ----- | * | -------- extent -------- | */ - if (start > key.offset && end >= extent_end) { + if (args->start > key.offset && args->end >= extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; @@ -901,11 +912,12 @@ next_slot: } btrfs_set_file_extent_num_bytes(leaf, fi, - start - key.offset); + args->start - key.offset); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) - inode_sub_bytes(vfs_inode, extent_end - start); - if (end == extent_end) + inode_sub_bytes(vfs_inode, + extent_end - args->start); + if (args->end == extent_end) break; path->slots[0]++; @@ -916,7 +928,7 @@ next_slot: * | ---- range to drop ----- | * | ------ extent ------ | */ - if (start <= key.offset && end >= extent_end) { + if (args->start <= key.offset && args->end >= extent_end) { delete_extent_item: if (del_nr == 0) { del_slot = path->slots[0]; @@ -946,7 +958,7 @@ delete_extent_item: extent_end - key.offset); } - if (end == extent_end) + if (args->end == extent_end) break; 
if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { @@ -976,7 +988,7 @@ delete_extent_item: * Set path->slots[0] to first slot, so that after the delete * if items are move off from our leaf to its immediate left or * right neighbor leafs, we end up with a correct and adjusted - * path->slots[0] for our insertion (if replace_extent != 0). + * path->slots[0] for our insertion (if args->replace_extent). */ path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); @@ -990,14 +1002,14 @@ delete_extent_item: * which case it unlocked our path, so check path->locks[0] matches a * write lock. */ - if (!ret && replace_extent && leafs_visited == 1 && + if (!ret && args->replace_extent && leafs_visited == 1 && path->locks[0] == BTRFS_WRITE_LOCK && btrfs_leaf_free_space(leaf) >= - sizeof(struct btrfs_item) + extent_item_size) { + sizeof(struct btrfs_item) + args->extent_item_size) { key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; + key.offset = args->start; if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { struct btrfs_key slot_key; @@ -1005,30 +1017,18 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, &extent_item_size, 1); - *key_inserted = 1; + setup_items_for_insert(root, path, &key, + &args->extent_item_size, 1); + args->extent_inserted = true; } - if (!replace_extent || !(*key_inserted)) + if (!args->path) + btrfs_free_path(path); + else if (!args->extent_inserted) btrfs_release_path(path); - if (drop_end) - *drop_end = found ? min(end, last_end) : end; - return ret; -} - -int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, u64 start, - u64 end, int drop_cache) -{ - struct btrfs_path *path; - int ret; +out: + args->drop_end = found ? 
min(args->end, last_end) : args->end; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start, - end, NULL, drop_cache, 0, 0, NULL); - btrfs_free_path(path); return ret; } @@ -2607,6 +2607,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); u64 ino_size = round_up(inode->i_size, fs_info->sectorsize); @@ -2615,7 +2616,6 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, struct btrfs_block_rsv *rsv; unsigned int rsv_count; u64 cur_offset; - u64 drop_end; u64 len = end - start; int ret = 0; @@ -2654,10 +2654,12 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, trans->block_rsv = rsv; cur_offset = start; + drop_args.path = path; + drop_args.end = end + 1; + drop_args.drop_cache = true; while (cur_offset < end) { - ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, - cur_offset, end + 1, &drop_end, - 1, 0, 0, NULL); + drop_args.start = cur_offset; + ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); if (ret != -ENOSPC) { /* * When cloning we want to avoid transaction aborts when @@ -2674,10 +2676,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, trans->block_rsv = &fs_info->trans_block_rsv; - if (!extent_info && cur_offset < drop_end && + if (!extent_info && cur_offset < drop_args.drop_end && cur_offset < ino_size) { ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_end); + cur_offset, drop_args.drop_end); if (ret) { /* * If we failed then we didn't insert our hole @@ -2688,7 +2690,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); break; } - } else if (!extent_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_args.drop_end) { /* * We are past the i_size here, but since we didn't * insert holes we need to clear the mapped area so we @@ -2696,7 +2698,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, * file extent is inserted here. */ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), - cur_offset, drop_end - cur_offset); + cur_offset, + drop_args.drop_end - cur_offset); if (ret) { /* * We couldn't clear our area, so we could @@ -2708,8 +2711,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } } - if (extent_info && drop_end > extent_info->file_offset) { - u64 replace_len = drop_end - extent_info->file_offset; + if (extent_info && + drop_args.drop_end > extent_info->file_offset) { + u64 replace_len = drop_args.drop_end - + extent_info->file_offset; ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, replace_len); @@ -2722,7 +2727,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, extent_info->file_offset += replace_len; } - cur_offset = drop_end; + cur_offset = drop_args.drop_end; ret = btrfs_update_inode(trans, root, inode); if (ret) @@ -2781,25 +2786,26 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, * will not record the existence of the hole region * [existing_hole_start, lockend]. 
*/ - if (drop_end <= end) - drop_end = end + 1; + if (drop_args.drop_end <= end) + drop_args.drop_end = end + 1; /* * Don't insert file hole extent item if it's for a range beyond eof * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). */ - if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) { + if (!extent_info && cur_offset < ino_size && + cur_offset < drop_args.drop_end) { ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_end); + cur_offset, drop_args.drop_end); if (ret) { /* Same comment as above. */ btrfs_abort_transaction(trans, ret); goto out_trans; } - } else if (!extent_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_args.drop_end) { /* See the comment in the loop above for the reasoning here. */ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), - cur_offset, drop_end - cur_offset); + cur_offset, drop_args.drop_end - cur_offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d2ef8308ad0a..25764de68b92 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -202,7 +202,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * no overlapping inline items exist in the btree */ static int insert_inline_extent(struct btrfs_trans_handle *trans, - struct btrfs_path *path, int extent_inserted, + struct btrfs_path *path, bool extent_inserted, struct btrfs_root *root, struct inode *inode, u64 start, size_t size, size_t compressed_size, int compress_type, @@ -316,6 +316,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, int compress_type, struct page **compressed_pages) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; @@ -326,8 +327,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, u64 data_len = inline_len; int ret; struct btrfs_path *path; - int extent_inserted = 0; - u32 extent_item_size; if (compressed_size) data_len = compressed_size; @@ -353,16 +352,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, } trans->block_rsv = &inode->block_rsv; + drop_args.path = path; + drop_args.start = start; + drop_args.end = aligned_end; + drop_args.drop_cache = true; + drop_args.replace_extent = true; + if (compressed_size && compressed_pages) - extent_item_size = btrfs_file_extent_calc_inline_size( + drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( compressed_size); else - extent_item_size = btrfs_file_extent_calc_inline_size( + drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( inline_len); - ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end, - NULL, 1, 1, extent_item_size, - &extent_inserted); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -370,7 +373,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); - ret = insert_inline_extent(trans, path, extent_inserted, + ret = insert_inline_extent(trans, path, drop_args.extent_inserted, root, &inode->vfs_inode, start, inline_len, compressed_size, compress_type, compressed_pages); @@ -2568,7 +2571,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, u64 disk_bytenr = 
btrfs_stack_file_extent_disk_bytenr(stack_fi); u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); - int extent_inserted = 0; + struct btrfs_drop_extents_args drop_args = { 0 }; int ret; path = btrfs_alloc_path(); @@ -2584,13 +2587,16 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. */ - ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, - file_pos + num_bytes, NULL, 0, - 1, sizeof(*stack_fi), &extent_inserted); + drop_args.path = path; + drop_args.start = file_pos; + drop_args.end = file_pos + num_bytes; + drop_args.replace_extent = true; + drop_args.extent_item_size = sizeof(*stack_fi); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; - if (!extent_inserted) { + if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; @@ -4748,6 +4754,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_trans_handle *trans; + struct btrfs_drop_extents_args drop_args = { 0 }; int ret; /* @@ -4770,7 +4777,11 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); + drop_args.start = offset; + drop_args.end = offset + len; + drop_args.drop_cache = true; + + ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 5d0bb7c3dc33..67728ea3ed47 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -163,6 +163,7 @@ static int clone_copy_inline_extent(struct inode *dst, const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; + struct btrfs_drop_extents_args drop_args = { 0 }; int ret; struct btrfs_key key; @@ -252,7 +253,11 @@ copy_inline_extent: trans = NULL; goto out; } - ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); + drop_args.path = path; + drop_args.start = drop_start; + drop_args.end = aligned_end; + drop_args.drop_cache = true; + ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args); if (ret) goto out; ret = btrfs_insert_empty_item(trans, root, path, new_key, size); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b1f97be57bb3..89ff063cae24 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -576,6 +576,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_fs_info *fs_info = root->fs_info; int found_type; u64 extent_end; @@ -653,7 +654,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); /* drop any overlapping extents */ - ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); + drop_args.start = start; + drop_args.end = extent_end; + drop_args.drop_cache = true; + ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); if (ret) goto out; @@ -2576,6 +2580,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, * those prealloc extents just after replaying them. 
*/ if (S_ISREG(mode)) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct inode *inode; u64 from; @@ -2586,8 +2591,12 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, } from = ALIGN(i_size_read(inode), root->fs_info->sectorsize); - ret = btrfs_drop_extents(wc->trans, root, inode, - from, (u64)-1, 1); + drop_args.start = from; + drop_args.end = (u64)-1; + drop_args.drop_cache = true; + ret = btrfs_drop_extents(wc->trans, root, + BTRFS_I(inode), + &drop_args); if (!ret) { /* Update the inode's nbytes. */ ret = btrfs_update_inode(wc->trans, @@ -4185,6 +4194,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_log_ctx *ctx) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *log = root->log_root; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; @@ -4193,19 +4203,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans, u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; - int extent_inserted = 0; ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) return ret; - ret = __btrfs_drop_extents(trans, log, inode, path, em->start, - em->start + em->len, NULL, 0, 1, - sizeof(*fi), &extent_inserted); + drop_args.path = path; + drop_args.start = em->start; + drop_args.end = em->start + em->len; + drop_args.replace_extent = true; + drop_args.extent_item_size = sizeof(*fi); + ret = btrfs_drop_extents(trans, log, inode, &drop_args); if (ret) return ret; - if (!extent_inserted) { + if (!drop_args.extent_inserted) { key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = em->start; -- cgit v1.2.3 From 7f458a3873ae94efe1f37c8b96c97e7298769e98 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 4 Nov 2020 11:07:33 +0000 Subject: btrfs: fix race when defragmenting leads to unnecessary IO When defragmenting we skip ranges that have holes or inline extents, so that we don't do unnecessary IO and waste space. We do this check when calling should_defrag_range() at btrfs_defrag_file(). However we do it without holding the inode's lock. The reason we do it like this is to avoid blocking other tasks for too long, that possibly want to operate on other file ranges, since after the call to should_defrag_range() and before locking the inode, we trigger a synchronous page cache readahead. However before we were able to lock the inode, some other task might have punched a hole in our range, or we may now have an inline extent there, in which case we should not set the range for defrag anymore since that would cause unnecessary IO and make us waste space (i.e. allocating extents to contain zeros for a hole). So after we locked the inode and the range in the iotree, check again if we have holes or an inline extent, and if we do, just skip the range. I hit this while testing my next patch that fixes races when updating an inode's number of bytes (subject "btrfs: update the number of bytes used by an inode atomically"), and it depends on this change in order to work correctly. Alternatively I could rework that other patch to detect holes and flag their range with the 'new delalloc' bit, but this itself fixes an efficiency problem due a race that from a functional point of view is not harmful (it could be triggered with btrfs/062 from fstests). 
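For reference, the window can also be exercised with a plain stress of defrag racing against hole punching. The script below is only an illustrative sketch and not the btrfs/062 test; the device, mount point, file name and sizes are made up, and since the race is not functionally harmful the script only stresses the window rather than detecting a failure:

$ cat defrag-punch-stress.sh
#!/bin/bash
# Illustrative sketch only: while a file is being defragmented, keep
# punching holes into it, so ranges accepted by should_defrag_range()
# may turn into holes before the defrag code locks the range.
MNT=/mnt/sdi
DEV=/dev/sdi

mkfs.btrfs -f $DEV > /dev/null
mount $DEV $MNT

# Create a file out of many small synced writes so it has many extents.
for ((i = 0; i < 1024; i++)); do
    xfs_io -f -s -c "pwrite $((i * 64 * 1024)) 64K" $MNT/foobar > /dev/null
done

# Punch holes at random offsets in the background.
(
    while :; do
        off=$(( (RANDOM % 1024) * 64 * 1024 ))
        xfs_io -c "fpunch $off 64K" $MNT/foobar > /dev/null
    done
) &
punch_pid=$!

# Meanwhile, defragment the file repeatedly.
for ((i = 0; i < 100; i++)); do
    btrfs filesystem defragment $MNT/foobar
done

kill $punch_pid &> /dev/null
wait
umount $DEV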
CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ea40a19cc4cb..2904f92c3813 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1275,6 +1275,7 @@ static int cluster_pages_for_defrag(struct inode *inode, u64 page_end; u64 page_cnt; u64 start = (u64)start_index << PAGE_SHIFT; + u64 search_start; int ret; int i; int i_done; @@ -1371,6 +1372,40 @@ again: lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state); + + /* + * When defragmenting we skip ranges that have holes or inline extents, + * (check should_defrag_range()), to avoid unnecessary IO and wasting + * space. At btrfs_defrag_file(), we check if a range should be defragged + * before locking the inode and then, if it should, we trigger a sync + * page cache readahead - we lock the inode only after that to avoid + * blocking for too long other tasks that possibly want to operate on + * other file ranges. But before we were able to get the inode lock, + * some other task may have punched a hole in the range, or we may have + * now an inline extent, in which case we should not defrag. So check + * for that here, where we have the inode and the range locked, and bail + * out if that happened. + */ + search_start = page_start; + while (search_start < page_end) { + struct extent_map *em; + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start, + page_end - search_start); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock_range; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + /* Ok, 0 means we did not defrag anything */ + ret = 0; + goto out_unlock_range; + } + search_start = extent_map_end(em); + free_extent_map(em); + } + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state); @@ -1401,6 +1436,10 @@ again: btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return i_done; + +out_unlock_range: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, &cached_state); out: for (i = 0; i < i_done; i++) { unlock_page(pages[i]); -- cgit v1.2.3 From 2766ff61762c3fa19bf30bc0ff72ea5306229f09 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 4 Nov 2020 11:07:34 +0000 Subject: btrfs: update the number of bytes used by an inode atomically There are several occasions where we do not update the inode's number of used bytes atomically, resulting in a concurrent stat(2) syscall to report a value of used blocks that does not correspond to a valid value, that is, a value that does not match neither what we had before the operation nor what we get after the operation completes. 
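The cases described below all come down to the same unprotected ordering. A minimal sketch of the problem, assuming a single extent being replaced ('dropped_bytes' and 'new_bytes' are made-up names for illustration; this is not the actual btrfs code):

	/* Replacing extents in a file range, before this change: */
	inode_sub_bytes(inode, dropped_bytes);	/* old extent(s) dropped  */
	/*
	 * Window: a concurrent stat(2) reads the inode's used bytes here,
	 * via inode_get_bytes(), and sees a value that is neither the old
	 * one nor the final one.
	 */
	inode_add_bytes(inode, new_bytes);	/* new extent(s) inserted */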
In extreme cases it can result in stat(2) reporting zero used blocks, which can cause problems for some userspace tools where they can consider a file with a non-zero size and zero used blocks as completely sparse and skip reading data, as reported/discussed a long time ago in some threads like the following: https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html The cases where this can happen are the following: -> Case 1 If we do a write (buffered or direct IO) against a file region for which there is already an allocated extent (or multiple extents), then we have a short time window where we can report a number of used blocks to stat(2) that does not take into account the file region being overwritten. This short time window happens when completing the ordered extent(s). This happens because when we drop the extents in the write range we decrement the inode's number of bytes and later on when we insert the new extent(s) we increment the number of bytes in the inode, resulting in a short time window where a stat(2) syscall can get an incorrect number of used blocks. If we do writes that overwrite an entire file, then we have a short time window where we report 0 used blocks to stat(2). Example reproducer: $ cat reproducer-1.sh #!/bin/bash MNT=/mnt/sdi DEV=/dev/sdi stat_loop() { trap "wait; exit" SIGTERM local filepath=$1 local expected=$2 local got while :; do got=$(stat -c %b $filepath) if [ $got -ne $expected ]; then echo -n "ERROR: unexpected used blocks" echo " (got: $got expected: $expected)" fi done } mkfs.btrfs -f $DEV > /dev/null # mkfs.xfs -f $DEV > /dev/null # mkfs.ext4 -F $DEV > /dev/null # mkfs.f2fs -f $DEV > /dev/null # mkfs.reiserfs -f $DEV > /dev/null mount $DEV $MNT xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null expected=$(stat -c %b $MNT/foobar) # Create a process to keep calling stat(2) on the file and see if the # reported number of blocks used (disk space used) changes, it should # not because we are not increasing the file size nor punching holes. stat_loop $MNT/foobar $expected & loop_pid=$! for ((i = 0; i < 50000; i++)); do xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null done kill $loop_pid &> /dev/null wait umount $DEV $ ./reproducer-1.sh ERROR: unexpected used blocks (got: 0 expected: 128) ERROR: unexpected used blocks (got: 0 expected: 128) (...) Note that since this is a short time window where the race can happen, the reproducer may not be able to always trigger the bug in one run, or it may trigger it multiple times. -> Case 2 If we do a buffered write against a file region that does not have any allocated extents, like a hole or beyond EOF, then during ordered extent completion we have a short time window where a concurrent stat(2) syscall can report a number of used blocks that does not correspond to the value before or after the write operation, a value that is actually larger than the value after the write completes. This happens because once we start a buffered write into an unallocated file range we increment the inode's 'new_delalloc_bytes', to make sure any stat(2) call gets a correct used blocks value before delalloc is flushed and completes. However at ordered extent completion, after we inserted the new extent, we increment the inode's number of bytes used with the size of the new extent, and only later, when clearing the range in the inode's iotree, we decrement the inode's 'new_delalloc_bytes' counter with the size of the extent. 
So this results in a short time window where a concurrent stat(2) syscall can report a number of used blocks that accounts for the new extent twice. Example reproducer: $ cat reproducer-2.sh #!/bin/bash MNT=/mnt/sdi DEV=/dev/sdi stat_loop() { trap "wait; exit" SIGTERM local filepath=$1 local expected=$2 local got while :; do got=$(stat -c %b $filepath) if [ $got -ne $expected ]; then echo -n "ERROR: unexpected used blocks" echo " (got: $got expected: $expected)" fi done } mkfs.btrfs -f $DEV > /dev/null # mkfs.xfs -f $DEV > /dev/null # mkfs.ext4 -F $DEV > /dev/null # mkfs.f2fs -f $DEV > /dev/null # mkfs.reiserfs -f $DEV > /dev/null mount $DEV $MNT touch $MNT/foobar write_size=$((64 * 1024)) for ((i = 0; i < 16384; i++)); do offset=$(($i * $write_size)) xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null blocks_used=$(stat -c %b $MNT/foobar) # Fsync the file to trigger writeback and keep calling stat(2) on it # to see if the number of blocks used changes. stat_loop $MNT/foobar $blocks_used & loop_pid=$! xfs_io -c "fsync" $MNT/foobar kill $loop_pid &> /dev/null wait $loop_pid done umount $DEV $ ./reproducer-2.sh ERROR: unexpected used blocks (got: 265472 expected: 265344) ERROR: unexpected used blocks (got: 284032 expected: 283904) (...) Note that since this is a short time window where the race can happen, the reproducer may not be able to always trigger the bug in one run, or it may trigger it multiple times. -> Case 3 Another case where such problems happen is during other operations that replace extents in a file range with other extents. Those operations are extent cloning, deduplication and fallocate's zero range operation. The cause of the problem is similar to the first case. When we drop the extents from a range, we decrement the inode's number of bytes, and later on, after inserting the new extents we increment it. Since this is not done atomically, a concurrent stat(2) call can see and return a number of used blocks that is smaller than it should be, does not match the number of used blocks before or after the clone/deduplication/zero operation. Like for the first case, when doing a clone, deduplication or zero range operation against an entire file, we end up having a time window where we can report 0 used blocks to a stat(2) call. Example reproducer: $ cat reproducer-3.sh #!/bin/bash MNT=/mnt/sdi DEV=/dev/sdi mkfs.btrfs -f $DEV > /dev/null # mkfs.xfs -f -m reflink=1 $DEV > /dev/null mount $DEV $MNT extent_size=$((64 * 1024)) num_extents=16384 file_size=$(($extent_size * $num_extents)) # File foo has many small extents. xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \ > /dev/null # File bar has much less extents and has exactly the same data as foo. xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null expected=$(stat -c %b $MNT/foo) # Now deduplicate bar into foo. While the deduplication is in progres, # the number of used blocks/file size reported by stat should not change xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null & dedupe_pid=$! while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do used=$(stat -c %b $MNT/foo) if [ $used -ne $expected ]; then echo "Unexpected blocks used: $used (expected: $expected)" fi done umount $DEV $ ./reproducer-3.sh Unexpected blocks used: 2076800 (expected: 2097152) Unexpected blocks used: 2097024 (expected: 2097152) Unexpected blocks used: 2079872 (expected: 2097152) (...) 
Note that since this is a short time window where the race can happen, the reproducer may not be able to always trigger the bug in one run, or it may trigger it multiple times. So fix this by: 1) Making btrfs_drop_extents() not decrement the VFS inode's number of bytes, and instead return the number of bytes; 2) Making any code that drops extents and adds new extents update the inode's number of bytes atomically, while holding the btrfs inode's spinlock, which is also used by the stat(2) callback to get the inode's number of bytes; 3) For ranges in the inode's iotree that are marked as 'delalloc new', corresponding to previously unallocated ranges, increment the inode's number of bytes when clearing the 'delalloc new' bit from the range, in the same critical section that decrements the inode's 'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock. An alternative would be to have btrfs_getattr() wait for any IO (ordered extents in progress) and lock the whole range (0 to (u64)-1) while it computes the number of blocks used. But that would mean blocking stat(2), which is a heavily used syscall and expected to be fast, waiting for writes, clone/dedupe, fallocate, page reads, fiemap, etc. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 +- fs/btrfs/ctree.h | 8 +++ fs/btrfs/extent-io-tree.h | 16 +++++- fs/btrfs/extent_io.c | 10 ++-- fs/btrfs/file.c | 43 +++++++++----- fs/btrfs/inode.c | 140 +++++++++++++++++++++++++++++++++++++--------- fs/btrfs/reflink.c | 2 +- fs/btrfs/tree-log.c | 4 +- 8 files changed, 176 insertions(+), 50 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 8494f62f8aa4..b4c09a12659c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -50,7 +50,8 @@ struct btrfs_inode { /* * Lock for counters and all fields used to determine if the inode is in * the log or not (last_trans, last_sub_trans, last_log_commit, - * logged_trans). + * logged_trans), to access/update new_delalloc_bytes and to update the + * VFS' inode number of bytes used. */ spinlock_t lock; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3071b0eccd82..228c5df4b17f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1274,6 +1274,11 @@ struct btrfs_drop_extents_args { * set even if btrfs_drop_extents() returns an error. */ u64 drop_end; /* * The number of allocated bytes found in the range. This can be smaller * than the range's length when there are holes in the range. */ u64 bytes_found; /* * Only set if 'replace_extent' is true.
Set to true if we were able * to insert a replacement extent after dropping all extents in the @@ -3142,6 +3147,9 @@ extern const struct iomap_dio_ops btrfs_dio_ops; int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); +void btrfs_update_inode_bytes(struct btrfs_inode *inode, + const u64 add_bytes, + const u64 del_bytes); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index cab4273ff8d3..5334b3772f18 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -21,10 +21,24 @@ struct io_failure_record; #define EXTENT_NORESERVE (1U << 11) #define EXTENT_QGROUP_RESERVED (1U << 12) #define EXTENT_CLEAR_DATA_RESV (1U << 13) +/* + * Must be cleared only during ordered extent completion or on error paths if we + * did not manage to submit bios and create the ordered extents for the range. + * Should not be cleared during page release and page invalidation (if there is + * an ordered extent in flight), that is left for the ordered extent completion. + */ #define EXTENT_DELALLOC_NEW (1U << 14) +/* + * When an ordered extent successfully completes for a region marked as a new + * delalloc range, use this flag when clearing a new delalloc range to indicate + * that the VFS' inode number of bytes should be incremented and the inode's new + * delalloc bytes decremented, in an atomic way to prevent races with stat(2). + */ +#define EXTENT_ADD_INODE_BYTES (1U << 15) #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) -#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \ + EXTENT_ADD_INODE_BYTES) /* * Redefined bits above which are used only in the device allocation tree, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ecca6d6ec90a..3bbb3bdd395b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4423,12 +4423,14 @@ static int try_release_extent_state(struct extent_io_tree *tree, ret = 0; } else { /* - * at this point we can safely clear everything except the - * locked bit and the nodatasum bit + * At this point we can safely clear everything except the + * locked bit, the nodatasum bit and the delalloc new bit. + * The delalloc new bit will be cleared by ordered extent + * completion. */ ret = __clear_extent_bit(tree, start, end, - ~(EXTENT_LOCKED | EXTENT_NODATASUM), - 0, 0, NULL, mask, NULL); + ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW), + 0, 0, NULL, mask, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1648b6bfa2e7..8a9056b6e2ad 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -677,6 +677,12 @@ next: * If an extent intersects the range but is not entirely inside the range * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. + * + * Note: the VFS' inode number of bytes is not updated, it's up to the caller + * to deal with that. We set the field 'bytes_found' of the arguments structure + * with the number of allocated bytes found in the target range, so that the + * caller can update the inode's number of bytes in an atomic way when + * replacing extents in a range to avoid races with stat(2). 
*/ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, @@ -688,7 +694,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; - struct inode *vfs_inode = &inode->vfs_inode; u64 ino = btrfs_ino(inode); u64 search_start = args->start; u64 disk_bytenr = 0; @@ -707,6 +712,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, int leafs_visited = 0; struct btrfs_path *path = args->path; + args->bytes_found = 0; args->extent_inserted = false; /* Must always have a path if ->replace_extent is true */ @@ -894,8 +900,7 @@ next_slot: extent_end - args->end); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) - inode_sub_bytes(vfs_inode, - args->end - key.offset); + args->bytes_found += args->end - key.offset; break; } @@ -915,8 +920,7 @@ next_slot: args->start - key.offset); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) - inode_sub_bytes(vfs_inode, - extent_end - args->start); + args->bytes_found += extent_end - args->start; if (args->end == extent_end) break; @@ -940,8 +944,7 @@ delete_extent_item: if (update_refs && extent_type == BTRFS_FILE_EXTENT_INLINE) { - inode_sub_bytes(vfs_inode, - extent_end - key.offset); + args->bytes_found += extent_end - key.offset; extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { @@ -954,8 +957,7 @@ delete_extent_item: key.offset - extent_offset); ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); /* -ENOMEM */ - inode_sub_bytes(vfs_inode, - extent_end - key.offset); + args->bytes_found += extent_end - key.offset; } if (args->end == extent_end) @@ -2517,7 +2519,8 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, struct btrfs_replace_extent_info *extent_info, - const u64 replace_len) + const u64 replace_len, + const u64 bytes_to_drop) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2532,8 +2535,10 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, return 0; if (extent_info->disk_offset == 0 && - btrfs_fs_incompat(fs_info, NO_HOLES)) + btrfs_fs_incompat(fs_info, NO_HOLES)) { + btrfs_update_inode_bytes(BTRFS_I(inode), 0, bytes_to_drop); return 0; + } key.objectid = btrfs_ino(BTRFS_I(inode)); key.type = BTRFS_EXTENT_DATA_KEY; @@ -2562,10 +2567,12 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, return ret; /* If it's a hole, nothing more needs to be done. 
*/ - if (extent_info->disk_offset == 0) + if (extent_info->disk_offset == 0) { + btrfs_update_inode_bytes(BTRFS_I(inode), 0, bytes_to_drop); return 0; + } - inode_add_bytes(inode, replace_len); + btrfs_update_inode_bytes(BTRFS_I(inode), replace_len, bytes_to_drop); if (extent_info->is_new_extent && extent_info->insertions == 0) { key.objectid = extent_info->disk_offset; @@ -2660,6 +2667,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, while (cur_offset < end) { drop_args.start = cur_offset; ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + /* If we are punching a hole decrement the inode's byte count */ + if (!extent_info) + btrfs_update_inode_bytes(BTRFS_I(inode), 0, + drop_args.bytes_found); if (ret != -ENOSPC) { /* * When cloning we want to avoid transaction aborts when @@ -2717,7 +2728,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, extent_info->file_offset; ret = btrfs_insert_replace_extent(trans, inode, path, - extent_info, replace_len); + extent_info, replace_len, + drop_args.bytes_found); if (ret) { btrfs_abort_transaction(trans, ret); break; @@ -2814,7 +2826,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } if (extent_info) { ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, - extent_info->data_len); + extent_info->data_len, + drop_args.bytes_found); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 25764de68b92..2db11ab4ecbf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -223,8 +223,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, if (compressed_size && compressed_pages) cur_size = compressed_size; - inode_add_bytes(inode, size); - if (!extent_inserted) { struct btrfs_key key; size_t datasize; @@ -299,8 +297,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, * could end up racing with unlink. 
*/ BTRFS_I(inode)->disk_i_size = inode->i_size; - ret = btrfs_update_inode(trans, root, inode); - fail: return ret; } @@ -385,6 +381,16 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, goto out; } + btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found); + ret = btrfs_update_inode(trans, root, &inode->vfs_inode); + if (ret && ret != -ENOSPC) { + btrfs_abort_transaction(trans, ret); + goto out; + } else if (ret == -ENOSPC) { + ret = 1; + goto out; + } + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: @@ -2144,6 +2150,8 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, spin_lock(&inode->lock); ASSERT(inode->new_delalloc_bytes >= len); inode->new_delalloc_bytes -= len; + if (*bits & EXTENT_ADD_INODE_BYTES) + inode_add_bytes(&inode->vfs_inode, len); spin_unlock(&inode->lock); } } @@ -2561,9 +2569,11 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 file_pos, struct btrfs_file_extent_item *stack_fi, + const bool update_inode_bytes, u64 qgroup_reserved) { struct btrfs_root *root = inode->root; + const u64 sectorsize = root->fs_info->sectorsize; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; @@ -2615,7 +2625,24 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - inode_add_bytes(&inode->vfs_inode, num_bytes); + /* + * If we dropped an inline extent here, we know the range where it is + * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the + * number of bytes only for that range contaning the inline extent. + * The remaining of the range will be processed when clearning the + * EXTENT_DELALLOC_BIT bit through the ordered extent completion. + */ + if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { + u64 inline_size = round_down(drop_args.bytes_found, sectorsize); + + inline_size = drop_args.bytes_found - inline_size; + btrfs_update_inode_bytes(inode, sectorsize, inline_size); + drop_args.bytes_found -= inline_size; + num_bytes -= sectorsize; + } + + if (update_inode_bytes) + btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; @@ -2653,6 +2680,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, { struct btrfs_file_extent_item stack_fi; u64 logical_len; + bool update_inode_bytes; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); @@ -2668,9 +2696,18 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ + /* + * For delalloc, when completing an ordered extent we update the inode's + * bytes when clearing the range in the inode's io tree, so pass false + * as the argument 'update_inode_bytes' to insert_reserved_file_extent(), + * except if the ordered extent was truncated. 
+ */ + update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || + test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); + return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), oe->file_offset, &stack_fi, - oe->qgroup_rsv); + update_inode_bytes, oe->qgroup_rsv); } /* @@ -2692,10 +2729,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) u64 logical_len = ordered_extent->num_bytes; bool freespace_inode; bool truncated = false; - bool range_locked = false; - bool clear_new_delalloc_bytes = false; bool clear_reserved_extent = true; - unsigned int clear_bits; + unsigned int clear_bits = EXTENT_DEFRAG; start = ordered_extent->file_offset; end = start + ordered_extent->num_bytes - 1; @@ -2703,7 +2738,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) - clear_new_delalloc_bytes = true; + clear_bits |= EXTENT_DELALLOC_NEW; freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode)); @@ -2742,7 +2777,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - range_locked = true; + clear_bits |= EXTENT_LOCKED; lock_extent_bits(io_tree, start, end, &cached_state); if (freespace_inode) @@ -2789,6 +2824,17 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + /* + * If this is a new delalloc range, clear its new delalloc flag to + * update the inode's number of bytes. This needs to be done first + * before updating the inode item. + */ + if ((clear_bits & EXTENT_DELALLOC_NEW) && + !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, + EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, + 0, 0, &cached_state); + btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ @@ -2797,11 +2843,6 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) } ret = 0; out: - clear_bits = EXTENT_DEFRAG; - if (range_locked) - clear_bits |= EXTENT_LOCKED; - if (clear_new_delalloc_bytes) - clear_bits |= EXTENT_DELALLOC_NEW; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, (clear_bits & EXTENT_LOCKED) ? 
1 : 0, 0, &cached_state); @@ -4790,10 +4831,12 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)), offset, 0, 0, len, 0, len, 0, 0, 0); - if (ret) + if (ret) { btrfs_abort_transaction(trans, ret); - else + } else { + btrfs_update_inode_bytes(BTRFS_I(inode), 0, drop_args.bytes_found); btrfs_update_inode(trans, root, inode); + } btrfs_end_transaction(trans); return ret; } @@ -8117,6 +8160,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, u64 start; u64 end; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; + bool found_ordered = false; + bool completed_ordered = false; /* * we have the page locked, so new writeback can't start, @@ -8138,15 +8183,17 @@ again: start = page_start; ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { + found_ordered = true; end = min(page_end, ordered->file_offset + ordered->num_bytes - 1); /* - * IO on this page will never be started, so we need - * to account for any ordered extents now + * IO on this page will never be started, so we need to account + * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW + * here, must leave that up for the ordered extent completion. */ if (!inode_evicting) clear_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state); /* @@ -8168,8 +8215,10 @@ again: if (btrfs_dec_test_ordered_pending(inode, &ordered, start, - end - start + 1, 1)) + end - start + 1, 1)) { btrfs_finish_ordered_io(ordered); + completed_ordered = true; + } } btrfs_put_ordered_extent(ordered); if (!inode_evicting) { @@ -8198,10 +8247,23 @@ again: */ btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { + bool delete = true; + + /* + * If there's an ordered extent for this range and we have not + * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set + * in the range for the ordered extent completion. We must also + * not delete the range, otherwise we would lose that bit (and + * any other bits set in the range). Make sure EXTENT_UPTODATE + * is cleared if we don't delete, otherwise it can lead to + * corruptions if the i_size is extented later. 
+ */ + if (found_ordered && !completed_ordered) + delete = false; clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, - &cached_state); + EXTENT_DELALLOC | EXTENT_UPTODATE | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, + delete, &cached_state); __btrfs_releasepage(page, GFP_NOFS); } @@ -8750,6 +8812,7 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { u64 delalloc_bytes; + u64 inode_bytes; struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; u32 bi_flags = BTRFS_I(inode)->flags; @@ -8776,8 +8839,9 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat, spin_lock(&BTRFS_I(inode)->lock); delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; + inode_bytes = inode_get_bytes(inode); spin_unlock(&BTRFS_I(inode)->lock); - stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + + stat->blocks = (ALIGN(inode_bytes, blocksize) + ALIGN(delalloc_bytes, blocksize)) >> 9; return 0; } @@ -9586,7 +9650,8 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( if (trans) { ret = insert_reserved_file_extent(trans, BTRFS_I(inode), - file_offset, &stack_fi, ret); + file_offset, &stack_fi, + true, ret); if (ret) return ERR_PTR(ret); return trans; @@ -10202,6 +10267,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, } #endif +/* + * Update the number of bytes used in the VFS' inode. When we replace extents in + * a range (clone, dedupe, fallocate's zero range), we must update the number of + * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls + * always get a correct value. + */ +void btrfs_update_inode_bytes(struct btrfs_inode *inode, + const u64 add_bytes, + const u64 del_bytes) +{ + if (add_bytes == del_bytes) + return; + + spin_lock(&inode->lock); + if (del_bytes > 0) + inode_sub_bytes(&inode->vfs_inode, del_bytes); + if (add_bytes > 0) + inode_add_bytes(&inode->vfs_inode, add_bytes); + spin_unlock(&inode->lock); +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 67728ea3ed47..4bbc5f52b752 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -268,7 +268,7 @@ copy_inline_extent: btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), size); - inode_add_bytes(dst, datal); + btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); out: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 89ff063cae24..932a74a236eb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -832,8 +832,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (ret) goto out; - inode_add_bytes(inode, nbytes); update_inode: + btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); ret = btrfs_update_inode(trans, root, inode); out: if (inode) @@ -2598,6 +2598,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, BTRFS_I(inode), &drop_args); if (!ret) { + inode_sub_bytes(inode, + drop_args.bytes_found); /* Update the inode's nbytes. 
*/ ret = btrfs_update_inode(wc->trans, root, inode); -- cgit v1.2.3 From bacce86ae8a7b8b3c7d8398eb57d151a808043d1 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 6 Nov 2020 16:06:33 +0800 Subject: btrfs: drop unused argument step from btrfs_free_extra_devids Commit cf89af146b7e ("btrfs: dev-replace: fail mount if we don't have replace item with target device") dropped the multi stage operation of btrfs_free_extra_devids() that does not need to check replace target anymore and we can remove the 'step' argument. Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 ++++++------ fs/btrfs/volumes.c | 8 ++++---- fs/btrfs/volumes.h | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f0161d2ae2a4..8c87a1caefff 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3149,11 +3149,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } /* - * Keep the devid that is marked to be the target device for the - * device replace procedure + * At this point we know all the devices that make this filesystem, + * including the seed devices but we don't know yet if the replace + * target is required. So free devices that are not part of this + * filesystem but skip the replace traget device which is checked + * below in btrfs_init_dev_replace(). */ - btrfs_free_extra_devids(fs_devices, 0); - + btrfs_free_extra_devids(fs_devices); if (!fs_devices->latest_bdev) { btrfs_err(fs_info, "failed to read devices"); goto fail_tree_roots; @@ -3200,8 +3202,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_block_groups; } - btrfs_free_extra_devids(fs_devices, 1); - ret = btrfs_sysfs_add_fsid(fs_devices); if (ret) { btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 998f25cddf2b..d7cc74f59293 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1044,7 +1044,7 @@ error: } static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, - int step, struct btrfs_device **latest_dev) + struct btrfs_device **latest_dev) { struct btrfs_device *device, *next; @@ -1089,16 +1089,16 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, * After we have read the system tree and know devids belonging to this * filesystem, remove the device which does not belong there. 
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *latest_dev = NULL; struct btrfs_fs_devices *seed_dev; mutex_lock(&uuid_mutex); - __btrfs_free_extra_devids(fs_devices, step, &latest_dev); + __btrfs_free_extra_devids(fs_devices, &latest_dev); list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) - __btrfs_free_extra_devids(seed_dev, step, &latest_dev); + __btrfs_free_extra_devids(seed_dev, &latest_dev); fs_devices->latest_bdev = latest_dev->bdev; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 34fd440fd30b..7d1faf1c4ce4 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -449,7 +449,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); int btrfs_forget_devices(const char *path); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev); struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, -- cgit v1.2.3 From 3a160a933111241376799244e3587747af574b89 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 3 Nov 2020 13:49:42 +0800 Subject: btrfs: drop never met disk total bytes check in verify_one_dev_extent Drop the condition in verify_one_dev_extent: btrfs_device::disk_total_bytes is set even for a seed device. The comment is wrong, the size is properly set when cloning the device. Commit 1b3922a8bc74 ("btrfs: Use real device structure to verify dev extent") introduced it but it's unclear why the total_disk_bytes was 0. Theoretically, all devices (including missing and seed) marked with the BTRFS_DEV_STATE_IN_FS_METADATA flag get the total_disk_bytes updated at fill_device_from_item():
  open_ctree()
    btrfs_read_chunk_tree()
      read_one_dev()
        open_seed_device()
          fill_device_from_item()
Even if verify_one_dev_extent() reports total_disk_bytes == 0, that is a bug to be fixed somewhere else and not in verify_one_dev_extent(), as it's just a messenger. It is never expected that total_disk_bytes will be zero. The function fill_device_from_item() does the job of reading it from the item and updating btrfs_device::disk_total_bytes. So both the missing device and the seed devices do have their disk_total_bytes updated. btrfs_find_device can also return a device from fs_info->seed_list because it searches it as well. Furthermore, if there is a power loss while removing a device, we could end up with a device whose total_bytes is 0, and that's still valid. Instead, introduce a check against the maximum block device size in read_one_dev().
Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d7cc74f59293..cd72bdc7455e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6869,6 +6869,16 @@ static int read_one_dev(struct extent_buffer *leaf, } fill_device_from_item(leaf, dev_item, device); + if (device->bdev) { + u64 max_total_bytes = i_size_read(device->bdev->bd_inode); + + if (device->total_bytes > max_total_bytes) { + btrfs_err(fs_info, + "device total_bytes should be at most %llu but found %llu", + max_total_bytes, device->total_bytes); + return -EINVAL; + } + } set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { @@ -7598,21 +7608,6 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, goto out; } - /* It's possible this device is a dummy for seed device */ - if (dev->disk_total_bytes == 0) { - struct btrfs_fs_devices *devs; - - devs = list_first_entry(&fs_info->fs_devices->seed_list, - struct btrfs_fs_devices, seed_list); - dev = btrfs_find_device(devs, devid, NULL, NULL, false); - if (!dev) { - btrfs_err(fs_info, "failed to find seed devid %llu", - devid); - ret = -EUCLEAN; - goto out; - } - } - if (physical_offset + physical_len > dev->disk_total_bytes) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", -- cgit v1.2.3 From b2598edf8b36f8b7c52e3f5f611c49cbd1c67b36 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 3 Nov 2020 13:49:43 +0800 Subject: btrfs: remove unused argument seed from btrfs_find_device Commit 343694eee8d8 ("btrfs: switch seed device to list api"), missed to check if the parameter seed is true in the function btrfs_find_device(). This tells it whether to traverse the seed device list or not. After this commit, the argument is unused and can be removed. In device_list_add() it's not necessary because fs_devices always points to the device's fs_devices. So with the devid+uuid matching, it will find the right device and return, thus not needing to traverse seed devices. Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 8 ++++---- fs/btrfs/ioctl.c | 4 ++-- fs/btrfs/scrub.c | 4 ++-- fs/btrfs/volumes.c | 22 ++++++++++------------ fs/btrfs/volumes.h | 2 +- 5 files changed, 19 insertions(+), 21 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 10638537b9ef..dc3481014f16 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -96,7 +96,7 @@ no_valid_dev_replace_entry_found: * a replace target, fail the mount. */ if (btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { + BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { btrfs_err(fs_info, "found replace target device without a valid replace item"); ret = -EUCLEAN; @@ -159,7 +159,7 @@ no_valid_dev_replace_entry_found: * replace target, fail the mount. 
*/ if (btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { + BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { btrfs_err(fs_info, "replace devid present without an active replace item"); ret = -EUCLEAN; @@ -171,10 +171,10 @@ no_valid_dev_replace_entry_found: case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, - src_devid, NULL, NULL, true); + src_devid, NULL, NULL); dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, BTRFS_DEV_REPLACE_DEVID, - NULL, NULL, true); + NULL, NULL); /* * allow 'btrfs dev replace_cancel' if src/tgt device is * missing diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2904f92c3813..e7ccba83e8a5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1717,7 +1717,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, btrfs_info(fs_info, "resizing devid %llu", devid); } - device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); + device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (!device) { btrfs_info(fs_info, "resizer unable to find device %llu", devid); @@ -3360,7 +3360,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, rcu_read_lock(); dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid, - NULL, true); + NULL); if (!dev) { ret = -ENODEV; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 57ee06d92150..74acb7ca6eda 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3885,7 +3885,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, goto out_free_ctx; mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -4062,7 +4062,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct scrub_ctx *sctx = NULL; mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (dev) sctx = dev->scrub_ctx; if (sctx) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cd72bdc7455e..7132af058a95 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -822,7 +822,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, } else { mutex_lock(&fs_devices->device_list_mutex); device = btrfs_find_device(fs_devices, devid, - disk_super->dev_item.uuid, NULL, false); + disk_super->dev_item.uuid, NULL); /* * If this disk has been pulled into an fs devices created by @@ -2294,10 +2294,10 @@ static struct btrfs_device *btrfs_find_device_by_path( dev_uuid = disk_super->dev_item.uuid; if (btrfs_fs_incompat(fs_info, METADATA_UUID)) device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->metadata_uuid, true); + disk_super->metadata_uuid); else device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->fsid, true); + disk_super->fsid); btrfs_release_disk_super(disk_super); if (!device) @@ -2317,7 +2317,7 @@ struct btrfs_device *btrfs_find_device_by_devspec( if (devid) { device = btrfs_find_device(fs_info->fs_devices, devid, NULL, - NULL, true); + NULL); if (!device) return ERR_PTR(-ENOENT); return device; @@ -2466,7 +2466,7 @@ next_slot: 
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid, true); + fs_uuid); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { @@ -6459,8 +6459,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, * If @seed is true, traverse through the seed devices. */ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid, - bool seed) + u64 devid, u8 *uuid, u8 *fsid) { struct btrfs_device *device; struct btrfs_fs_devices *seed_devs; @@ -6667,7 +6666,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, - devid, uuid, NULL, true); + devid, uuid, NULL); if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); @@ -6806,7 +6805,7 @@ static int read_one_dev(struct extent_buffer *leaf, } device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid, true); + fs_uuid); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, @@ -7469,8 +7468,7 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, int i; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL, - true); + dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { @@ -7601,7 +7599,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } /* Make sure no dev extent is beyond device bondary */ - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7d1faf1c4ce4..5e08274d1252 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -466,7 +466,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid, bool seed); + u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); int btrfs_balance(struct btrfs_fs_info *fs_info, -- cgit v1.2.3 From ffeb03cfe2b49b73da7b325a31714003761fc6d5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 6 Nov 2020 16:27:29 -0500 Subject: btrfs: cleanup the locking in btrfs_next_old_leaf We are carrying around this next_rw_lock from when we would do spinning vs blocking read locks. Now that we have the rwsem locking we can simply use the read lock flag unconditionally and the read lock helpers. 
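(Illustrative aside, not part of the patch: the reason next_rw_lock can go away can be modelled in a few lines of user-space C with a POSIX rwlock. The toy_path structure and the toy_* helpers below are invented names for illustration only, not btrfs APIs; the sketch only shows that when a walk takes nothing but read locks, there is no per-level lock type left to remember.)

	#include <pthread.h>
	#include <stddef.h>

	/* Toy model: this walk only ever takes read locks, so the stored
	 * mode per level is a constant rather than a saved variable. */
	enum toy_lock_mode { TOY_LOCK_NONE = 0, TOY_LOCK_READ, TOY_LOCK_WRITE };

	struct toy_path {
		pthread_rwlock_t *nodes[8];
		enum toy_lock_mode locks[8];
	};

	static void toy_lock_level(struct toy_path *p, int level,
				   pthread_rwlock_t *next)
	{
		pthread_rwlock_rdlock(next);		/* always a read lock here */
		p->nodes[level] = next;
		p->locks[level] = TOY_LOCK_READ;	/* constant, nothing to carry */
	}

	static void toy_unlock_level(struct toy_path *p, int level)
	{
		if (p->locks[level] != TOY_LOCK_NONE)
			pthread_rwlock_unlock(p->nodes[level]);	/* read unlock always fits */
		p->locks[level] = TOY_LOCK_NONE;
		p->nodes[level] = NULL;
	}

Because the mode is always "read", the kernel code can call the plain read lock/unlock helpers directly and delete the variable, which is what the diff below does.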
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e2673c29e733..ec904db041e6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5272,7 +5272,6 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key key; u32 nritems; int ret; - int next_rw_lock = 0; nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) @@ -5282,7 +5281,6 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, again: level = 1; next = NULL; - next_rw_lock = 0; btrfs_release_path(path); path->keep_locks = 1; @@ -5346,12 +5344,11 @@ again: } if (next) { - btrfs_tree_unlock_rw(next, next_rw_lock); + btrfs_tree_read_unlock(next); free_extent_buffer(next); } next = c; - next_rw_lock = path->locks[level]; ret = read_block_for_search(root, path, &next, level, slot, &key); if (ret == -EAGAIN) @@ -5382,7 +5379,6 @@ again: BTRFS_NESTING_RIGHT, path->recurse); } - next_rw_lock = BTRFS_READ_LOCK; } break; } @@ -5391,13 +5387,13 @@ again: level--; c = path->nodes[level]; if (path->locks[level]) - btrfs_tree_unlock_rw(c, path->locks[level]); + btrfs_tree_read_unlock(c); free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; if (!path->skip_locking) - path->locks[level] = next_rw_lock; + path->locks[level] = BTRFS_READ_LOCK; if (!level) break; @@ -5411,11 +5407,9 @@ again: goto done; } - if (!path->skip_locking) { + if (!path->skip_locking) __btrfs_tree_read_lock(next, BTRFS_NESTING_RIGHT, path->recurse); - next_rw_lock = BTRFS_READ_LOCK; - } } ret = 0; done: -- cgit v1.2.3 From 0e46318df8a120ba5f1e15210c32cfab33b09f40 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 6 Nov 2020 16:27:30 -0500 Subject: btrfs: unlock to current level in btrfs_next_old_leaf Filipe reported the following lockdep splat ====================================================== WARNING: possible circular locking dependency detected 5.10.0-rc2-btrfs-next-71 #1 Not tainted ------------------------------------------------------ find/324157 is trying to acquire lock: ffff8ebc48d293a0 (btrfs-tree-01#2/3){++++}-{3:3}, at: __btrfs_tree_read_lock+0x32/0x1a0 [btrfs] but task is already holding lock: ffff8eb9932c5088 (btrfs-tree-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x32/0x1a0 [btrfs] which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (btrfs-tree-00){++++}-{3:3}: lock_acquire+0xd8/0x490 down_write_nested+0x44/0x120 __btrfs_tree_lock+0x27/0x120 [btrfs] btrfs_search_slot+0x2a3/0xc50 [btrfs] btrfs_insert_empty_items+0x58/0xa0 [btrfs] insert_with_overflow+0x44/0x110 [btrfs] btrfs_insert_xattr_item+0xb8/0x1d0 [btrfs] btrfs_setxattr+0xd6/0x4c0 [btrfs] btrfs_setxattr_trans+0x68/0x100 [btrfs] __vfs_setxattr+0x66/0x80 __vfs_setxattr_noperm+0x70/0x200 vfs_setxattr+0x6b/0x120 setxattr+0x125/0x240 path_setxattr+0xba/0xd0 __x64_sys_setxattr+0x27/0x30 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (btrfs-tree-01#2/3){++++}-{3:3}: check_prev_add+0x91/0xc60 __lock_acquire+0x1689/0x3130 lock_acquire+0xd8/0x490 down_read_nested+0x45/0x220 __btrfs_tree_read_lock+0x32/0x1a0 [btrfs] btrfs_next_old_leaf+0x27d/0x580 [btrfs] btrfs_real_readdir+0x1e3/0x4b0 [btrfs] iterate_dir+0x170/0x1c0 __x64_sys_getdents64+0x83/0x140 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-tree-00); lock(btrfs-tree-01#2/3); lock(btrfs-tree-00); lock(btrfs-tree-01#2/3); *** DEADLOCK *** 5 locks held by find/324157: #0: ffff8ebc502c6e00 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x4d/0x60 #1: ffff8eb97f689980 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: iterate_dir+0x52/0x1c0 #2: ffff8ebaec00ca58 (btrfs-tree-02#2){++++}-{3:3}, at: __btrfs_tree_read_lock+0x32/0x1a0 [btrfs] #3: ffff8eb98f986f78 (btrfs-tree-01#2){++++}-{3:3}, at: __btrfs_tree_read_lock+0x32/0x1a0 [btrfs] #4: ffff8eb9932c5088 (btrfs-tree-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x32/0x1a0 [btrfs] stack backtrace: CPU: 2 PID: 324157 Comm: find Not tainted 5.10.0-rc2-btrfs-next-71 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 check_noncircular+0xff/0x110 ? mark_lock.part.0+0x468/0xe90 check_prev_add+0x91/0xc60 __lock_acquire+0x1689/0x3130 ? kvm_clock_read+0x14/0x30 ? kvm_sched_clock_read+0x5/0x10 lock_acquire+0xd8/0x490 ? __btrfs_tree_read_lock+0x32/0x1a0 [btrfs] down_read_nested+0x45/0x220 ? __btrfs_tree_read_lock+0x32/0x1a0 [btrfs] __btrfs_tree_read_lock+0x32/0x1a0 [btrfs] btrfs_next_old_leaf+0x27d/0x580 [btrfs] btrfs_real_readdir+0x1e3/0x4b0 [btrfs] iterate_dir+0x170/0x1c0 __x64_sys_getdents64+0x83/0x140 ? filldir+0x1d0/0x1d0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This happens because btrfs_next_old_leaf searches down to our current key, and then walks up the path until we can move to the next slot, and then reads back down the path so we get the next leaf. However it doesn't unlock any lower levels until it replaces them with the new extent buffer. This is technically fine, but of course causes lockdep to complain, because we could be holding locks on lower levels while locking upper levels. Fix this by dropping all nodes below the level that we use as our new starting point before we start reading back down the path. This also allows us to drop the nested/recursive locking magic, because we're no longer locking two nodes at the same level anymore. 
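(Illustrative aside, not part of the patch: the inversion lockdep reports above is a plain AB-BA ordering problem. The standalone pthreads sketch below is not btrfs code -- the lock and function names are made up -- but it shows the rule the fix enforces: the leaf-walking reader drops the lower-level lock before taking the next one, so it never holds two tree levels at once and cannot form a cycle with a path that locks top-down.)

	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t lower = PTHREAD_RWLOCK_INITIALIZER; /* think: leaf level */
	static pthread_rwlock_t upper = PTHREAD_RWLOCK_INITIALIZER; /* think: parent node */

	static void *walk_to_next_leaf(void *arg)
	{
		(void)arg;
		/* Reader walking upward, loosely analogous to btrfs_next_old_leaf(). */
		pthread_rwlock_rdlock(&lower);
		/* ... read the current leaf ... */
		pthread_rwlock_unlock(&lower);	/* drop the lower level first */
		pthread_rwlock_rdlock(&upper);	/* only then move up: one level held */
		/* ... pick the next slot in the parent ... */
		pthread_rwlock_unlock(&upper);
		return NULL;
	}

	static void *modify_tree(void *arg)
	{
		(void)arg;
		/* Writer descending top-down, loosely analogous to a search that cows blocks. */
		pthread_rwlock_wrlock(&upper);
		pthread_rwlock_wrlock(&lower);	/* the "natural" top-down order */
		/* ... modify both levels ... */
		pthread_rwlock_unlock(&lower);
		pthread_rwlock_unlock(&upper);
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;

		pthread_create(&a, NULL, walk_to_next_leaf, NULL);
		pthread_create(&b, NULL, modify_tree, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		printf("reader never holds two levels at once, so no lock-order cycle\n");
		return 0;
	}

If the reader instead kept the lower lock while acquiring the upper one, the two threads would take the same pair of locks in opposite orders, which is exactly the circular dependency in the splat.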
Reported-by: Filipe Manana Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ec904db041e6..7dc3e87828bd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5272,6 +5272,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key key; u32 nritems; int ret; + int i; nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) @@ -5343,9 +5344,19 @@ again: continue; } - if (next) { - btrfs_tree_read_unlock(next); - free_extent_buffer(next); + + /* + * Our current level is where we're going to start from, and to + * make sure lockdep doesn't complain we need to drop our locks + * and nodes from 0 to our current level. + */ + for (i = 0; i < level; i++) { + if (path->locks[level]) { + btrfs_tree_read_unlock(path->nodes[i]); + path->locks[i] = 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; } next = c; @@ -5374,22 +5385,14 @@ again: cond_resched(); goto again; } - if (!ret) { - __btrfs_tree_read_lock(next, - BTRFS_NESTING_RIGHT, - path->recurse); - } + if (!ret) + btrfs_tree_read_lock(next); } break; } path->slots[level] = slot; while (1) { level--; - c = path->nodes[level]; - if (path->locks[level]) - btrfs_tree_read_unlock(c); - - free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; if (!path->skip_locking) @@ -5408,8 +5411,7 @@ again: } if (!path->skip_locking) - __btrfs_tree_read_lock(next, BTRFS_NESTING_RIGHT, - path->recurse); + btrfs_tree_read_lock(next); } ret = 0; done: -- cgit v1.2.3 From 2f5239dcb26b5037dc21b58fe8bb0e80243f4f6f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 6 Nov 2020 16:27:31 -0500 Subject: btrfs: remove btrfs_path::recurse With my async free space cache loading patches ("btrfs: load free space cache asynchronously") we no longer have a user of path->recurse and can remove it. 
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 5 ++--- fs/btrfs/ctree.h | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7dc3e87828bd..2a53887a6db7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2533,7 +2533,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, * We don't know the level of the root node until we actually * have it read locked */ - b = __btrfs_read_lock_root_node(root, p->recurse); + b = __btrfs_read_lock_root_node(root, 0); level = btrfs_header_level(b); if (level > write_lock_level) goto out; @@ -2802,8 +2802,7 @@ cow_done: btrfs_tree_lock(b); p->locks[level] = BTRFS_WRITE_LOCK; } else { - __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL, - p->recurse); + __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL, 0); p->locks[level] = BTRFS_READ_LOCK; } p->nodes[level] = b; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 228c5df4b17f..f974f3748816 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -374,7 +374,6 @@ struct btrfs_path { unsigned int search_commit_root:1; unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; - unsigned int recurse:1; }; #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) -- cgit v1.2.3 From 4048daedb910f83f080c6bb03c78af794aebdff5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 6 Nov 2020 16:27:32 -0500 Subject: btrfs: locking: remove the recursion handling code Now that we're no longer using recursion, rip out all of the supporting code. Follow up patches will clean up the callers of these functions. The extent_buffer::lock_owner is still retained as it allows safety checks in btrfs_init_new_buffer for the case that the free space cache is corrupted and we try to allocate a block that we are currently using and have locked in the path. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/locking.c | 68 ++++-------------------------------------------------- 1 file changed, 4 insertions(+), 64 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 5260660b655a..1e36a66fcefa 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -25,43 +25,18 @@ * - reader/reader sharing * - try-lock semantics for readers and writers * - * Additionally we need one level nesting recursion, see below. The rwsem - * implementation does opportunistic spinning which reduces number of times the - * locking task needs to sleep. - * - * - * Lock recursion - * -------------- - * - * A write operation on a tree might indirectly start a look up on the same - * tree. This can happen when btrfs_cow_block locks the tree and needs to - * lookup free extents. - * - * btrfs_cow_block - * .. - * alloc_tree_block_no_bg_flush - * btrfs_alloc_tree_block - * btrfs_reserve_extent - * .. - * load_free_space_cache - * .. - * btrfs_lookup_file_extent - * btrfs_search_slot - * + * The rwsem implementation does opportunistic spinning which reduces number of + * times the locking task needs to sleep. 
*/ /* * __btrfs_tree_read_lock - lock extent buffer for read * @eb: the eb to be locked * @nest: the nesting level to be used for lockdep - * @recurse: if this lock is able to be recursed + * @recurse: unused * * This takes the read lock on the extent buffer, using the specified nesting * level for lockdep purposes. - * - * If you specify recurse = true, then we will allow this to be taken if we - * currently own the lock already. This should only be used in specific - * usecases, and the subsequent unlock will not change the state of the lock. */ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, bool recurse) @@ -71,31 +46,7 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne if (trace_btrfs_tree_read_lock_enabled()) start_ns = ktime_get_ns(); - if (unlikely(recurse)) { - /* First see if we can grab the lock outright */ - if (down_read_trylock(&eb->lock)) - goto out; - - /* - * Ok still doesn't necessarily mean we are already holding the - * lock, check the owner. - */ - if (eb->lock_owner != current->pid) { - down_read_nested(&eb->lock, nest); - goto out; - } - - /* - * Ok we have actually recursed, but we should only be recursing - * once, so blow up if we're already recursed, otherwise set - * ->lock_recursed and carry on. - */ - BUG_ON(eb->lock_recursed); - eb->lock_recursed = true; - goto out; - } down_read_nested(&eb->lock, nest); -out: eb->lock_owner = current->pid; trace_btrfs_tree_read_lock(eb, start_ns); } @@ -136,22 +87,11 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) } /* - * Release read lock. If the read lock was recursed then the lock stays in the - * original state that it was before it was recursively locked. + * Release read lock. */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { trace_btrfs_tree_read_unlock(eb); - /* - * if we're nested, we have the write lock. No new locking - * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_recursed - * field only matters to the lock owner. - */ - if (eb->lock_recursed && current->pid == eb->lock_owner) { - eb->lock_recursed = false; - return; - } eb->lock_owner = 0; up_read(&eb->lock); } -- cgit v1.2.3 From 1bb96598410ccd52f4224e5584b8015c6d61b81f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 6 Nov 2020 16:27:33 -0500 Subject: btrfs: merge back btrfs_read_lock_root_node helpers We no longer have recursive locking and there's no need for separate helpers that allowed the transition to rwsem with minimal code changes. 
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- fs/btrfs/locking.c | 5 ++--- fs/btrfs/locking.h | 8 +------- 3 files changed, 4 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2a53887a6db7..8148007fa259 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2533,7 +2533,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, * We don't know the level of the root node until we actually * have it read locked */ - b = __btrfs_read_lock_root_node(root, 0); + b = btrfs_read_lock_root_node(root); level = btrfs_header_level(b); if (level > write_lock_level) goto out; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 1e36a66fcefa..4190b824eca6 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -184,14 +184,13 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) * * Return: root extent buffer with read lock held */ -struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, - bool recurse) +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) { struct extent_buffer *eb; while (1) { eb = btrfs_root_node(root); - __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse); + btrfs_tree_read_lock(eb); if (eb == root->node) break; btrfs_tree_read_unlock(eb); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index f8f2fd835582..91441e31db18 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -94,13 +94,7 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); -struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, - bool recurse); - -static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) -{ - return __btrfs_read_lock_root_node(root, false); -} +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); #ifdef CONFIG_BTRFS_DEBUG static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { -- cgit v1.2.3 From fe596ca3d3b5b005d940a20ee30a6f1c13dd2d19 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 6 Nov 2020 16:27:34 -0500 Subject: btrfs: use btrfs_tree_read_lock in btrfs_search_slot We no longer use recursion, so __btrfs_tree_read_lock(BTRFS_NESTING_NORMAL) == btrfs_tree_read_lock. Replace this call with the simple helper. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 8148007fa259..778a6d580545 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2802,7 +2802,7 @@ cow_done: btrfs_tree_lock(b); p->locks[level] = BTRFS_WRITE_LOCK; } else { - __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL, 0); + btrfs_tree_read_lock(b); p->locks[level] = BTRFS_READ_LOCK; } p->nodes[level] = b; -- cgit v1.2.3 From 0ecae6fffe66db8d0692469eb22c141bea210291 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 6 Nov 2020 16:27:35 -0500 Subject: btrfs: remove the recurse parameter from __btrfs_tree_read_lock It is completely unused now, remove it. 
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/locking.c | 6 ++---- fs/btrfs/locking.h | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 4190b824eca6..5fafc5e89bb7 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -33,13 +33,11 @@ * __btrfs_tree_read_lock - lock extent buffer for read * @eb: the eb to be locked * @nest: the nesting level to be used for lockdep - * @recurse: unused * * This takes the read lock on the extent buffer, using the specified nesting * level for lockdep purposes. */ -void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, - bool recurse) +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) { u64 start_ns = 0; @@ -53,7 +51,7 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne void btrfs_tree_read_lock(struct extent_buffer *eb) { - __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false); + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL); } /* diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 91441e31db18..a2e1f1f5c6e3 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -87,8 +87,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); -void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, - bool recurse); +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); -- cgit v1.2.3 From a55463c9f0ffa7429d3b0bd3fc2d0b3f31a3d299 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 6 Nov 2020 16:27:36 -0500 Subject: btrfs: remove extent_buffer::recursed It is unused everywhere now, it can be removed. 
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 1 - fs/btrfs/extent_io.h | 1 - 2 files changed, 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3bbb3bdd395b..574e5cd3d7ea 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4966,7 +4966,6 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->fs_info = fs_info; eb->bflags = 0; init_rwsem(&eb->lock); - eb->lock_recursed = false; btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, &fs_info->allocated_ebs); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c76697fc3120..cf95e9e4acb9 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -87,7 +87,6 @@ struct extent_buffer { int read_mirror; struct rcu_head rcu_head; pid_t lock_owner; - bool lock_recursed; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; -- cgit v1.2.3 From 76aea5379678f901a7b229fe3e3434e594ec8e4d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:48:53 +0200 Subject: btrfs: make btrfs_inode_safe_disk_i_size_write take btrfs_inode Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/file-item.c | 18 +++++++++--------- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 12 ++++++------ fs/btrfs/reflink.c | 2 +- 5 files changed, 18 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f974f3748816..560cbc46e31d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3042,7 +3042,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); -void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size); +void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 40daf1a4b46c..1a5651bebbaf 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -38,27 +38,27 @@ * Finally new_i_size should only be set in the case of truncate where we're not * ready to use i_size_read() as the limiter yet. 
*/ -void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size) +void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 start, end, i_size; int ret; - i_size = new_i_size ?: i_size_read(inode); + i_size = new_i_size ?: i_size_read(&inode->vfs_inode); if (btrfs_fs_incompat(fs_info, NO_HOLES)) { - BTRFS_I(inode)->disk_i_size = i_size; + inode->disk_i_size = i_size; return; } - spin_lock(&BTRFS_I(inode)->lock); - ret = find_contiguous_extent_bit(&BTRFS_I(inode)->file_extent_tree, 0, - &start, &end, EXTENT_DIRTY); + spin_lock(&inode->lock); + ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start, + &end, EXTENT_DIRTY); if (!ret && start == 0) i_size = min(i_size, end + 1); else i_size = 0; - BTRFS_I(inode)->disk_i_size = i_size; - spin_unlock(&BTRFS_I(inode)->lock); + inode->disk_i_size = i_size; + spin_unlock(&inode->lock); } /** diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8a9056b6e2ad..096be647c234 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3073,7 +3073,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode, inode->i_ctime = current_time(inode); i_size_write(inode, end); - btrfs_inode_safe_disk_i_size_write(inode, 0); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); ret = btrfs_update_inode(trans, root, inode); ret2 = btrfs_end_transaction(trans); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2db11ab4ecbf..c6bc76830cd4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2760,7 +2760,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - btrfs_inode_safe_disk_i_size_write(inode, 0); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -2835,7 +2835,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, 0, 0, &cached_state); - btrfs_inode_safe_disk_i_size_write(inode, 0); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); @@ -4633,7 +4633,7 @@ out: ASSERT(last_size >= new_size); if (!ret && last_size > new_size) last_size = new_size; - btrfs_inode_safe_disk_i_size_write(inode, last_size); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), last_size); unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, &cached_state); } @@ -4995,7 +4995,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) } i_size_write(inode, newsize); - btrfs_inode_safe_disk_i_size_write(inode, 0); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); pagecache_isize_extended(inode, oldsize, newsize); ret = btrfs_update_inode(trans, root, inode); btrfs_drew_write_unlock(&root->snapshot_lock); @@ -8558,7 +8558,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) ret = PTR_ERR(trans); goto out; } - btrfs_inode_safe_disk_i_size_write(inode, 0); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } if (trans) { @@ -9789,7 +9789,7 @@ next: else i_size = cur_offset; i_size_write(inode, i_size); - btrfs_inode_safe_disk_i_size_write(inode, 0); + 
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } ret = btrfs_update_inode(trans, root, inode); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 4bbc5f52b752..952b8f33619c 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -31,7 +31,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, endoff = destoff + olen; if (endoff > inode->i_size) { i_size_write(inode, endoff); - btrfs_inode_safe_disk_i_size_write(inode, 0); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } ret = btrfs_update_inode(trans, root, inode); -- cgit v1.2.3 From 90dffd0cff894b8b50015b19a515b93553c694ba Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:48:54 +0200 Subject: btrfs: make insert_prealloc_file_extent take btrfs_inode Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c6bc76830cd4..1d44e63160b4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9623,7 +9623,8 @@ out_unlock: static struct btrfs_trans_handle *insert_prealloc_file_extent( struct btrfs_trans_handle *trans_in, - struct inode *inode, struct btrfs_key *ins, + struct btrfs_inode *inode, + struct btrfs_key *ins, u64 file_offset) { struct btrfs_file_extent_item stack_fi; @@ -9644,12 +9645,12 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */ - ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len); + ret = btrfs_qgroup_release_data(inode, file_offset, len); if (ret < 0) return ERR_PTR(ret); if (trans) { - ret = insert_reserved_file_extent(trans, BTRFS_I(inode), + ret = insert_reserved_file_extent(trans, inode, file_offset, &stack_fi, true, ret); if (ret) @@ -9671,7 +9672,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( if (!path) return ERR_PTR(-ENOMEM); - ret = btrfs_replace_file_extents(inode, path, file_offset, + ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset, file_offset + len - 1, &extent_info, &trans); btrfs_free_path(path); @@ -9727,7 +9728,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, clear_offset += ins.offset; last_alloc = ins.offset; - trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); + trans = insert_prealloc_file_extent(trans, BTRFS_I(inode), + &ins, cur_offset); /* * Now that we inserted the prealloc extent we can finally * decrement the number of reservations in the block group. 
-- cgit v1.2.3 From 507433985caf668ca6205570c84520913455966c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:48:55 +0200 Subject: btrfs: make btrfs_truncate_inode_items take btrfs_inode Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/inode.c | 43 +++++++++++++++++++++++-------------------- fs/btrfs/tree-log.c | 5 ++--- 4 files changed, 27 insertions(+), 25 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 560cbc46e31d..ccc2339c6568 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3072,7 +3072,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, int front); int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct inode *inode, u64 new_size, + struct btrfs_inode *inode, u64 new_size, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 58bd2d3e54db..d8010df20b58 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -273,7 +273,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, * We skip the throttling logic for free space cache inodes, so we don't * need to check for -EAGAIN. */ - ret = btrfs_truncate_inode_items(trans, root, inode, + ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), 0, BTRFS_EXTENT_DATA_KEY); if (ret) goto fail; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1d44e63160b4..3dd0b472501e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4299,7 +4299,7 @@ out: */ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct inode *inode, + struct btrfs_inode *inode, u64 new_size, u32 min_type) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4320,7 +4320,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int pending_del_slot = 0; int extent_type = -1; int ret; - u64 ino = btrfs_ino(BTRFS_I(inode)); + u64 ino = btrfs_ino(inode); u64 bytes_deleted = 0; bool be_nice = false; bool should_throttle = false; @@ -4334,7 +4334,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * off from time to time. This means all inodes in subvolume roots, * reloc roots, and data reloc roots. */ - if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && + if (!btrfs_is_free_space_inode(inode) && test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) be_nice = true; @@ -4344,7 +4344,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, path->reada = READA_BACK; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + lock_extent_bits(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* @@ -4352,7 +4352,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * new size is not block aligned since we will be keeping the * last block of the extent just the way it is. */ - btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size, + btrfs_drop_extent_cache(inode, ALIGN(new_size, fs_info->sectorsize), (u64)-1, 0); } @@ -4363,8 +4363,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * it is used to drop the logged items. So we shouldn't kill the delayed * items. 
*/ - if (min_type == 0 && root == BTRFS_I(inode)->root) - btrfs_kill_delayed_inode_items(BTRFS_I(inode)); + if (min_type == 0 && root == inode->root) + btrfs_kill_delayed_inode_items(inode); key.objectid = ino; key.offset = (u64)-1; @@ -4420,14 +4420,13 @@ search_again: btrfs_file_extent_num_bytes(leaf, fi); trace_btrfs_truncate_show_fi_regular( - BTRFS_I(inode), leaf, fi, - found_key.offset); + inode, leaf, fi, found_key.offset); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_ram_bytes(leaf, fi); trace_btrfs_truncate_show_fi_inline( - BTRFS_I(inode), leaf, fi, path->slots[0], + inode, leaf, fi, path->slots[0], found_key.offset); } item_end--; @@ -4466,7 +4465,8 @@ search_again: if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && extent_start != 0) - inode_sub_bytes(inode, num_dec); + inode_sub_bytes(&inode->vfs_inode, + num_dec); btrfs_mark_buffer_dirty(leaf); } else { extent_num_bytes = @@ -4481,7 +4481,8 @@ search_again: found_extent = 1; if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - inode_sub_bytes(inode, num_dec); + inode_sub_bytes(&inode->vfs_inode, + num_dec); } } clear_len = num_dec; @@ -4516,7 +4517,8 @@ search_again: } if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - inode_sub_bytes(inode, item_end + 1 - new_size); + inode_sub_bytes(&inode->vfs_inode, + item_end + 1 - new_size); } delete: /* @@ -4524,8 +4526,8 @@ delete: * multiple fsyncs, and in this case we don't want to clear the * file extent range because it's just the log. */ - if (root == BTRFS_I(inode)->root) { - ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), + if (root == inode->root) { + ret = btrfs_inode_clear_file_extent_range(inode, clear_start, clear_len); if (ret) { btrfs_abort_transaction(trans, ret); @@ -4633,9 +4635,9 @@ out: ASSERT(last_size >= new_size); if (!ret && last_size > new_size) last_size = new_size; - btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), last_size); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, - (u64)-1, &cached_state); + btrfs_inode_safe_disk_i_size_write(inode, last_size); + unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1, + &cached_state); } btrfs_free_path(path); @@ -5268,7 +5270,8 @@ void btrfs_evict_inode(struct inode *inode) trans->block_rsv = rsv; - ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); + ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), + 0, 0); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -8512,7 +8515,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) trans->block_rsv = rsv; while (1) { - ret = btrfs_truncate_inode_items(trans, root, inode, + ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), inode->i_size, BTRFS_EXTENT_DATA_KEY); trans->block_rsv = &fs_info->trans_block_rsv; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 932a74a236eb..360732a93980 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4378,8 +4378,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, do { ret = btrfs_truncate_inode_items(trans, root->log_root, - &inode->vfs_inode, - truncate_offset, + inode, truncate_offset, BTRFS_EXTENT_DATA_KEY); } while (ret == -EAGAIN); if (ret) @@ -5306,7 +5305,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &inode->runtime_flags); while(1) { ret = btrfs_truncate_inode_items(trans, - log, &inode->vfs_inode, 0, 0); + log, inode, 0, 0); if (ret != -EAGAIN) break; } -- cgit v1.2.3 From 
72e7e6edd376facb350a4211e400518daffa3d08 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:48:56 +0200 Subject: btrfs: make btrfs_finish_ordered_io btrfs_inode-centric Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3dd0b472501e..0e0a1c8c1255 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2717,11 +2717,11 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, */ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) { - struct inode *inode = ordered_extent->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans = NULL; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *io_tree = &inode->io_tree; struct extent_state *cached_state = NULL; u64 start, end; int compress_type = 0; @@ -2740,14 +2740,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) clear_bits |= EXTENT_DELALLOC_NEW; - freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode)); + freespace_inode = btrfs_is_free_space_inode(inode); if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { ret = -EIO; goto out; } - btrfs_free_io_failure_record(BTRFS_I(inode), start, end); + btrfs_free_io_failure_record(inode, start, end); if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; @@ -2760,7 +2760,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); + btrfs_inode_safe_disk_i_size_write(inode, 0); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -2770,8 +2770,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans = NULL; goto out; } - trans->block_rsv = &BTRFS_I(inode)->block_rsv; - ret = btrfs_update_inode_fallback(trans, root, inode); + trans->block_rsv = &inode->block_rsv; + ret = btrfs_update_inode_fallback(trans, root, &inode->vfs_inode); if (ret) /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; @@ -2790,13 +2790,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - trans->block_rsv = &BTRFS_I(inode)->block_rsv; + trans->block_rsv = &inode->block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); - ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), + ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + logical_len); @@ -2810,8 +2810,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->disk_num_bytes); } } - unpin_extent_cache(&BTRFS_I(inode)->extent_tree, - ordered_extent->file_offset, + 
unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset, ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -2831,19 +2830,19 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) */ if ((clear_bits & EXTENT_DELALLOC_NEW) && !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, + clear_extent_bit(&inode->io_tree, start, end, EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, 0, 0, &cached_state); - btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); - ret = btrfs_update_inode_fallback(trans, root, inode); + btrfs_inode_safe_disk_i_size_write(inode, 0); + ret = btrfs_update_inode_fallback(trans, root, &inode->vfs_inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; } ret = 0; out: - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, + clear_extent_bit(&inode->io_tree, start, end, clear_bits, (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, &cached_state); @@ -2858,7 +2857,7 @@ out: clear_extent_uptodate(io_tree, unwritten_start, end, NULL); /* Drop the cache for the part of the extent we didn't write. */ - btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0); + btrfs_drop_extent_cache(inode, unwritten_start, end, 0); /* * If the ordered extent had an IOERR or something else went @@ -2893,7 +2892,7 @@ out: * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. */ - btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent); + btrfs_remove_ordered_extent(inode, ordered_extent); /* once for us */ btrfs_put_ordered_extent(ordered_extent); -- cgit v1.2.3 From f3fbcaef59927b811a5219e4201510e2df11e6ac Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:48:57 +0200 Subject: btrfs: make btrfs_delayed_update_inode take btrfs_inode Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 12 +++++++----- fs/btrfs/delayed-inode.h | 3 ++- fs/btrfs/inode.c | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index b2a883ec8e7e..70c0340d839c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1815,27 +1815,29 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) } int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) + struct btrfs_root *root, + struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node; int ret = 0; - delayed_node = btrfs_get_or_create_delayed_node(BTRFS_I(inode)); + delayed_node = btrfs_get_or_create_delayed_node(inode); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); mutex_lock(&delayed_node->mutex); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - fill_stack_inode_item(trans, &delayed_node->inode_item, inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, + &inode->vfs_inode); goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, BTRFS_I(inode), + ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, delayed_node); if (ret) goto release_node; - fill_stack_inode_item(trans, &delayed_node->inode_item, inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode); set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); 
delayed_node->count++; atomic_inc(&root->fs_info->delayed_root->items); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index ca96ef007d8f..b2412160c5bc 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -110,7 +110,8 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode); + struct btrfs_root *root, + struct btrfs_inode *inode); int btrfs_fill_inode(struct inode *inode, u32 *rdev); int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0e0a1c8c1255..32250919f177 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3674,7 +3674,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); - ret = btrfs_delayed_update_inode(trans, root, inode); + ret = btrfs_delayed_update_inode(trans, root, BTRFS_I(inode)); if (!ret) btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); return ret; -- cgit v1.2.3 From dfeb9e7cc3ed0d7e4a307e9c4a714c2950b2a9e4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:48:58 +0200 Subject: btrfs: make btrfs_update_inode_item take btrfs_inode Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 32250919f177..1b2bb73d1d99 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3621,7 +3621,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * copy everything in the in-memory inode into the btree. 
*/ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) + struct btrfs_root *root, + struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; struct btrfs_path *path; @@ -3632,8 +3633,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, - 1); + ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1); if (ret) { if (ret > 0) ret = -ENOENT; @@ -3644,9 +3644,9 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - fill_inode_item(trans, leaf, inode_item, inode); + fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); btrfs_mark_buffer_dirty(leaf); - btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); + btrfs_set_inode_last_trans(trans, inode); ret = 0; failed: btrfs_free_path(path); @@ -3680,7 +3680,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, return ret; } - return btrfs_update_inode_item(trans, root, inode); + return btrfs_update_inode_item(trans, root, BTRFS_I(inode)); } noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, @@ -3691,7 +3691,7 @@ noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, inode); if (ret == -ENOSPC) - return btrfs_update_inode_item(trans, root, inode); + return btrfs_update_inode_item(trans, root, BTRFS_I(inode)); return ret; } -- cgit v1.2.3 From 9a56fcd15a9c6b580a21c439deab60bb4cd2cfd9 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:48:59 +0200 Subject: btrfs: make btrfs_update_inode take btrfs_inode Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/ctree.h | 3 +-- fs/btrfs/file.c | 8 +++---- fs/btrfs/free-space-cache.c | 6 ++--- fs/btrfs/inode-map.c | 2 +- fs/btrfs/inode.c | 57 +++++++++++++++++++++++---------------------- fs/btrfs/ioctl.c | 6 ++--- fs/btrfs/reflink.c | 2 +- fs/btrfs/tree-log.c | 12 +++++----- fs/btrfs/xattr.c | 4 ++-- 10 files changed, 51 insertions(+), 51 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 35be6dbca5e8..ccc3271c20ca 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2405,7 +2405,7 @@ again: * time. 
*/ BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) { /* * So theoretically we could recover from this, simply set the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ccc2339c6568..ea443d3215d0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3112,8 +3112,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 end); int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); + struct btrfs_root *root, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 096be647c234..39beab4c6d74 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2741,7 +2741,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, cur_offset = drop_args.drop_end; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) break; @@ -2978,7 +2978,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ASSERT(trans != NULL); inode_inc_iversion(inode); inode->i_mtime = inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -3005,7 +3005,7 @@ out_only_mutex: } else { int ret2; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); if (!ret) ret = ret2; @@ -3074,7 +3074,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode, inode->i_ctime = current_time(inode); i_size_write(inode, end); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); return ret ? 
ret : ret2; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d8010df20b58..572c75d2169b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -278,7 +278,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, if (ret) goto fail; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); fail: if (locked) @@ -1193,7 +1193,7 @@ out: "failed to write free space cache for block group %llu error %d", block_group->start, ret); } - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); if (block_group) { /* the dirty list is protected by the dirty_bgs_lock */ @@ -1383,7 +1383,7 @@ out: invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); if (must_iput) iput(inode); return ret; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 76d2e43817ea..8cf39402b227 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -463,7 +463,7 @@ again: } BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); goto out_put; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1b2bb73d1d99..ed0249bdaf46 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -382,7 +382,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, } btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found); - ret = btrfs_update_inode(trans, root, &inode->vfs_inode); + ret = btrfs_update_inode(trans, root, inode); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -3657,7 +3657,8 @@ failed: * copy everything in the in-memory inode into the btree. 
*/ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) + struct btrfs_root *root, + struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -3669,18 +3670,18 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * The data relocation inode should also be directly updated * without delay */ - if (!btrfs_is_free_space_inode(BTRFS_I(inode)) + if (!btrfs_is_free_space_inode(inode) && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); - ret = btrfs_delayed_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_delayed_update_inode(trans, root, inode); if (!ret) - btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); + btrfs_set_inode_last_trans(trans, inode); return ret; } - return btrfs_update_inode_item(trans, root, BTRFS_I(inode)); + return btrfs_update_inode_item(trans, root, inode); } noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, @@ -3689,7 +3690,7 @@ noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, { int ret; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret == -ENOSPC) return btrfs_update_inode_item(trans, root, BTRFS_I(inode)); return ret; @@ -3799,7 +3800,7 @@ err: inode_inc_iversion(&dir->vfs_inode); inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, &dir->vfs_inode); + ret = btrfs_update_inode(trans, root, dir); out: return ret; } @@ -3813,7 +3814,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); if (!ret) { drop_nlink(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, &inode->vfs_inode); + ret = btrfs_update_inode(trans, root, inode); } return ret; } @@ -4836,7 +4837,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, btrfs_abort_transaction(trans, ret); } else { btrfs_update_inode_bytes(BTRFS_I(inode), 0, drop_args.bytes_found); - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); } btrfs_end_transaction(trans); return ret; @@ -4998,7 +4999,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) i_size_write(inode, newsize); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); pagecache_isize_extended(inode, oldsize, newsize); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { @@ -5910,7 +5911,7 @@ static int btrfs_dirty_inode(struct inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret && ret == -ENOSPC) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); @@ -5918,7 +5919,7 @@ static int btrfs_dirty_inode(struct inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); } btrfs_end_transaction(trans); if (BTRFS_I(inode)->delayed_node) @@ -6306,7 +6307,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, parent_inode->vfs_inode.i_mtime = now; parent_inode->vfs_inode.i_ctime = now; } - ret = 
btrfs_update_inode(trans, root, &parent_inode->vfs_inode); + ret = btrfs_update_inode(trans, root, parent_inode); if (ret) btrfs_abort_transaction(trans, ret); return ret; @@ -6397,7 +6398,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); d_instantiate_new(dentry, inode); out_unlock: @@ -6456,7 +6457,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (err) goto out_unlock; @@ -6528,7 +6529,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } else { struct dentry *parent = dentry->d_parent; - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (err) goto fail; if (inode->i_nlink == 1) { @@ -6596,7 +6597,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; btrfs_i_size_write(BTRFS_I(inode), 0); - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (err) goto out_fail; @@ -8521,7 +8522,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) if (ret != -ENOSPC && ret != -EAGAIN) break; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) break; @@ -8567,7 +8568,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) int ret2; trans->block_rsv = &fs_info->trans_block_rsv; - ret2 = btrfs_update_inode(trans, root, inode); + ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret2 && !ret) ret = ret2; @@ -8613,7 +8614,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, "error inheriting subvolume %llu properties: %d", new_root->root_key.objectid, err); - err = btrfs_update_inode(trans, new_root, inode); + err = btrfs_update_inode(trans, new_root, BTRFS_I(inode)); iput(inode); return err; @@ -8969,7 +8970,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, old_dentry->d_name.name, old_dentry->d_name.len); if (!ret) - ret = btrfs_update_inode(trans, root, old_inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -8985,7 +8986,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_dentry->d_name.name, new_dentry->d_name.len); if (!ret) - ret = btrfs_update_inode(trans, dest, new_inode); + ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -9105,7 +9106,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out: unlock_new_inode(inode); if (ret) @@ -9239,7 +9240,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_name.name, old_dentry->d_name.len); if (!ret) - ret = btrfs_update_inode(trans, root, old_inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -9599,7 +9600,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode_set_bytes(inode, name_len); btrfs_i_size_write(BTRFS_I(inode), name_len); - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode(trans, root, 
BTRFS_I(inode)); /* * Last step, add directory indexes for our symlink inode. This is the * last step to avoid extra cleanup of these indexes if an error happens @@ -9796,7 +9797,7 @@ next: btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); @@ -9892,7 +9893,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (ret) goto out; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) goto out; ret = btrfs_orphan_add(trans, BTRFS_I(inode)); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e7ccba83e8a5..a5dc7cc5d705 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -336,7 +336,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out_end_trans: btrfs_end_transaction(trans); @@ -479,7 +479,7 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); btrfs_end_transaction(trans); @@ -733,7 +733,7 @@ static noinline int create_subvol(struct inode *dir, } btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); - ret = btrfs_update_inode(trans, root, dir); + ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 952b8f33619c..be16acd87588 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -34,7 +34,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 360732a93980..955c9a36cfeb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -834,7 +834,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, update_inode: btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out: if (inode) iput(inode); @@ -1533,7 +1533,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); } ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; @@ -1708,7 +1708,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (nlink != inode->i_nlink) { set_nlink(inode, nlink); - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); } BTRFS_I(inode)->index_cnt = (u64)-1; @@ -1814,7 +1814,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, set_nlink(inode, 1); else inc_nlink(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); } else if (ret == 
-EEXIST) { ret = 0; } else { @@ -1967,7 +1967,7 @@ out: btrfs_release_path(path); if (!ret && update_size) { btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); - ret = btrfs_update_inode(trans, root, dir); + ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); } kfree(name); iput(dir); @@ -2602,7 +2602,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, drop_args.bytes_found); /* Update the inode's nbytes. */ ret = btrfs_update_inode(wc->trans, - root, inode); + root, BTRFS_I(inode)); } iput(inode); if (ret) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 95d9aebff2c4..f32fe9e39a74 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -239,7 +239,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); BUG_ON(ret); out: btrfs_end_transaction(trans); @@ -390,7 +390,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, if (!ret) { inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); BUG_ON(ret); } -- cgit v1.2.3 From a4ba6cc03eba9d6a64cb72bb487a97ef26a7d620 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:49:00 +0200 Subject: btrfs: make maybe_insert_hole take btrfs_inode Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ed0249bdaf46..732a5fab471d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4792,10 +4792,10 @@ out: return ret; } -static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, +static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, u64 offset, u64 len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_drop_extents_args drop_args = { 0 }; int ret; @@ -4805,9 +4805,9 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, * that any holes get logged if we fsync. 
*/ if (btrfs_fs_incompat(fs_info, NO_HOLES)) { - BTRFS_I(inode)->last_trans = fs_info->generation; - BTRFS_I(inode)->last_sub_trans = root->log_transid; - BTRFS_I(inode)->last_log_commit = root->last_log_commit; + inode->last_trans = fs_info->generation; + inode->last_sub_trans = root->log_transid; + inode->last_log_commit = root->last_log_commit; return 0; } @@ -4824,20 +4824,20 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, drop_args.end = offset + len; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } - ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)), + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 0, 0, len, 0, len, 0, 0, 0); if (ret) { btrfs_abort_transaction(trans, ret); } else { - btrfs_update_inode_bytes(BTRFS_I(inode), 0, drop_args.bytes_found); - btrfs_update_inode(trans, root, BTRFS_I(inode)); + btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); + btrfs_update_inode(trans, root, inode); } btrfs_end_transaction(trans); return ret; @@ -4894,8 +4894,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; - err = maybe_insert_hole(root, inode, cur_offset, - hole_size); + err = maybe_insert_hole(root, BTRFS_I(inode), + cur_offset, hole_size); if (err) break; -- cgit v1.2.3 From dea46d84a3cc25553ed8be4114bd559e8c8c55c4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:49:01 +0200 Subject: btrfs: make find_first_non_hole take btrfs_inode Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 39beab4c6d74..0abf8677bf34 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2450,13 +2450,13 @@ out: * em->start + em->len > start) * When a hole extent is found, return 1 and modify start/len. 
*/ -static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) +static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map *em; int ret = 0; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + em = btrfs_get_extent(inode, NULL, 0, round_down(*start, fs_info->sectorsize), round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) @@ -2761,7 +2761,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, trans->block_rsv = rsv; if (!extent_info) { - ret = find_first_non_hole(inode, &cur_offset, &len); + ret = find_first_non_hole(BTRFS_I(inode), &cur_offset, + &len); if (unlikely(ret < 0)) break; if (ret && !len) { @@ -2873,7 +2874,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); ino_size = round_up(inode->i_size, fs_info->sectorsize); - ret = find_first_non_hole(inode, &offset, &len); + ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) goto out_only_mutex; if (ret && !len) { @@ -2923,7 +2924,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* after truncate page, check hole again */ len = offset + len - lockstart; offset = lockstart; - ret = find_first_non_hole(inode, &offset, &len); + ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) goto out_only_mutex; if (ret && !len) { @@ -2937,7 +2938,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) tail_start = lockend + 1; tail_len = offset + len - tail_start; if (tail_len) { - ret = find_first_non_hole(inode, &tail_start, &tail_len); + ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len); if (unlikely(ret < 0)) goto out_only_mutex; if (!ret) { -- cgit v1.2.3 From 03fcb1ab6f265725f13be932c7c24ca4ccb1a703 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:49:02 +0200 Subject: btrfs: make btrfs_insert_replace_extent take btrfs_inode Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0abf8677bf34..51a61cd9fa49 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2516,14 +2516,14 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, } static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, - struct inode *inode, + struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_replace_extent_info *extent_info, const u64 replace_len, const u64 bytes_to_drop) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *extent; struct extent_buffer *leaf; struct btrfs_key key; @@ -2536,11 +2536,11 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, if (extent_info->disk_offset == 0 && btrfs_fs_incompat(fs_info, NO_HOLES)) { - btrfs_update_inode_bytes(BTRFS_I(inode), 0, bytes_to_drop); + btrfs_update_inode_bytes(inode, 0, bytes_to_drop); return 0; } - key.objectid = btrfs_ino(BTRFS_I(inode)); + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = extent_info->file_offset; ret = 
btrfs_insert_empty_item(trans, root, path, &key, @@ -2561,25 +2561,25 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), - extent_info->file_offset, replace_len); + ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, + replace_len); if (ret) return ret; /* If it's a hole, nothing more needs to be done. */ if (extent_info->disk_offset == 0) { - btrfs_update_inode_bytes(BTRFS_I(inode), 0, bytes_to_drop); + btrfs_update_inode_bytes(inode, 0, bytes_to_drop); return 0; } - btrfs_update_inode_bytes(BTRFS_I(inode), replace_len, bytes_to_drop); + btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop); if (extent_info->is_new_extent && extent_info->insertions == 0) { key.objectid = extent_info->disk_offset; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = extent_info->disk_len; ret = btrfs_alloc_reserved_file_extent(trans, root, - btrfs_ino(BTRFS_I(inode)), + btrfs_ino(inode), extent_info->file_offset, extent_info->qgroup_reserved, &key); @@ -2591,7 +2591,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, extent_info->disk_len, 0); ref_offset = extent_info->file_offset - extent_info->data_offset; btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), ref_offset); + btrfs_ino(inode), ref_offset); ret = btrfs_inc_extent_ref(trans, &ref); } @@ -2727,9 +2727,9 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, u64 replace_len = drop_args.drop_end - extent_info->file_offset; - ret = btrfs_insert_replace_extent(trans, inode, path, - extent_info, replace_len, - drop_args.bytes_found); + ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), + path, extent_info, replace_len, + drop_args.bytes_found); if (ret) { btrfs_abort_transaction(trans, ret); break; @@ -2826,9 +2826,9 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } if (extent_info) { - ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, - extent_info->data_len, - drop_args.bytes_found); + ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), path, + extent_info, extent_info->data_len, + drop_args.bytes_found); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; -- cgit v1.2.3 From 217f42eb3d321447910c45fc2bb2292aa0839fd6 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:49:03 +0200 Subject: btrfs: make btrfs_truncate_block take btrfs_inode Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/file.c | 17 ++++++++++------- fs/btrfs/inode.c | 52 +++++++++++++++++++++++++--------------------------- 3 files changed, 37 insertions(+), 36 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ea443d3215d0..cd86e15a5e16 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3068,8 +3068,8 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const char *name, int name_len, int add_backref, u64 index); int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); -int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, - int front); +int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, + int front); int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root 
*root, struct btrfs_inode *inode, u64 new_size, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 51a61cd9fa49..696755055a25 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2899,7 +2899,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (same_block && len < fs_info->sectorsize) { if (offset < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(inode, offset, len, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, + 0); } else { ret = 0; } @@ -2909,7 +2910,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* zero back part of the first block */ if (offset < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(inode, offset, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); if (ret) { inode_unlock(inode); return ret; @@ -2945,7 +2946,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* zero the front end of the last page */ if (tail_start + tail_len < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(inode, + ret = btrfs_truncate_block(BTRFS_I(inode), tail_start + tail_len, 0, 1); if (ret) @@ -3187,7 +3188,8 @@ static int btrfs_zero_range(struct inode *inode, } if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) { free_extent_map(em); - ret = btrfs_truncate_block(inode, offset, len, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, + 0); if (!ret) ret = btrfs_fallocate_update_isize(inode, offset + len, @@ -3218,7 +3220,7 @@ static int btrfs_zero_range(struct inode *inode, alloc_start = round_down(offset, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { - ret = btrfs_truncate_block(inode, offset, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); if (ret) goto out; } else { @@ -3235,7 +3237,8 @@ static int btrfs_zero_range(struct inode *inode, alloc_end = round_up(offset + len, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { - ret = btrfs_truncate_block(inode, offset + len, 0, 1); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len, + 0, 1); if (ret) goto out; } else { @@ -3355,7 +3358,7 @@ static long btrfs_fallocate(struct file *file, int mode, * need to zero out the end of the block if i_size lands in the * middle of a block. */ - ret = btrfs_truncate_block(inode, inode->i_size, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); if (ret) goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 732a5fab471d..e13442bff6f9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4655,12 +4655,12 @@ out: * This will find the block for the "from" offset and cow the block and zero the * part we want to zero. This is used with truncate and hole punching. 
*/ -int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, - int front) +int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, + int front) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct address_space *mapping = inode->i_mapping; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; @@ -4683,30 +4683,29 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, block_start = round_down(from, blocksize); block_end = block_start + blocksize - 1; - ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, - block_start, blocksize); + ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, + blocksize); if (ret < 0) { - if (btrfs_check_nocow_lock(BTRFS_I(inode), block_start, - &write_bytes) > 0) { + if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) { /* For nocow case, no need to reserve data space */ only_release_metadata = true; } else { goto out; } } - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize); + ret = btrfs_delalloc_reserve_metadata(inode, blocksize); if (ret < 0) { if (!only_release_metadata) - btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, block_start, blocksize); + btrfs_free_reserved_data_space(inode, data_reserved, + block_start, blocksize); goto out; } again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - block_start, blocksize, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); + btrfs_delalloc_release_space(inode, data_reserved, block_start, + blocksize, true); + btrfs_delalloc_release_extents(inode, blocksize); ret = -ENOMEM; goto out; } @@ -4729,7 +4728,7 @@ again: lock_extent_bits(io_tree, block_start, block_end, &cached_state); set_page_extent_mapped(page); - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), block_start); + ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { unlock_extent_cached(io_tree, block_start, block_end, &cached_state); @@ -4740,11 +4739,11 @@ again: goto again; } - clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, + clear_extent_bit(&inode->io_tree, block_start, block_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state); - ret = btrfs_set_extent_delalloc(BTRFS_I(inode), block_start, block_end, 0, + ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, @@ -4770,24 +4769,23 @@ again: unlock_extent_cached(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) - set_extent_bit(&BTRFS_I(inode)->io_tree, block_start, - block_end, EXTENT_NORESERVE, NULL, GFP_NOFS); + set_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_NORESERVE, NULL, GFP_NOFS); out_unlock: if (ret) { if (only_release_metadata) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - blocksize, true); + btrfs_delalloc_release_metadata(inode, blocksize, true); else - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, + btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); } - btrfs_delalloc_release_extents(BTRFS_I(inode), 
blocksize); + btrfs_delalloc_release_extents(inode, blocksize); unlock_page(page); put_page(page); out: if (only_release_metadata) - btrfs_check_nocow_unlock(BTRFS_I(inode)); + btrfs_check_nocow_unlock(inode); extent_changeset_free(data_reserved); return ret; } @@ -4869,7 +4867,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ - err = btrfs_truncate_block(inode, oldsize, 0, 0); + err = btrfs_truncate_block(BTRFS_I(inode), oldsize, 0, 0); if (err) return err; @@ -8553,7 +8551,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - ret = btrfs_truncate_block(inode, inode->i_size, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); if (ret) goto out; trans = btrfs_start_transaction(root, 1); -- cgit v1.2.3 From b06359a3258924403178eee9ac0c5f0482981918 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:49:04 +0200 Subject: btrfs: make btrfs_cont_expand take btrfs_inode Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/file.c | 4 ++-- fs/btrfs/inode.c | 35 +++++++++++++++++------------------ fs/btrfs/reflink.c | 2 +- 4 files changed, 21 insertions(+), 22 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index cd86e15a5e16..67923c9e9ac3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3118,7 +3118,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); -int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); +int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 696755055a25..d808928dded6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1628,7 +1628,7 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); - ret = btrfs_cont_expand(inode, oldsize, end_pos); + ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); if (ret) { current->backing_dev_info = NULL; return ret; @@ -3348,7 +3348,7 @@ static long btrfs_fallocate(struct file *file, int mode, * But that's a minor problem and won't do much harm BTW. 
*/ if (alloc_start > inode->i_size) { - ret = btrfs_cont_expand(inode, i_size_read(inode), + ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode), alloc_start); if (ret) goto out; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e13442bff6f9..6561a254710c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4847,14 +4847,14 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for * the range between oldsize and size */ -int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) +int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); u64 block_end = ALIGN(size, fs_info->sectorsize); u64 last_byte; @@ -4867,18 +4867,18 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ - err = btrfs_truncate_block(BTRFS_I(inode), oldsize, 0, 0); + err = btrfs_truncate_block(inode, oldsize, 0, 0); if (err) return err; if (size <= hole_start) return 0; - btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start, - block_end - 1, &cached_state); + btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1, + &cached_state); cur_offset = hole_start; while (1) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, + em = btrfs_get_extent(inode, NULL, 0, cur_offset, block_end - cur_offset); if (IS_ERR(em)) { err = PTR_ERR(em); @@ -4892,22 +4892,22 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; - err = maybe_insert_hole(root, BTRFS_I(inode), - cur_offset, hole_size); + err = maybe_insert_hole(root, inode, cur_offset, + hole_size); if (err) break; - err = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + err = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); if (err) break; - btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, + btrfs_drop_extent_cache(inode, cur_offset, cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); if (!hole_em) { set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); goto next; } hole_em->start = cur_offset; @@ -4927,14 +4927,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) write_unlock(&em_tree->lock); if (err != -EEXIST) break; - btrfs_drop_extent_cache(BTRFS_I(inode), - cur_offset, + btrfs_drop_extent_cache(inode, cur_offset, cur_offset + hole_size - 1, 0); } free_extent_map(hole_em); } else { - err = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + err = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); if (err) break; @@ -4982,7 +4981,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * this truncation. 
*/ btrfs_drew_write_lock(&root->snapshot_lock); - ret = btrfs_cont_expand(inode, oldsize, newsize); + ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize); if (ret) { btrfs_drew_write_unlock(&root->snapshot_lock); return ret; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index be16acd87588..ab80896315be 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -654,7 +654,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, if (destoff > inode->i_size) { const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); - ret = btrfs_cont_expand(inode, inode->i_size, destoff); + ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff); if (ret) return ret; /* -- cgit v1.2.3 From 729f7961729a94e15bcaeeb2a407efb570091ea6 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:49:06 +0200 Subject: btrfs: make btrfs_update_inode_fallback take btrfs_inode Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 15 +++++++-------- fs/btrfs/transaction.c | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 67923c9e9ac3..c0c6e79c43f9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3114,7 +3114,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode); + struct btrfs_root *root, struct btrfs_inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6561a254710c..bee9a669d172 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2771,7 +2771,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } trans->block_rsv = &inode->block_rsv; - ret = btrfs_update_inode_fallback(trans, root, &inode->vfs_inode); + ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; @@ -2835,7 +2835,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 0, 0, &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); - ret = btrfs_update_inode_fallback(trans, root, &inode->vfs_inode); + ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; @@ -3684,15 +3684,14 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, return btrfs_update_inode_item(trans, root, inode); } -noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) +int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode) { int ret; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); if (ret == -ENOSPC) - return btrfs_update_inode_item(trans, root, BTRFS_I(inode)); + return btrfs_update_inode_item(trans, root, inode); return ret; } @@ -3963,7 +3962,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); inode_inc_iversion(dir); 
dir->i_mtime = dir->i_ctime = current_time(dir); - ret = btrfs_update_inode_fallback(trans, root, dir); + ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); if (ret) btrfs_abort_transaction(trans, ret); out: diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b671ea4d80e1..0e4063651047 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1721,7 +1721,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry->d_name.len * 2); parent_inode->i_mtime = parent_inode->i_ctime = current_time(parent_inode); - ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); + ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; -- cgit v1.2.3 From 1cab5e728313c0ecdabbcaa7cc1456c66f351d49 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 5 Nov 2020 11:08:00 +0200 Subject: btrfs: merge __set_extent_bit and set_extent_bit There are only 2 direct calls to set_extent_bit outside of extent-io - in btrfs_find_new_delalloc_bytes and btrfs_truncate_block, the rest are thin wrappers around __set_extent_bit. This adds unnecessary indirection and just makes it more annoying when looking at the various extent bit manipulation functions. This patch renames __set_extent_bit to set_extent_bit effectively removing a level of indirection. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ reformat and remove __must_check ] Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 21 +++++++++++++-------- fs/btrfs/extent_io.c | 36 +++++++++++++----------------------- fs/btrfs/inode.c | 6 +++--- 3 files changed, 29 insertions(+), 34 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 5334b3772f18..45e7f0703d5b 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -168,14 +168,17 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_state **cached_state, gfp_t mask); + unsigned bits, unsigned exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask, + struct extent_changeset *changeset); int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits); static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits) { - return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS); + return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, + NULL); } static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -188,7 +191,8 @@ static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask); + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL, + mask, NULL); } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, @@ -209,7 +213,7 @@ static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, - cached_state, GFP_NOFS); + 0, NULL, cached_state, GFP_NOFS, 
NULL); } static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, @@ -217,20 +221,21 @@ static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - cached_state, GFP_NOFS); + 0, NULL, cached_state, GFP_NOFS, NULL); } static inline int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end) { - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS); + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, NULL, + GFP_NOFS, NULL); } static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, - cached_state, mask); + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, + cached_state, mask, NULL); } int find_first_extent_bit(struct extent_io_tree *tree, u64 start, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 574e5cd3d7ea..07db1af9d7d5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -961,12 +961,10 @@ static void cache_state(struct extent_state *state, * * [start, end] is inclusive This takes the tree lock. */ - -static int __must_check -__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, unsigned exclusive_bits, - u64 *failed_start, struct extent_state **cached_state, - gfp_t mask, struct extent_changeset *changeset) +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, + unsigned exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask, + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -1183,14 +1181,6 @@ out: } -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_state **cached_state, gfp_t mask) -{ - return __set_extent_bit(tree, start, end, bits, 0, NULL, cached_state, - mask, NULL); -} - - /** * convert_extent_bit - convert all bits in a given range from one bit to * another @@ -1421,15 +1411,15 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ BUG_ON(bits & EXTENT_LOCKED); - return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, - changeset); + return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, + changeset); } int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits) { - return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, - GFP_NOWAIT, NULL); + return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, + GFP_NOWAIT, NULL); } int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -1464,9 +1454,9 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u64 failed_start; while (1) { - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, - EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS, NULL); + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, + EXTENT_LOCKED, &failed_start, + cached_state, GFP_NOFS, NULL); if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; @@ -1482,8 +1472,8 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS, NULL); + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + 
&failed_start, NULL, GFP_NOFS, NULL); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bee9a669d172..5b9815b2c3ba 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2340,8 +2340,8 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, ret = set_extent_bit(&inode->io_tree, search_start, search_start + em_len - 1, - EXTENT_DELALLOC_NEW, - NULL, cached_state, GFP_NOFS); + EXTENT_DELALLOC_NEW, 0, NULL, cached_state, + GFP_NOFS, NULL); next: search_start = extent_map_end(em); free_extent_map(em); @@ -4769,7 +4769,7 @@ again: if (only_release_metadata) set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, NULL, GFP_NOFS); + EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL); out_unlock: if (ret) { -- cgit v1.2.3 From f2f121ab500d0457cc9c6f54269d21ffdf5bd304 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 13 Nov 2020 11:21:49 +0000 Subject: btrfs: skip unnecessary searches for xattrs when logging an inode Every time we log an inode we look up in the fs/subvol tree for xattrs and if we have any, log them into the log tree. However it is very common to have inodes without any xattrs, so doing the search wastes time, but more importantly it adds contention on the fs/subvol tree locks, either making the logging code block and wait for tree locks or making other concurrent operations block and wait. The most typical use cases where xattrs are used are when capabilities or ACLs are defined for an inode, or when SELinux is enabled. This change makes the logging code detect when an inode does not have xattrs and skip the xattr search the next time the inode is logged, unless the inode is evicted and loaded again or an xattr is added to the inode. Therefore we skip the search for xattrs on inodes that don't ever have xattrs and are fsynced with some frequency.
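To make the caching scheme concrete, here is a small, self-contained userspace model of the pattern (illustrative only, not btrfs code: struct model_inode, setxattr(), log_all_xattrs() and tree_searches are made-up names; the boolean plays the role of the BTRFS_INODE_NO_XATTRS bit introduced by the patch below):

#include <stdbool.h>
#include <stdio.h>

struct model_inode {
	bool no_xattrs;		/* plays the role of BTRFS_INODE_NO_XATTRS */
	int nr_xattrs;
};

static int tree_searches;	/* counts the simulated fs/subvol tree searches */

static void log_all_xattrs(struct model_inode *inode)
{
	if (inode->no_xattrs)
		return;				/* skip the search entirely */
	tree_searches++;			/* stands in for the b-tree walk */
	if (inode->nr_xattrs == 0)
		inode->no_xattrs = true;	/* cache the negative result */
}

static void setxattr(struct model_inode *inode)
{
	inode->nr_xattrs++;
	inode->no_xattrs = false;		/* cached result is now stale */
}

int main(void)
{
	struct model_inode ino = { 0 };
	int i;

	for (i = 0; i < 4; i++)
		log_all_xattrs(&ino);		/* only the first call searches */
	setxattr(&ino);
	log_all_xattrs(&ino);			/* searches again after the xattr was added */
	printf("tree searches: %d\n", tree_searches);	/* prints 2 */
	return 0;
}

The kernel change below follows the same shape: it skips the search when the runtime flag is set, sets the flag when a logging pass finds no xattr items, and clears it in btrfs_setxattr() so a newly added xattr is never missed.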
The following script that calls dbench was used to measure the impact of this change on a VM with 8 CPUs, 16Gb of ram, using a raw NVMe device directly (no intermediary filesystem on the host) and using a non-debug kernel (default configuration on Debian distributions): $ cat test.sh #!/bin/bash DEV=/dev/sdk MNT=/mnt/sdk MOUNT_OPTIONS="-o ssd" mkfs.btrfs -f -m single -d single $DEV mount $MOUNT_OPTIONS $DEV $MNT dbench -D $MNT -t 200 40 umount $MNT The results before this change: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 5761605 0.172 312.057 Close 4232452 0.002 10.927 Rename 243937 1.406 277.344 Unlink 1163456 0.631 298.402 Deltree 160 11.581 221.107 Mkdir 80 0.003 0.005 Qpathinfo 5221410 0.065 122.309 Qfileinfo 915432 0.001 3.333 Qfsinfo 957555 0.003 3.992 Sfileinfo 469244 0.023 20.494 Find 2018865 0.448 123.659 WriteX 2874851 0.049 118.529 ReadX 9030579 0.004 21.654 LockX 18754 0.003 4.423 UnlockX 18754 0.002 0.331 Flush 403792 10.944 359.494 Throughput 908.444 MB/sec 40 clients 40 procs max_latency=359.500 ms The results after this change: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 6442521 0.159 230.693 Close 4732357 0.002 10.972 Rename 272809 1.293 227.398 Unlink 1301059 0.563 218.500 Deltree 160 7.796 54.887 Mkdir 80 0.008 0.478 Qpathinfo 5839452 0.047 124.330 Qfileinfo 1023199 0.001 4.996 Qfsinfo 1070760 0.003 5.709 Sfileinfo 524790 0.033 21.765 Find 2257658 0.314 125.611 WriteX 3211520 0.040 232.135 ReadX 10098969 0.004 25.340 LockX 20974 0.003 1.569 UnlockX 20974 0.002 3.475 Flush 451553 10.287 331.037 Throughput 1011.77 MB/sec 40 clients 40 procs max_latency=331.045 ms +10.8% throughput, -8.2% max latency Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 7 +++++++ fs/btrfs/tree-log.c | 8 ++++++++ fs/btrfs/xattr.c | 4 +++- 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b4c09a12659c..555cbcef6585 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -35,6 +35,13 @@ enum { BTRFS_INODE_IN_DELALLOC_LIST, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, + /* + * Set and used when logging an inode and it serves to signal that an + * inode does not have xattrs, so subsequent fsyncs can avoid searching + * for xattrs to log. This bit must be cleared whenever a xattr is added + * to an inode. 
+ BTRFS_INODE_NO_XATTRS, }; /* in memory btrfs inode */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 955c9a36cfeb..4b8f190c39bb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4573,6 +4573,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, const u64 ino = btrfs_ino(inode); int ins_nr = 0; int start_slot = 0; + bool found_xattrs = false; + + if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) + return 0; key.objectid = ino; key.type = BTRFS_XATTR_ITEM_KEY; @@ -4611,6 +4615,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, start_slot = slot; ins_nr++; path->slots[0]++; + found_xattrs = true; cond_resched(); } if (ins_nr > 0) { @@ -4620,6 +4625,9 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, return ret; } + if (!found_xattrs) + set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); + return 0; } diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index f32fe9e39a74..af6246f36a9e 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -213,9 +213,11 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, } out: btrfs_free_path(path); - if (!ret) + if (!ret) { set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags); + } return ret; } -- cgit v1.2.3 From bc5b5b1e5111005363094da1d5f5ffb0e83165f1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 13 Nov 2020 11:23:30 +0000 Subject: btrfs: stop incrementing log batch when joining log transaction When joining a log transaction we acquire the root's log mutex, then increment the root's log batch and log writers counters while holding the mutex. However we don't need to increment the log batch there, because we are holding the mutex and incremented the log writers counter as well, so any other task trying to sync the log will wait for the current task to finish its logging and still achieve the desired log batching. Since the log batch counter is an atomic counter and is incremented twice at the very beginning of the fsync callback (btrfs_sync_file()), once before flushing delalloc and once again after waiting for writeback to complete, eliminating its increment when joining the log transaction may provide some performance gains in case we have multiple concurrent tasks doing fsyncs against different files in the same subvolume, as it reduces contention on the atomic (locking the cacheline and bouncing it). When testing fio with 32 jobs, on an 8-core VM, doing fsyncs against different files of the same subvolume, on top of a zram device, I could consistently see gains (higher throughput) between 1% and 2%, which is a very low value and possibly hard to observe with a real device (I couldn't observe consistent gains with my low/mid end NVMe device). So this change is mostly motivated by simplifying the logic, as updating the log batch counter is only relevant when an fsync starts and while not holding the root's log mutex.
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4b8f190c39bb..80cf496226c2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -172,7 +172,6 @@ static int start_log_trans(struct btrfs_trans_handle *trans, root->log_start_pid = current->pid; } - atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); if (ctx && !ctx->logging_new_name) { int index = root->log_transid % 2; -- cgit v1.2.3 From f30bed83426c5cb9fce6cabb3f7cc5a9d5428fcc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 13 Nov 2020 11:24:17 +0000 Subject: btrfs: remove unnecessary attempt to drop extent maps after adding inline extent At inode.c:cow_file_range_inline(), after we insert the inline extent in the fs/subvolume btree, we call btrfs_drop_extent_cache() to drop all extent maps in the file range, however that is not necessary because we have already done it in the call to btrfs_drop_extents(), which calls btrfs_drop_extent_cache() for us, and since at this point we have the file range locked in the inode's iotree (we are in the writeback path), we know no other task can come in and read stale file extent items or find none and therefore create either stale extent maps or an extent map that represents a hole. So just remove that unnecessary call to btrfs_drop_extent_cache(), as it's doing nothing and only wasting time. This call has been around since 2008, introduced in commit c8b978188c9a ("Btrfs: Add zlib compression support"), but even back then it seems it was not necessary, since we had the range locked in the inode's iotree and the call to btrfs_drop_extents() already used to always call btrfs_drop_extent_cache(). Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b9815b2c3ba..abc0fd162f6c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -392,7 +392,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, } set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); - btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: /* * Don't forget to free the reserved space, as for inlined extent -- cgit v1.2.3 From d5286a92ea76a124d6cfa1e261394da858125d99 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 12 Nov 2020 13:24:02 +0200 Subject: btrfs: remove useless return value statement in split_node At the point when we set 'ret = 0' it's guaranteed that the function is going to return 0 so directly return 0. No functional changes. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 778a6d580545..60135f6e6441 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3459,7 +3459,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); btrfs_set_header_nritems(split, c_nritems - mid); btrfs_set_header_nritems(c, mid); - ret = 0; btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); @@ -3477,7 +3476,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_tree_unlock(split); free_extent_buffer(split); } - return ret; + return 0; } /* -- cgit v1.2.3 From 95b982de37473a99add60c9e68b12d54354f1da7 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 13 Nov 2020 09:29:40 +0200 Subject: btrfs: simplify return values in setup_nodes_for_search The function is needlessly convoluted. Fix that by: * removing redundant sret variable definition in both if arms * replace the again/done labels with direct return statements, the function is short enough and doesn't do anything special upon exit * remove BUG_ON on split_node returning a positive number - it can't happen as split_node returns either 0 or a negative error code. Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 60135f6e6441..e5a0941c4bde 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2387,56 +2387,42 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, int *write_lock_level) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; + int ret = 0; if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) { - int sret; if (*write_lock_level < level + 1) { *write_lock_level = level + 1; btrfs_release_path(p); - goto again; + return -EAGAIN; } reada_for_balance(p, level); - sret = split_node(trans, root, p, level); + ret = split_node(trans, root, p, level); - BUG_ON(sret > 0); - if (sret) { - ret = sret; - goto done; - } b = p->nodes[level]; } else if (ins_len < 0 && btrfs_header_nritems(b) < BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) { - int sret; if (*write_lock_level < level + 1) { *write_lock_level = level + 1; btrfs_release_path(p); - goto again; + return -EAGAIN; } reada_for_balance(p, level); - sret = balance_level(trans, root, p, level); + ret = balance_level(trans, root, p, level); + if (ret) + return ret; - if (sret) { - ret = sret; - goto done; - } b = p->nodes[level]; if (!b) { btrfs_release_path(p); - goto again; + return -EAGAIN; } BUG_ON(btrfs_header_nritems(b) == 1); } - return 0; - -again: - ret = -EAGAIN; -done: return ret; } -- cgit v1.2.3 From 445d8ab53f69f4c4062b668c6a25b88a79753136 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 1 Nov 2020 07:30:08 -0800 Subject: btrfs: sysfs: remove unneeded semicolon A semicolon is not needed after a switch statement. 
Signed-off-by: Tom Rix Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 640d17f6ace1..293338153c20 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1311,7 +1311,7 @@ static const char *alloc_name(u64 flags) default: WARN_ON(1); return "invalid-combination"; - }; + } } /* -- cgit v1.2.3 From b1d51f67c942a9254d7b69ebf5ab8329cb5c2b8c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 Nov 2020 20:51:27 +0800 Subject: btrfs: tests: remove invalid extent-io test In extent-io-test, there are two invalid tests: - Invalid nodesize for test_eb_bitmaps() Instead of the sectorsize and nodesize combination passed in, we're always using hand-crafted nodesize, e.g: len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE) ? sectorsize * 4 : sectorsize; In above case, if we have 32K page size, then we will get a length of 128K, which is beyond max node size, and obviously invalid. The common page size goes up to 64K so we haven't hit that - Invalid extent buffer bytenr For 64K page size, the only combination we're going to test is sectorsize = nodesize = 64K. However, in that case we will try to test an eb which bytenr is not sectorsize aligned: /* Do it over again with an extent buffer which isn't page-aligned. */ eb = __alloc_dummy_extent_buffer(fs_info, nodesize / 2, len); Sector alignment is a hard requirement for any sector size. The only exception is superblock. But anything else should follow sector size alignment. This is definitely an invalid test case. This patch will fix both problems by: - Honor the sectorsize/nodesize combination Now we won't bother to hand-craft the length and use it as nodesize. - Use sectorsize as the 2nd run extent buffer start This would test the case where extent buffer is aligned to sectorsize but not always aligned to nodesize. Please note that, later subpage related cleanup will reduce extent_buffer::pages[] to exactly what we need, making the sector unaligned extent buffer operations cause problems. Since only extent_io self tests utilize this, this patch is required for all later cleanup/refactoring. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tests/extent-io-tests.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index df7ce874a74b..73e96d505f4f 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -379,54 +379,50 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) { struct btrfs_fs_info *fs_info; - unsigned long len; unsigned long *bitmap = NULL; struct extent_buffer *eb = NULL; int ret; test_msg("running extent buffer bitmap tests"); - /* - * In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than - * BTRFS_MAX_METADATA_BLOCKSIZE. - */ - len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE) - ? 
sectorsize * 4 : sectorsize; - - fs_info = btrfs_alloc_dummy_fs_info(len, len); + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { test_std_err(TEST_ALLOC_FS_INFO); return -ENOMEM; } - bitmap = kmalloc(len, GFP_KERNEL); + bitmap = kmalloc(nodesize, GFP_KERNEL); if (!bitmap) { test_err("couldn't allocate test bitmap"); ret = -ENOMEM; goto out; } - eb = __alloc_dummy_extent_buffer(fs_info, 0, len); + eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; goto out; } - ret = __test_eb_bitmaps(bitmap, eb, len); + ret = __test_eb_bitmaps(bitmap, eb, nodesize); if (ret) goto out; - /* Do it over again with an extent buffer which isn't page-aligned. */ free_extent_buffer(eb); - eb = __alloc_dummy_extent_buffer(fs_info, nodesize / 2, len); + + /* + * Test again for case where the tree block is sectorsize aligned but + * not nodesize aligned. + */ + eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; goto out; } - ret = __test_eb_bitmaps(bitmap, eb, len); + ret = __test_eb_bitmaps(bitmap, eb, nodesize); out: free_extent_buffer(eb); kfree(bitmap); -- cgit v1.2.3 From 94e8c95ccba8bc25b6385b8c2ba1b9cd90e86de6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 Nov 2020 20:51:28 +0800 Subject: btrfs: add structure to keep track of extent range in end_bio_extent_readpage In end_bio_extent_readpage() we had a strange dance around extent_start/extent_len. Hidden behind the strange dance is that it's just calling endio_readpage_release_extent() on each bvec range. Here is an example to explain the original workflow: Bio is for inode 257, containing 2 pages, for range [1M, 1M+8K) end_bio_extent_readpage() entered |- extent_start = 0; |- extent_len = 0; |- bio_for_each_segment_all() { | |- /* Got the 1st bvec */ | |- start = SZ_1M; | |- end = SZ_1M + SZ_4K - 1; | |- update = 1; | |- if (extent_len == 0) { | | |- extent_start = start; /* SZ_1M */ | | |- extent_len = end + 1 - start; /* SZ_4K */ | | } | | | |- /* Got the 2nd bvec */ | |- start = SZ_1M + 4K; | |- end = SZ_1M + 8K - 1; | |- update = 1; | |- if (extent_start + extent_len == start) { | | |- extent_len += end + 1 - start; /* SZ_8K */ | | } | } /* All bio vec iterated */ | |- if (extent_len) { |- endio_readpage_release_extent(tree, extent_start, extent_len, update); /* extent_start == SZ_1M, extent_len == SZ_8K, uptodate = 1 */ As the above flow shows, the existing code in end_bio_extent_readpage() accumulates extent_start/extent_len and, when the contiguous range stops, calls endio_readpage_release_extent() for the range. However, the current behavior does not account for several things: - The inode can change For a bio, its pages don't need to have contiguous page_offset. This means even pages from different inodes can be packed into one bio. - A bvec can cross a page boundary There is a feature called multi-page bvec, where bvec->bv_len can go beyond the bvec->bv_page boundary. - Poor readability This patch addresses these problems: - Introduce a proper structure, processed_extent, to record the processed extent range - Integrate the inode/start/end/uptodate checks into endio_readpage_release_extent() - Add more comments on each step. This should greatly improve the readability; now in end_bio_extent_readpage() there are only two endio_readpage_release_extent() calls. - Add an inode check for contiguity Now we also ensure the inode is the same one before checking if the range is contiguous.
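To make the accumulate-and-release idea described above concrete, here is a standalone userspace sketch (simplified names and types, not the kernel implementation; the real code also tolerates overlapping ranges and handles the track_uptodate case):

	#include <stdio.h>

	struct demo_processed {
		int inode_id;		/* 0 means nothing accumulated yet */
		unsigned long start;
		unsigned long end;
		int uptodate;
	};

	/* Stand-in for the real release: would unlock the extent range. */
	static void demo_release(const struct demo_processed *p)
	{
		if (p->inode_id)
			printf("release ino %d [%lu, %lu] uptodate=%d\n",
			       p->inode_id, p->start, p->end, p->uptodate);
	}

	/* Merge a new range if contiguous and compatible, otherwise flush first. */
	static void demo_account(struct demo_processed *p, int inode_id,
				 unsigned long start, unsigned long end, int uptodate)
	{
		if (p->inode_id == inode_id && p->uptodate == uptodate &&
		    p->end + 1 == start) {
			p->end = end;
			return;
		}
		demo_release(p);
		p->inode_id = inode_id;
		p->start = start;
		p->end = end;
		p->uptodate = uptodate;
	}

	int main(void)
	{
		struct demo_processed p = { 0 };

		demo_account(&p, 257, 1048576, 1052671, 1);	/* [1M, 1M+4K) */
		demo_account(&p, 257, 1052672, 1056767, 1);	/* merges into [1M, 1M+8K) */
		demo_release(&p);				/* flush the final range */
		return 0;
	}

With this shape, the two contiguous bvecs above produce a single release, which is why the rewritten end_bio_extent_readpage() ends up with only two endio_readpage_release_extent() call sites.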
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 107 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 73 insertions(+), 34 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 07db1af9d7d5..519f74502a3e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2769,16 +2769,77 @@ static void end_bio_extent_writepage(struct bio *bio) bio_put(bio); } -static void -endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, - int uptodate) +/* + * Record previously processed extent range + * + * For endio_readpage_release_extent() to handle a full extent range, reducing + * the extent io operations. + */ +struct processed_extent { + struct btrfs_inode *inode; + /* Start of the range in @inode */ + u64 start; + /* End of the range in in @inode */ + u64 end; + bool uptodate; +}; + +/* + * Try to release processed extent range + * + * May not release the extent range right now if the current range is + * contiguous to processed extent. + * + * Will release processed extent when any of @inode, @uptodate, the range is + * no longer contiguous to the processed range. + * + * Passing @inode == NULL will force processed extent to be released. + */ +static void endio_readpage_release_extent(struct processed_extent *processed, + struct btrfs_inode *inode, u64 start, u64 end, + bool uptodate) { struct extent_state *cached = NULL; - u64 end = start + len - 1; + struct extent_io_tree *tree; + + /* The first extent, initialize @processed */ + if (!processed->inode) + goto update; - if (uptodate && tree->track_uptodate) - set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(tree, start, end, &cached); + /* + * Contiguous to processed extent, just uptodate the end. + * + * Several things to notice: + * + * - bio can be merged as long as on-disk bytenr is contiguous + * This means we can have page belonging to other inodes, thus need to + * check if the inode still matches. + * - bvec can contain range beyond current page for multi-page bvec + * Thus we need to do processed->end + 1 >= start check + */ + if (processed->inode == inode && processed->uptodate == uptodate && + processed->end + 1 >= start && end >= processed->end) { + processed->end = end; + return; + } + + tree = &processed->inode->io_tree; + /* + * Now we don't have range contiguous to the processed range, release + * the processed range now. 
+ */ + if (processed->uptodate && tree->track_uptodate) + set_extent_uptodate(tree, processed->start, processed->end, + &cached, GFP_ATOMIC); + unlock_extent_cached_atomic(tree, processed->start, processed->end, + &cached); + +update: + /* Update processed to current range */ + processed->inode = inode; + processed->start = start; + processed->end = end; + processed->uptodate = uptodate; } /* @@ -2798,12 +2859,11 @@ static void end_bio_extent_readpage(struct bio *bio) int uptodate = !bio->bi_status; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree, *failure_tree; + struct processed_extent processed = { 0 }; u64 offset = 0; u64 start; u64 end; u64 len; - u64 extent_start = 0; - u64 extent_len = 0; int mirror; int ret; struct bvec_iter_all iter_all; @@ -2912,32 +2972,11 @@ readpage_ok: unlock_page(page); offset += len; - if (unlikely(!uptodate)) { - if (extent_len) { - endio_readpage_release_extent(tree, - extent_start, - extent_len, 1); - extent_start = 0; - extent_len = 0; - } - endio_readpage_release_extent(tree, start, - end - start + 1, 0); - } else if (!extent_len) { - extent_start = start; - extent_len = end + 1 - start; - } else if (extent_start + extent_len == start) { - extent_len += end + 1 - start; - } else { - endio_readpage_release_extent(tree, extent_start, - extent_len, uptodate); - extent_start = start; - extent_len = end + 1 - start; - } + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, uptodate); } - - if (extent_len) - endio_readpage_release_extent(tree, extent_start, extent_len, - uptodate); + /* Release the last extent */ + endio_readpage_release_extent(&processed, NULL, 0, 0, false); btrfs_io_bio_free_csum(io_bio); bio_put(bio); } -- cgit v1.2.3 From e09caaf913a9d3d7fc892c0b5a85e6b2ec3728dc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 Nov 2020 20:51:29 +0800 Subject: btrfs: introduce helper to handle page status update in end_bio_extent_readpage() Introduce a new helper to handle update page status in end_bio_extent_readpage(). This will be later used for subpage support where the page status update can be more complex than now. 
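The patch itself only factors out the existing logic; purely as a hypothetical sketch of what "more complex than now" could mean once a page hosts several sectors, a subpage-aware variant might track per-sector completion and only flip the page flag when every sector is done (illustrative names, not kernel code):

	/* Hypothetical per-page tracking; assumes nr_sectors < BITS_PER_LONG. */
	struct demo_subpage {
		unsigned long uptodate_bitmap;	/* one bit per sector in the page */
		unsigned int nr_sectors;
	};

	/* Mark one sector as read, return 1 once the whole page is uptodate. */
	static int demo_sector_uptodate(struct demo_subpage *sp, unsigned int sector)
	{
		sp->uptodate_bitmap |= 1UL << sector;
		return sp->uptodate_bitmap == (1UL << sp->nr_sectors) - 1;
	}

Having a single helper for the page status update gives such a change one obvious place to land.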
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 519f74502a3e..4bc6beaff562 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2842,6 +2842,17 @@ update: processed->uptodate = uptodate; } +static void endio_readpage_update_page_status(struct page *page, bool uptodate) +{ + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); +} + /* * after a readpage IO is done, we need to: * clear the uptodate bits on error @@ -2964,14 +2975,11 @@ readpage_ok: off = offset_in_page(i_size); if (page->index == end_index && off) zero_user_segment(page, off, PAGE_SIZE); - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); } - unlock_page(page); offset += len; + /* Update page status and unlock */ + endio_readpage_update_page_status(page, uptodate); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); } -- cgit v1.2.3 From f97e27e91d90243e1144030ea1dcbf4154872944 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 Nov 2020 20:51:40 +0800 Subject: btrfs: use fixed width int type for extent_state::state Currently the type is unsigned int which could change its width depending on the architecture. We need up to 32 bits so make it explicit. Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 33 +++++++++++++++--------------- fs/btrfs/extent_io.c | 52 +++++++++++++++++++++++------------------------ fs/btrfs/extent_io.h | 3 +-- 3 files changed, 42 insertions(+), 46 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 45e7f0703d5b..04083ee5ae6e 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -87,7 +87,7 @@ struct extent_state { /* ADD NEW ELEMENTS AFTER THIS */ wait_queue_head_t wq; refcount_t refs; - unsigned state; + u32 state; struct io_failure_record *failrec; @@ -119,19 +119,18 @@ void __cold extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, unsigned bits, int contig); + u64 max_bytes, u32 bits, int contig); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int filled, - struct extent_state *cached_state); + u32 bits, int filled, struct extent_state *cached_state); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_changeset *changeset); + u32 bits, struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, + u32 bits, int wake, int delete, struct extent_state **cached); int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, + u32 bits, int wake, int delete, struct extent_state **cached, gfp_t mask, struct extent_changeset *changeset); @@ -155,7 +154,7 @@ static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree, } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits) + u64 end, u32 bits) { int wake = 0; @@ -166,16 +165,16 @@ static inline int clear_extent_bits(struct 
extent_io_tree *tree, u64 start, } int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_changeset *changeset); + u32 bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, unsigned exclusive_bits, u64 *failed_start, + u32 bits, unsigned exclusive_bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask, struct extent_changeset *changeset); int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits); + u32 bits); static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits) + u64 end, u32 bits) { return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, NULL); @@ -204,11 +203,11 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, unsigned clear_bits, + u32 bits, u32 clear_bits, struct extent_state **cached_state); static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, unsigned int extra_bits, + u64 end, u32 extra_bits, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, @@ -239,12 +238,12 @@ static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, } int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits, + u64 *start_ret, u64 *end_ret, u32 bits, struct extent_state **cached_state); void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits); + u64 *start_ret, u64 *end_ret, u32 bits); int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits); + u64 *start_ret, u64 *end_ret, u32 bits); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4bc6beaff562..97c759998f38 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -142,7 +142,7 @@ struct extent_page_data { unsigned int sync_io:1; }; -static int add_extent_changeset(struct extent_state *state, unsigned bits, +static int add_extent_changeset(struct extent_state *state, u32 bits, struct extent_changeset *changeset, int set) { @@ -530,7 +530,7 @@ static void merge_state(struct extent_io_tree *tree, } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits, + struct extent_state *state, u32 *bits, struct extent_changeset *changeset); /* @@ -547,7 +547,7 @@ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, struct rb_node ***p, struct rb_node **parent, - unsigned *bits, struct extent_changeset *changeset) + u32 *bits, struct extent_changeset *changeset) { struct rb_node *node; @@ -628,11 +628,11 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits, int wake, + u32 *bits, int wake, struct extent_changeset *changeset) { struct extent_state *next; - unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; + u32 bits_to_clear = *bits & ~EXTENT_CTLBITS; int ret; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { @@ -695,9 +695,9 @@ 
static void extent_io_tree_panic(struct extent_io_tree *tree, int err) * This takes the tree lock, and returns 0 on success and < 0 on error. */ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, - struct extent_state **cached_state, - gfp_t mask, struct extent_changeset *changeset) + u32 bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask, struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; @@ -868,7 +868,7 @@ static void wait_on_state(struct extent_io_tree *tree, * The tree lock is taken by this function */ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits) + u32 bits) { struct extent_state *state; struct rb_node *node; @@ -915,9 +915,9 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits, struct extent_changeset *changeset) + u32 *bits, struct extent_changeset *changeset) { - unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; + u32 bits_to_set = *bits & ~EXTENT_CTLBITS; int ret; if (tree->private_data && is_data_inode(tree->private_data)) @@ -961,8 +961,8 @@ static void cache_state(struct extent_state *state, * * [start, end] is inclusive This takes the tree lock. */ -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, - unsigned exclusive_bits, u64 *failed_start, +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + u32 exclusive_bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask, struct extent_changeset *changeset) { @@ -1200,7 +1200,7 @@ out: * All allocations are done with GFP_NOFS. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, unsigned clear_bits, + u32 bits, u32 clear_bits, struct extent_state **cached_state) { struct extent_state *state; @@ -1401,7 +1401,7 @@ out: /* wrappers around set/clear extent bit */ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_changeset *changeset) + u32 bits, struct extent_changeset *changeset) { /* * We don't support EXTENT_LOCKED yet, as current changeset will @@ -1416,14 +1416,14 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, } int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits) + u32 bits) { return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOWAIT, NULL); } int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, + u32 bits, int wake, int delete, struct extent_state **cached) { return __clear_extent_bit(tree, start, end, bits, wake, delete, @@ -1431,7 +1431,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_changeset *changeset) + u32 bits, struct extent_changeset *changeset) { /* * Don't support EXTENT_LOCKED case, same reason as @@ -1519,8 +1519,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) * nothing was found after 'start' */ static struct extent_state * -find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, unsigned bits) +find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits) { struct rb_node *node; struct extent_state *state; @@ -1555,7 +1554,7 @@ out: * Return 1 if we found nothing. 
*/ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits, + u64 *start_ret, u64 *end_ret, u32 bits, struct extent_state **cached_state) { struct extent_state *state; @@ -1606,7 +1605,7 @@ out: * returned will be the full contiguous area with the bits set. */ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits) + u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; int ret = 1; @@ -1643,7 +1642,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, * trim @end_ret to the appropriate size. */ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits) + u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; struct rb_node *node, *prev = NULL, *next; @@ -2014,8 +2013,7 @@ out: void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, - unsigned clear_bits, - unsigned long page_ops) + u32 clear_bits, unsigned long page_ops) { clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); @@ -2031,7 +2029,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - unsigned bits, int contig) + u32 bits, int contig) { struct rb_node *node; struct extent_state *state; @@ -2151,7 +2149,7 @@ out: * range is found set. */ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int filled, struct extent_state *cached) + u32 bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index cf95e9e4acb9..66762c3cdf81 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -255,8 +255,7 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, - unsigned bits_to_clear, - unsigned long page_ops); + u32 bits_to_clear, unsigned long page_ops); struct bio *btrfs_bio_alloc(u64 first_byte); struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); -- cgit v1.2.3 From 2c36395430b0443751cf78903e3c3a37cae0f8d1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 Nov 2020 20:51:44 +0800 Subject: btrfs: scrub: remove the anonymous structure from scrub_page That anonymous structure serve no special purpose, just replace it with regular members. 
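A small standalone illustration (hypothetical struct names, not the scrub code): since C11 anonymous struct members are accessed as if they belonged to the enclosing struct, dropping the wrapper changes only the declaration, never the call sites:

	struct demo_wrapped {
		struct {			/* anonymous struct wrapper */
			unsigned int mirror_num:8;
			unsigned int have_csum:1;
		};
	};

	struct demo_flat {
		unsigned char mirror_num;	/* plain members instead */
		unsigned int have_csum:1;
	};

	static void demo_use(struct demo_wrapped *a, struct demo_flat *b)
	{
		a->mirror_num = 2;		/* identical access syntax ... */
		b->mirror_num = 2;		/* ... with or without the wrapper */
	}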
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 74acb7ca6eda..cb50e4cb97d5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -71,11 +71,9 @@ struct scrub_page { u64 physical; u64 physical_for_dev_replace; atomic_t refs; - struct { - unsigned int mirror_num:8; - unsigned int have_csum:1; - unsigned int io_error:1; - }; + u8 mirror_num; + int have_csum:1; + int io_error:1; u8 csum[BTRFS_CSUM_SIZE]; struct scrub_recover *recover; -- cgit v1.2.3 From 8e1dc982ed5fa52596504054a0713bf5acbe19f0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 12 Nov 2020 16:47:57 +0800 Subject: btrfs: remove unused parameter phy_offset from btrfs_validate_metadata_buffer Parameter @phy_offset is the offset against the bio->bi_iter.bi_sector. @phy_offset is mostly for data io to lookup the csum in btrfs_io_bio. But for metadata, it's completely useless as metadata stores their own csum in its header, so we can remove it. Note: parameters @start and @end, they are not utilized at all for current sectorsize == PAGE_SIZE case, as we can grab eb directly from page. But those two parameters are very important for later subpage support, thus @start/@len are not touched here. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/disk-io.h | 2 +- fs/btrfs/extent_io.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8c87a1caefff..37b2e0aad162 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -593,7 +593,7 @@ out: return ret; } -int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, struct page *page, u64 start, u64 end, int mirror) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index e75ea6092942..40e81d6e481e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -79,7 +79,7 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, struct page *page, u64 start, u64 end, int mirror); blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 97c759998f38..67faa7797d89 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2919,7 +2919,7 @@ static void end_bio_extent_readpage(struct bio *bio) start, end, mirror); else ret = btrfs_validate_metadata_buffer(io_bio, - offset, page, start, end, mirror); + page, start, end, mirror); if (ret) uptodate = 0; else -- cgit v1.2.3 From 829ddec922e51ad2740f16646ce701314d9aa509 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 Nov 2020 20:51:39 +0800 Subject: btrfs: only clear EXTENT_LOCK bit in extent_invalidatepage extent_invalidatepage() will try to clear all possible bits since it's calling clear_extent_bit() with delete == 1. This is currently fine, since for btree io tree, it only utilizes EXTENT_LOCK bit. 
But this could be a problem for later subpage support, which will utilize extra io tree bit to represent additional info. This patch will just convert that clear_extent_bit() to unlock_extent_cached(). For current code since only EXTENT_LOCKED bit is utilized, this doesn't change the behavior, but provides a much cleaner basis for incoming subpage support. Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 67faa7797d89..cdccfdbbe3aa 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4431,14 +4431,22 @@ int extent_invalidatepage(struct extent_io_tree *tree, u64 end = start + PAGE_SIZE - 1; size_t blocksize = page->mapping->host->i_sb->s_blocksize; + /* This function is only called for the btree inode */ + ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); + start += ALIGN(offset, blocksize); if (start > end) return 0; lock_extent_bits(tree, start, end, &cached_state); wait_on_page_writeback(page); - clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 1, 1, &cached_state); + + /* + * Currently for btree io tree, only EXTENT_LOCKED is utilized, + * so here we only need to unlock the extent range to free any + * existing extent state. + */ + unlock_extent_cached(tree, start, end, &cached_state); return 0; } -- cgit v1.2.3 From 35478d053ade437cc51c7e576108bef2fec32c1e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 Nov 2020 20:51:41 +0800 Subject: btrfs: use nodesize to determine if we need readahead in btrfs_lookup_bio_sums In btrfs_lookup_bio_sums() if the bio is pretty large, we want to start readahead in the csum tree. However the threshold is an immediate number, (PAGE_SIZE * 8), from the initial btrfs merge. The meaning of the value is pretty hard to guess, especially when the immediate number is from the times when 4K sectorsize was the default and only CRC32C was supported. For the most common btrfs setup, CRC32 csum and 4K sectorsize, it means just 32K read would kick readahead, while the csum itself is only 32 bytes in size. Now let's be more reasonable by taking both csum size and node size into consideration. If the csum size for the bio is larger than one leaf, then we kick the readahead. This means for current default btrfs, the threshold will be 16M. This change should not change performance observably, thus this is mostly a readability enhancement. Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 1a5651bebbaf..6086e4cff203 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -297,7 +297,11 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, csum = dst; } - if (bio->bi_iter.bi_size > PAGE_SIZE * 8) + /* + * If requested number of sectors is larger than one leaf can contain, + * kick the readahead for csum tree. 
+ */ + if (nblocks > fs_info->csums_per_leaf) path->reada = READA_FORWARD; /* -- cgit v1.2.3 From fb22e9c4cd57e67aa9d62c8bbde5192349dc584a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 Nov 2020 20:51:49 +0800 Subject: btrfs: use detach_page_private() in alloc_extent_buffer() In alloc_extent_buffer(), after we get a page from the btree inode, we check if that page has a private pointer attached. If attached, we check if the existing extent buffer has proper refs. If not (the eb is being freed), we will detach that private eb pointer. The point here is that we are detaching the eb pointer by calling: - ClearPagePrivate() - put_page() The put_page() here is especially confusing, as it's dropping the ref taken by attach_page_private(). Without knowing that, it looks like the put_page() is for the find_or_create_page() call, confusing the reader. Since we're always modifying page private with attach_page_private() and detach_page_private(), the one remaining open-coded equivalent here is really confusing. Fix it by calling detach_page_private(). Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cdccfdbbe3aa..f514304b23bf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5278,13 +5278,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } exists = NULL; - /* - * Do this so attach doesn't complain and we need to - * drop the ref the old guy had. - */ - ClearPagePrivate(p); WARN_ON(PageDirty(p)); - put_page(p); + detach_page_private(p); } attach_extent_buffer_page(eb, p); spin_unlock(&mapping->private_lock); -- cgit v1.2.3 From a0f6d924cada10eefa526ccfa1be7888f559d9d7 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 13 Nov 2020 17:58:03 +0100 Subject: btrfs: remove stub device info from messages when we have no fs_info With a NULL fs_info the helpers will print something like BTRFS error (device ): ... This can happen in contexts where fs_info is not available at all or it's potentially unsafe due to object lifetime. The stub does not bring much information and together with the prefix makes the message unnecessarily long. Remove it for the NULL fs_info case. BTRFS error: ... Callers can add the device information to the message itself if needed. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/super.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6693cfc14dfd..348f8899f4f4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -240,9 +240,13 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, . vaf.fmt = fmt; vaf.va = &args; - if (__ratelimit(ratelimit)) - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, - fs_info ? fs_info->sb->s_id : "", &vaf); + if (__ratelimit(ratelimit)) { + if (fs_info) + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, + fs_info->sb->s_id, &vaf); + else + printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + } va_end(args); } -- cgit v1.2.3 From c7c01a4a2524b3f130c1821fbaf1677fe8394165 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 4 Nov 2020 16:12:45 +0100 Subject: btrfs: tree-checker: annotate all error branches as unlikely The tree checker is called many times as it verifies metadata at read/write time.
The checks follow a simple pattern: if (error_condition) { report_error(); return -EUCLEAN; } All the error reporting functions are annotated as __cold that is supposed to hint the compiler to move the statement block out of the hot path. This does not seem to happen that often. As the error condition is expected to be false almost always, we can annotate it with 'unlikely' as this satisfies one of the few use cases for the annotation. The expected outcome is a stronger hint to compiler to reorder the checks test jump to exit test jump to exit ... which can be observed in asm of eg. check_dir_item, btrfs_check_chunk_valid, check_root_item or check_leaf. There's a measurable run time improvement reported by Josef, the testing workload went from 655 MiB/s to 677 MiB/s, which is about +3%. There should be no functional changes but some of the conditions have been rewritten to produce more readable result, some lines are longer than 80, for the sake of readability. Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 333 +++++++++++++++++++++++++----------------------- 1 file changed, 173 insertions(+), 160 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index f3f666b343ef..028e733e42f3 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -100,7 +100,8 @@ static void file_extent_err(const struct extent_buffer *eb, int slot, */ #define CHECK_FE_ALIGNED(leaf, slot, fi, name, alignment) \ ({ \ - if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \ + if (unlikely(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), \ + (alignment)))) \ file_extent_err((leaf), (slot), \ "invalid %s for file extent, have %llu, should be aligned to %u", \ (#name), btrfs_file_extent_##name((leaf), (fi)), \ @@ -203,7 +204,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, u32 item_size = btrfs_item_size_nr(leaf, slot); u64 extent_end; - if (!IS_ALIGNED(key->offset, sectorsize)) { + if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { file_extent_err(leaf, slot, "unaligned file_offset for file extent, have %llu should be aligned to %u", key->offset, sectorsize); @@ -216,7 +217,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, * But if objectids mismatch, it means we have a missing * INODE_ITEM. */ - if (!check_prev_ino(leaf, key, slot, prev_key)) + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) return -EUCLEAN; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); @@ -225,14 +226,15 @@ static int check_extent_data_item(struct extent_buffer *leaf, * Make sure the item contains at least inline header, so the file * extent type is not some garbage. */ - if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) { + if (unlikely(item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START)) { file_extent_err(leaf, slot, "invalid item size, have %u expect [%zu, %u)", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START, SZ_4K); return -EUCLEAN; } - if (btrfs_file_extent_type(leaf, fi) >= BTRFS_NR_FILE_EXTENT_TYPES) { + if (unlikely(btrfs_file_extent_type(leaf, fi) >= + BTRFS_NR_FILE_EXTENT_TYPES)) { file_extent_err(leaf, slot, "invalid type for file extent, have %u expect range [0, %u]", btrfs_file_extent_type(leaf, fi), @@ -244,14 +246,15 @@ static int check_extent_data_item(struct extent_buffer *leaf, * Support for new compression/encryption must introduce incompat flag, * and must be caught in open_ctree(). 
*/ - if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_NR_COMPRESS_TYPES) { + if (unlikely(btrfs_file_extent_compression(leaf, fi) >= + BTRFS_NR_COMPRESS_TYPES)) { file_extent_err(leaf, slot, "invalid compression for file extent, have %u expect range [0, %u]", btrfs_file_extent_compression(leaf, fi), BTRFS_NR_COMPRESS_TYPES - 1); return -EUCLEAN; } - if (btrfs_file_extent_encryption(leaf, fi)) { + if (unlikely(btrfs_file_extent_encryption(leaf, fi))) { file_extent_err(leaf, slot, "invalid encryption for file extent, have %u expect 0", btrfs_file_extent_encryption(leaf, fi)); @@ -259,7 +262,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, } if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { /* Inline extent must have 0 as key offset */ - if (key->offset) { + if (unlikely(key->offset)) { file_extent_err(leaf, slot, "invalid file_offset for inline file extent, have %llu expect 0", key->offset); @@ -272,8 +275,8 @@ static int check_extent_data_item(struct extent_buffer *leaf, return 0; /* Uncompressed inline extent size must match item size */ - if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + - btrfs_file_extent_ram_bytes(leaf, fi)) { + if (unlikely(item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi))) { file_extent_err(leaf, slot, "invalid ram_bytes for uncompressed inline extent, have %u expect %llu", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START + @@ -284,22 +287,22 @@ static int check_extent_data_item(struct extent_buffer *leaf, } /* Regular or preallocated extent has fixed item size */ - if (item_size != sizeof(*fi)) { + if (unlikely(item_size != sizeof(*fi))) { file_extent_err(leaf, slot, "invalid item size for reg/prealloc file extent, have %u expect %zu", item_size, sizeof(*fi)); return -EUCLEAN; } - if (CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) || - CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) || - CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) || - CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) || - CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)) + if (unlikely(CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize))) return -EUCLEAN; /* Catch extent end overflow */ - if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi), - key->offset, &extent_end)) { + if (unlikely(check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi), + key->offset, &extent_end))) { file_extent_err(leaf, slot, "extent end overflow, have file offset %llu extent num bytes %llu", key->offset, @@ -320,7 +323,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, prev_fi = btrfs_item_ptr(leaf, slot - 1, struct btrfs_file_extent_item); prev_end = file_extent_end(leaf, prev_key, prev_fi); - if (prev_end > key->offset) { + if (unlikely(prev_end > key->offset)) { file_extent_err(leaf, slot - 1, "file extent end range (%llu) goes beyond start offset (%llu) of the next file extent", prev_end, key->offset); @@ -338,19 +341,19 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, u32 sectorsize = fs_info->sectorsize; const u32 csumsize = fs_info->csum_size; - if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { + if (unlikely(key->objectid != BTRFS_EXTENT_CSUM_OBJECTID)) { generic_err(leaf, slot, 
"invalid key objectid for csum item, have %llu expect %llu", key->objectid, BTRFS_EXTENT_CSUM_OBJECTID); return -EUCLEAN; } - if (!IS_ALIGNED(key->offset, sectorsize)) { + if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { generic_err(leaf, slot, "unaligned key offset for csum item, have %llu should be aligned to %u", key->offset, sectorsize); return -EUCLEAN; } - if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { + if (unlikely(!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize))) { generic_err(leaf, slot, "unaligned item size for csum item, have %u should be aligned to %u", btrfs_item_size_nr(leaf, slot), csumsize); @@ -363,7 +366,7 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, prev_item_size = btrfs_item_size_nr(leaf, slot - 1); prev_csum_end = (prev_item_size / csumsize) * sectorsize; prev_csum_end += prev_key->offset; - if (prev_csum_end > key->offset) { + if (unlikely(prev_csum_end > key->offset)) { generic_err(leaf, slot - 1, "csum end range (%llu) goes beyond the start range (%llu) of the next csum item", prev_csum_end, key->offset); @@ -388,15 +391,16 @@ static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key, /* For XATTR_ITEM, location key should be all 0 */ if (item_key.type == BTRFS_XATTR_ITEM_KEY) { - if (key->type != 0 || key->objectid != 0 || key->offset != 0) + if (unlikely(key->objectid != 0 || key->type != 0 || + key->offset != 0)) return -EUCLEAN; return 0; } - if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID || - key->objectid > BTRFS_LAST_FREE_OBJECTID) && - key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && - key->objectid != BTRFS_FREE_INO_OBJECTID) { + if (unlikely((key->objectid < BTRFS_FIRST_FREE_OBJECTID || + key->objectid > BTRFS_LAST_FREE_OBJECTID) && + key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && + key->objectid != BTRFS_FREE_INO_OBJECTID)) { if (is_inode_item) { generic_err(leaf, slot, "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", @@ -414,7 +418,7 @@ static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key, } return -EUCLEAN; } - if (key->offset != 0) { + if (unlikely(key->offset != 0)) { if (is_inode_item) inode_item_err(leaf, slot, "invalid key offset: has %llu expect 0", @@ -438,7 +442,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY); /* No such tree id */ - if (key->objectid == 0) { + if (unlikely(key->objectid == 0)) { if (is_root_item) generic_err(leaf, slot, "invalid root id 0"); else @@ -448,7 +452,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, } /* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */ - if (!is_fstree(key->objectid) && !is_root_item) { + if (unlikely(!is_fstree(key->objectid) && !is_root_item)) { dir_item_err(leaf, slot, "invalid location key objectid, have %llu expect [%llu, %llu]", key->objectid, BTRFS_FIRST_FREE_OBJECTID, @@ -464,7 +468,8 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, * So here we only check offset for reloc tree whose key->offset must * be a valid tree. 
*/ - if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) { + if (unlikely(key->objectid == BTRFS_TREE_RELOC_OBJECTID && + key->offset == 0)) { generic_err(leaf, slot, "invalid root id 0 for reloc tree"); return -EUCLEAN; } @@ -480,8 +485,9 @@ static int check_dir_item(struct extent_buffer *leaf, u32 item_size = btrfs_item_size_nr(leaf, slot); u32 cur = 0; - if (!check_prev_ino(leaf, key, slot, prev_key)) + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) return -EUCLEAN; + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); while (cur < item_size) { struct btrfs_key location_key; @@ -494,7 +500,7 @@ static int check_dir_item(struct extent_buffer *leaf, int ret; /* header itself should not cross item boundary */ - if (cur + sizeof(*di) > item_size) { + if (unlikely(cur + sizeof(*di) > item_size)) { dir_item_err(leaf, slot, "dir item header crosses item boundary, have %zu boundary %u", cur + sizeof(*di), item_size); @@ -505,12 +511,12 @@ static int check_dir_item(struct extent_buffer *leaf, btrfs_dir_item_key_to_cpu(leaf, di, &location_key); if (location_key.type == BTRFS_ROOT_ITEM_KEY) { ret = check_root_key(leaf, &location_key, slot); - if (ret < 0) + if (unlikely(ret < 0)) return ret; } else if (location_key.type == BTRFS_INODE_ITEM_KEY || location_key.type == 0) { ret = check_inode_key(leaf, &location_key, slot); - if (ret < 0) + if (unlikely(ret < 0)) return ret; } else { dir_item_err(leaf, slot, @@ -522,22 +528,22 @@ static int check_dir_item(struct extent_buffer *leaf, /* dir type check */ dir_type = btrfs_dir_type(leaf, di); - if (dir_type >= BTRFS_FT_MAX) { + if (unlikely(dir_type >= BTRFS_FT_MAX)) { dir_item_err(leaf, slot, "invalid dir item type, have %u expect [0, %u)", dir_type, BTRFS_FT_MAX); return -EUCLEAN; } - if (key->type == BTRFS_XATTR_ITEM_KEY && - dir_type != BTRFS_FT_XATTR) { + if (unlikely(key->type == BTRFS_XATTR_ITEM_KEY && + dir_type != BTRFS_FT_XATTR)) { dir_item_err(leaf, slot, "invalid dir item type for XATTR key, have %u expect %u", dir_type, BTRFS_FT_XATTR); return -EUCLEAN; } - if (dir_type == BTRFS_FT_XATTR && - key->type != BTRFS_XATTR_ITEM_KEY) { + if (unlikely(dir_type == BTRFS_FT_XATTR && + key->type != BTRFS_XATTR_ITEM_KEY)) { dir_item_err(leaf, slot, "xattr dir type found for non-XATTR key"); return -EUCLEAN; @@ -550,13 +556,13 @@ static int check_dir_item(struct extent_buffer *leaf, /* Name/data length check */ name_len = btrfs_dir_name_len(leaf, di); data_len = btrfs_dir_data_len(leaf, di); - if (name_len > max_name_len) { + if (unlikely(name_len > max_name_len)) { dir_item_err(leaf, slot, "dir item name len too long, have %u max %u", name_len, max_name_len); return -EUCLEAN; } - if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) { + if (unlikely(name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info))) { dir_item_err(leaf, slot, "dir item name and data len too long, have %u max %u", name_len + data_len, @@ -564,7 +570,7 @@ static int check_dir_item(struct extent_buffer *leaf, return -EUCLEAN; } - if (data_len && dir_type != BTRFS_FT_XATTR) { + if (unlikely(data_len && dir_type != BTRFS_FT_XATTR)) { dir_item_err(leaf, slot, "dir item with invalid data len, have %u expect 0", data_len); @@ -574,7 +580,7 @@ static int check_dir_item(struct extent_buffer *leaf, total_size = sizeof(*di) + name_len + data_len; /* header and name/data should not cross item boundary */ - if (cur + total_size > item_size) { + if (unlikely(cur + total_size > item_size)) { dir_item_err(leaf, slot, "dir item data crosses item boundary, have %u 
boundary %u", cur + total_size, item_size); @@ -592,7 +598,7 @@ static int check_dir_item(struct extent_buffer *leaf, read_extent_buffer(leaf, namebuf, (unsigned long)(di + 1), name_len); name_hash = btrfs_name_hash(namebuf, name_len); - if (key->offset != name_hash) { + if (unlikely(key->offset != name_hash)) { dir_item_err(leaf, slot, "name hash mismatch with key, have 0x%016x expect 0x%016llx", name_hash, key->offset); @@ -641,13 +647,13 @@ static int check_block_group_item(struct extent_buffer *leaf, * Here we don't really care about alignment since extent allocator can * handle it. We care more about the size. */ - if (key->offset == 0) { + if (unlikely(key->offset == 0)) { block_group_err(leaf, slot, "invalid block group size 0"); return -EUCLEAN; } - if (item_size != sizeof(bgi)) { + if (unlikely(item_size != sizeof(bgi))) { block_group_err(leaf, slot, "invalid item size, have %u expect %zu", item_size, sizeof(bgi)); @@ -656,8 +662,8 @@ static int check_block_group_item(struct extent_buffer *leaf, read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), sizeof(bgi)); - if (btrfs_stack_block_group_chunk_objectid(&bgi) != - BTRFS_FIRST_CHUNK_TREE_OBJECTID) { + if (unlikely(btrfs_stack_block_group_chunk_objectid(&bgi) != + BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { block_group_err(leaf, slot, "invalid block group chunk objectid, have %llu expect %llu", btrfs_stack_block_group_chunk_objectid(&bgi), @@ -665,7 +671,7 @@ static int check_block_group_item(struct extent_buffer *leaf, return -EUCLEAN; } - if (btrfs_stack_block_group_used(&bgi) > key->offset) { + if (unlikely(btrfs_stack_block_group_used(&bgi) > key->offset)) { block_group_err(leaf, slot, "invalid block group used, have %llu expect [0, %llu)", btrfs_stack_block_group_used(&bgi), key->offset); @@ -673,7 +679,7 @@ static int check_block_group_item(struct extent_buffer *leaf, } flags = btrfs_stack_block_group_flags(&bgi); - if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) { + if (unlikely(hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1)) { block_group_err(leaf, slot, "invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set", flags & BTRFS_BLOCK_GROUP_PROFILE_MASK, @@ -682,11 +688,11 @@ static int check_block_group_item(struct extent_buffer *leaf, } type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; - if (type != BTRFS_BLOCK_GROUP_DATA && - type != BTRFS_BLOCK_GROUP_METADATA && - type != BTRFS_BLOCK_GROUP_SYSTEM && - type != (BTRFS_BLOCK_GROUP_METADATA | - BTRFS_BLOCK_GROUP_DATA)) { + if (unlikely(type != BTRFS_BLOCK_GROUP_DATA && + type != BTRFS_BLOCK_GROUP_METADATA && + type != BTRFS_BLOCK_GROUP_SYSTEM && + type != (BTRFS_BLOCK_GROUP_METADATA | + BTRFS_BLOCK_GROUP_DATA))) { block_group_err(leaf, slot, "invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", type, hweight64(type), @@ -773,49 +779,49 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, ncopies = btrfs_raid_array[raid_index].ncopies; nparity = btrfs_raid_array[raid_index].nparity; - if (!num_stripes) { + if (unlikely(!num_stripes)) { chunk_err(leaf, chunk, logical, "invalid chunk num_stripes, have %u", num_stripes); return -EUCLEAN; } - if (num_stripes < ncopies) { + if (unlikely(num_stripes < ncopies)) { chunk_err(leaf, chunk, logical, "invalid chunk num_stripes < ncopies, have %u < %d", num_stripes, ncopies); return -EUCLEAN; } - if (nparity && num_stripes == nparity) { + if (unlikely(nparity && num_stripes == nparity)) { chunk_err(leaf, chunk, logical, "invalid chunk num_stripes == 
nparity, have %u == %d", num_stripes, nparity); return -EUCLEAN; } - if (!IS_ALIGNED(logical, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) { chunk_err(leaf, chunk, logical, "invalid chunk logical, have %llu should aligned to %u", logical, fs_info->sectorsize); return -EUCLEAN; } - if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { + if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) { chunk_err(leaf, chunk, logical, "invalid chunk sectorsize, have %u expect %u", btrfs_chunk_sector_size(leaf, chunk), fs_info->sectorsize); return -EUCLEAN; } - if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { + if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) { chunk_err(leaf, chunk, logical, "invalid chunk length, have %llu", length); return -EUCLEAN; } - if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { + if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) { chunk_err(leaf, chunk, logical, "invalid chunk stripe length: %llu", stripe_len); return -EUCLEAN; } - if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & - type) { + if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK))) { chunk_err(leaf, chunk, logical, "unrecognized chunk type: 0x%llx", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | @@ -824,22 +830,23 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, return -EUCLEAN; } - if (!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && - (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) { + if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && + (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) { chunk_err(leaf, chunk, logical, "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", type & BTRFS_BLOCK_GROUP_PROFILE_MASK); return -EUCLEAN; } - if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { + if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) { chunk_err(leaf, chunk, logical, "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", type, BTRFS_BLOCK_GROUP_TYPE_MASK); return -EUCLEAN; } - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && - (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { + if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) && + (type & (BTRFS_BLOCK_GROUP_METADATA | + BTRFS_BLOCK_GROUP_DATA)))) { chunk_err(leaf, chunk, logical, "system chunk with data or metadata type: 0x%llx", type); @@ -851,20 +858,21 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, mixed = true; if (!mixed) { - if ((type & BTRFS_BLOCK_GROUP_METADATA) && - (type & BTRFS_BLOCK_GROUP_DATA)) { + if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) && + (type & BTRFS_BLOCK_GROUP_DATA))) { chunk_err(leaf, chunk, logical, "mixed chunk type in non-mixed mode: 0x%llx", type); return -EUCLEAN; } } - if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || - (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || - (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || - ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) { + if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || + (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || + ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) 
== 0 && + num_stripes != 1))) { chunk_err(leaf, chunk, logical, "invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, @@ -887,7 +895,7 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, { int num_stripes; - if (btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk)) { + if (unlikely(btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk))) { chunk_err(leaf, chunk, key->offset, "invalid chunk item size: have %u expect [%zu, %u)", btrfs_item_size_nr(leaf, slot), @@ -901,8 +909,8 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, if (num_stripes == 0) goto out; - if (btrfs_chunk_item_size(num_stripes) != - btrfs_item_size_nr(leaf, slot)) { + if (unlikely(btrfs_chunk_item_size(num_stripes) != + btrfs_item_size_nr(leaf, slot))) { chunk_err(leaf, chunk, key->offset, "invalid chunk item size: have %u expect %lu", btrfs_item_size_nr(leaf, slot), @@ -941,14 +949,14 @@ static int check_dev_item(struct extent_buffer *leaf, { struct btrfs_dev_item *ditem; - if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) { + if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) { dev_item_err(leaf, slot, "invalid objectid: has=%llu expect=%llu", key->objectid, BTRFS_DEV_ITEMS_OBJECTID); return -EUCLEAN; } ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); - if (btrfs_device_id(leaf, ditem) != key->offset) { + if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) { dev_item_err(leaf, slot, "devid mismatch: key has=%llu item has=%llu", key->offset, btrfs_device_id(leaf, ditem)); @@ -960,8 +968,8 @@ static int check_dev_item(struct extent_buffer *leaf, * it can be 0 for device removal. Device size check can only be done * by dev extents check. */ - if (btrfs_device_bytes_used(leaf, ditem) > - btrfs_device_total_bytes(leaf, ditem)) { + if (unlikely(btrfs_device_bytes_used(leaf, ditem) > + btrfs_device_total_bytes(leaf, ditem))) { dev_item_err(leaf, slot, "invalid bytes used: have %llu expect [0, %llu]", btrfs_device_bytes_used(leaf, ditem), @@ -986,13 +994,13 @@ static int check_inode_item(struct extent_buffer *leaf, int ret; ret = check_inode_key(leaf, key, slot); - if (ret < 0) + if (unlikely(ret < 0)) return ret; iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); /* Here we use super block generation + 1 to handle log tree */ - if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) { + if (unlikely(btrfs_inode_generation(leaf, iitem) > super_gen + 1)) { inode_item_err(leaf, slot, "invalid inode generation: has %llu expect (0, %llu]", btrfs_inode_generation(leaf, iitem), @@ -1000,7 +1008,7 @@ static int check_inode_item(struct extent_buffer *leaf, return -EUCLEAN; } /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ - if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { + if (unlikely(btrfs_inode_transid(leaf, iitem) > super_gen + 1)) { inode_item_err(leaf, slot, "invalid inode transid: has %llu expect [0, %llu]", btrfs_inode_transid(leaf, iitem), super_gen + 1); @@ -1013,7 +1021,7 @@ static int check_inode_item(struct extent_buffer *leaf, * anything in the fs. So here we skip the check. */ mode = btrfs_inode_mode(leaf, iitem); - if (mode & ~valid_mask) { + if (unlikely(mode & ~valid_mask)) { inode_item_err(leaf, slot, "unknown mode bit detected: 0x%x", mode & ~valid_mask); @@ -1026,20 +1034,20 @@ static int check_inode_item(struct extent_buffer *leaf, * FIFO/CHR/DIR/REG. 
Only needs to check BLK, LNK and SOCKS */ if (!has_single_bit_set(mode & S_IFMT)) { - if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) { + if (unlikely(!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode))) { inode_item_err(leaf, slot, "invalid mode: has 0%o expect valid S_IF* bit(s)", mode & S_IFMT); return -EUCLEAN; } } - if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) { + if (unlikely(S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1)) { inode_item_err(leaf, slot, "invalid nlink: has %u expect no more than 1 for dir", btrfs_inode_nlink(leaf, iitem)); return -EUCLEAN; } - if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) { + if (unlikely(btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)) { inode_item_err(leaf, slot, "unknown flags detected: 0x%llx", btrfs_inode_flags(leaf, iitem) & @@ -1059,11 +1067,12 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, int ret; ret = check_root_key(leaf, key, slot); - if (ret < 0) + if (unlikely(ret < 0)) return ret; - if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) && - btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) { + if (unlikely(btrfs_item_size_nr(leaf, slot) != sizeof(ri) && + btrfs_item_size_nr(leaf, slot) != + btrfs_legacy_root_item_size())) { generic_err(leaf, slot, "invalid root item size, have %u expect %zu or %u", btrfs_item_size_nr(leaf, slot), sizeof(ri), @@ -1080,24 +1089,24 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, btrfs_item_size_nr(leaf, slot)); /* Generation related */ - if (btrfs_root_generation(&ri) > - btrfs_super_generation(fs_info->super_copy) + 1) { + if (unlikely(btrfs_root_generation(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1)) { generic_err(leaf, slot, "invalid root generation, have %llu expect (0, %llu]", btrfs_root_generation(&ri), btrfs_super_generation(fs_info->super_copy) + 1); return -EUCLEAN; } - if (btrfs_root_generation_v2(&ri) > - btrfs_super_generation(fs_info->super_copy) + 1) { + if (unlikely(btrfs_root_generation_v2(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1)) { generic_err(leaf, slot, "invalid root v2 generation, have %llu expect (0, %llu]", btrfs_root_generation_v2(&ri), btrfs_super_generation(fs_info->super_copy) + 1); return -EUCLEAN; } - if (btrfs_root_last_snapshot(&ri) > - btrfs_super_generation(fs_info->super_copy) + 1) { + if (unlikely(btrfs_root_last_snapshot(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1)) { generic_err(leaf, slot, "invalid root last_snapshot, have %llu expect (0, %llu]", btrfs_root_last_snapshot(&ri), @@ -1106,19 +1115,19 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, } /* Alignment and level check */ - if (!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize))) { generic_err(leaf, slot, "invalid root bytenr, have %llu expect to be aligned to %u", btrfs_root_bytenr(&ri), fs_info->sectorsize); return -EUCLEAN; } - if (btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL) { + if (unlikely(btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL)) { generic_err(leaf, slot, "invalid root level, have %u expect [0, %u]", btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1); return -EUCLEAN; } - if (btrfs_root_drop_level(&ri) >= BTRFS_MAX_LEVEL) { + if (unlikely(btrfs_root_drop_level(&ri) >= BTRFS_MAX_LEVEL)) { generic_err(leaf, slot, "invalid root level, have %u expect [0, %u]", btrfs_root_drop_level(&ri), BTRFS_MAX_LEVEL - 1); @@ -1126,7 +1135,7 @@ static 
int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, } /* Flags check */ - if (btrfs_root_flags(&ri) & ~valid_root_flags) { + if (unlikely(btrfs_root_flags(&ri) & ~valid_root_flags)) { generic_err(leaf, slot, "invalid root flags, have 0x%llx expect mask 0x%llx", btrfs_root_flags(&ri), valid_root_flags); @@ -1180,14 +1189,14 @@ static int check_extent_item(struct extent_buffer *leaf, u64 total_refs; /* Total refs in btrfs_extent_item */ u64 inline_refs = 0; /* found total inline refs */ - if (key->type == BTRFS_METADATA_ITEM_KEY && - !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { + if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY && + !btrfs_fs_incompat(fs_info, SKINNY_METADATA))) { generic_err(leaf, slot, "invalid key type, METADATA_ITEM type invalid when SKINNY_METADATA feature disabled"); return -EUCLEAN; } /* key->objectid is the bytenr for both key types */ - if (!IS_ALIGNED(key->objectid, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(key->objectid, fs_info->sectorsize))) { generic_err(leaf, slot, "invalid key objectid, have %llu expect to be aligned to %u", key->objectid, fs_info->sectorsize); @@ -1195,8 +1204,8 @@ static int check_extent_item(struct extent_buffer *leaf, } /* key->offset is tree level for METADATA_ITEM_KEY */ - if (key->type == BTRFS_METADATA_ITEM_KEY && - key->offset >= BTRFS_MAX_LEVEL) { + if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY && + key->offset >= BTRFS_MAX_LEVEL)) { extent_err(leaf, slot, "invalid tree level, have %llu expect [0, %u]", key->offset, BTRFS_MAX_LEVEL - 1); @@ -1222,7 +1231,7 @@ static int check_extent_item(struct extent_buffer *leaf, * Either using btrfs_extent_inline_ref::offset, or specific * data structure. */ - if (item_size < sizeof(*ei)) { + if (unlikely(item_size < sizeof(*ei))) { extent_err(leaf, slot, "invalid item size, have %u expect [%zu, %u)", item_size, sizeof(*ei), @@ -1236,15 +1245,16 @@ static int check_extent_item(struct extent_buffer *leaf, flags = btrfs_extent_flags(leaf, ei); total_refs = btrfs_extent_refs(leaf, ei); generation = btrfs_extent_generation(leaf, ei); - if (generation > btrfs_super_generation(fs_info->super_copy) + 1) { + if (unlikely(generation > + btrfs_super_generation(fs_info->super_copy) + 1)) { extent_err(leaf, slot, "invalid generation, have %llu expect (0, %llu]", generation, btrfs_super_generation(fs_info->super_copy) + 1); return -EUCLEAN; } - if (!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA | - BTRFS_EXTENT_FLAG_TREE_BLOCK))) { + if (unlikely(!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA | + BTRFS_EXTENT_FLAG_TREE_BLOCK)))) { extent_err(leaf, slot, "invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx", flags, BTRFS_EXTENT_FLAG_DATA | @@ -1253,21 +1263,21 @@ static int check_extent_item(struct extent_buffer *leaf, } is_tree_block = !!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK); if (is_tree_block) { - if (key->type == BTRFS_EXTENT_ITEM_KEY && - key->offset != fs_info->nodesize) { + if (unlikely(key->type == BTRFS_EXTENT_ITEM_KEY && + key->offset != fs_info->nodesize)) { extent_err(leaf, slot, "invalid extent length, have %llu expect %u", key->offset, fs_info->nodesize); return -EUCLEAN; } } else { - if (key->type != BTRFS_EXTENT_ITEM_KEY) { + if (unlikely(key->type != BTRFS_EXTENT_ITEM_KEY)) { extent_err(leaf, slot, "invalid key type, have %u expect %u for data backref", key->type, BTRFS_EXTENT_ITEM_KEY); return -EUCLEAN; } - if (!IS_ALIGNED(key->offset, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(key->offset, fs_info->sectorsize))) { extent_err(leaf, 
slot, "invalid extent length, have %llu expect aligned to %u", key->offset, fs_info->sectorsize); @@ -1281,7 +1291,7 @@ static int check_extent_item(struct extent_buffer *leaf, struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)ptr; - if (btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL) { + if (unlikely(btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL)) { extent_err(leaf, slot, "invalid tree block info level, have %u expect [0, %u]", btrfs_tree_block_level(leaf, info), @@ -1300,7 +1310,7 @@ static int check_extent_item(struct extent_buffer *leaf, u64 inline_offset; u8 inline_type; - if (ptr + sizeof(*iref) > end) { + if (unlikely(ptr + sizeof(*iref) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %zu end %lu", ptr, sizeof(*iref), end); @@ -1309,7 +1319,7 @@ static int check_extent_item(struct extent_buffer *leaf, iref = (struct btrfs_extent_inline_ref *)ptr; inline_type = btrfs_extent_inline_ref_type(leaf, iref); inline_offset = btrfs_extent_inline_ref_offset(leaf, iref); - if (ptr + btrfs_extent_inline_ref_size(inline_type) > end) { + if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %u end %lu", ptr, inline_type, end); @@ -1323,7 +1333,8 @@ static int check_extent_item(struct extent_buffer *leaf, break; /* Contains parent bytenr */ case BTRFS_SHARED_BLOCK_REF_KEY: - if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(inline_offset, + fs_info->sectorsize))) { extent_err(leaf, slot, "invalid tree parent bytenr, have %llu expect aligned to %u", inline_offset, fs_info->sectorsize); @@ -1338,7 +1349,8 @@ static int check_extent_item(struct extent_buffer *leaf, case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); dref_offset = btrfs_extent_data_ref_offset(leaf, dref); - if (!IS_ALIGNED(dref_offset, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(dref_offset, + fs_info->sectorsize))) { extent_err(leaf, slot, "invalid data ref offset, have %llu expect aligned to %u", dref_offset, fs_info->sectorsize); @@ -1349,7 +1361,8 @@ static int check_extent_item(struct extent_buffer *leaf, /* Contains parent bytenr and ref count */ case BTRFS_SHARED_DATA_REF_KEY: sref = (struct btrfs_shared_data_ref *)(iref + 1); - if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(inline_offset, + fs_info->sectorsize))) { extent_err(leaf, slot, "invalid data parent bytenr, have %llu expect aligned to %u", inline_offset, fs_info->sectorsize); @@ -1365,14 +1378,14 @@ static int check_extent_item(struct extent_buffer *leaf, ptr += btrfs_extent_inline_ref_size(inline_type); } /* No padding is allowed */ - if (ptr != end) { + if (unlikely(ptr != end)) { extent_err(leaf, slot, "invalid extent item size, padding bytes found"); return -EUCLEAN; } /* Finally, check the inline refs against total refs */ - if (inline_refs > total_refs) { + if (unlikely(inline_refs > total_refs)) { extent_err(leaf, slot, "invalid extent refs, have %llu expect >= inline %llu", total_refs, inline_refs); @@ -1389,21 +1402,21 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, if (key->type == BTRFS_SHARED_DATA_REF_KEY) expect_item_size = sizeof(struct btrfs_shared_data_ref); - if (btrfs_item_size_nr(leaf, slot) != expect_item_size) { + if (unlikely(btrfs_item_size_nr(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, "invalid item size, have %u expect %u 
for key type %u", btrfs_item_size_nr(leaf, slot), expect_item_size, key->type); return -EUCLEAN; } - if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) { generic_err(leaf, slot, "invalid key objectid for shared block ref, have %llu expect aligned to %u", key->objectid, leaf->fs_info->sectorsize); return -EUCLEAN; } - if (key->type != BTRFS_TREE_BLOCK_REF_KEY && - !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize)) { + if (unlikely(key->type != BTRFS_TREE_BLOCK_REF_KEY && + !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize))) { extent_err(leaf, slot, "invalid tree parent bytenr, have %llu expect aligned to %u", key->offset, leaf->fs_info->sectorsize); @@ -1419,14 +1432,14 @@ static int check_extent_data_ref(struct extent_buffer *leaf, unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot); - if (btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0) { + if (unlikely(btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0)) { generic_err(leaf, slot, "invalid item size, have %u expect aligned to %zu for key type %u", btrfs_item_size_nr(leaf, slot), sizeof(*dref), key->type); return -EUCLEAN; } - if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) { generic_err(leaf, slot, "invalid key objectid for shared block ref, have %llu expect aligned to %u", key->objectid, leaf->fs_info->sectorsize); @@ -1443,13 +1456,13 @@ static int check_extent_data_ref(struct extent_buffer *leaf, owner = btrfs_extent_data_ref_objectid(leaf, dref); offset = btrfs_extent_data_ref_offset(leaf, dref); hash = hash_extent_data_ref(root_objectid, owner, offset); - if (hash != key->offset) { + if (unlikely(hash != key->offset)) { extent_err(leaf, slot, "invalid extent data ref hash, item has 0x%016llx key has 0x%016llx", hash, key->offset); return -EUCLEAN; } - if (!IS_ALIGNED(offset, leaf->fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) { extent_err(leaf, slot, "invalid extent data backref offset, have %llu expect aligned to %u", offset, leaf->fs_info->sectorsize); @@ -1469,10 +1482,10 @@ static int check_inode_ref(struct extent_buffer *leaf, unsigned long ptr; unsigned long end; - if (!check_prev_ino(leaf, key, slot, prev_key)) + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) return -EUCLEAN; /* namelen can't be 0, so item_size == sizeof() is also invalid */ - if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) { + if (unlikely(btrfs_item_size_nr(leaf, slot) <= sizeof(*iref))) { inode_ref_err(leaf, slot, "invalid item size, have %u expect (%zu, %u)", btrfs_item_size_nr(leaf, slot), @@ -1485,7 +1498,7 @@ static int check_inode_ref(struct extent_buffer *leaf, while (ptr < end) { u16 namelen; - if (ptr + sizeof(iref) > end) { + if (unlikely(ptr + sizeof(iref) > end)) { inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu inode_ref_size %zu", ptr, end, sizeof(iref)); @@ -1494,7 +1507,7 @@ static int check_inode_ref(struct extent_buffer *leaf, iref = (struct btrfs_inode_ref *)ptr; namelen = btrfs_inode_ref_name_len(leaf, iref); - if (ptr + sizeof(*iref) + namelen > end) { + if (unlikely(ptr + sizeof(*iref) + namelen > end)) { inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu namelen %u", ptr, end, namelen); @@ -1577,7 +1590,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) u32 nritems = 
btrfs_header_nritems(leaf); int slot; - if (btrfs_header_level(leaf) != 0) { + if (unlikely(btrfs_header_level(leaf) != 0)) { generic_err(leaf, 0, "invalid level for leaf, have %d expect 0", btrfs_header_level(leaf)); @@ -1596,19 +1609,19 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) u64 owner = btrfs_header_owner(leaf); /* These trees must never be empty */ - if (owner == BTRFS_ROOT_TREE_OBJECTID || - owner == BTRFS_CHUNK_TREE_OBJECTID || - owner == BTRFS_EXTENT_TREE_OBJECTID || - owner == BTRFS_DEV_TREE_OBJECTID || - owner == BTRFS_FS_TREE_OBJECTID || - owner == BTRFS_DATA_RELOC_TREE_OBJECTID) { + if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID || + owner == BTRFS_CHUNK_TREE_OBJECTID || + owner == BTRFS_EXTENT_TREE_OBJECTID || + owner == BTRFS_DEV_TREE_OBJECTID || + owner == BTRFS_FS_TREE_OBJECTID || + owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { generic_err(leaf, 0, "invalid root, root %llu must never be empty", owner); return -EUCLEAN; } /* Unknown tree */ - if (owner == 0) { + if (unlikely(owner == 0)) { generic_err(leaf, 0, "invalid owner, root 0 is not defined"); return -EUCLEAN; @@ -1616,7 +1629,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) return 0; } - if (nritems == 0) + if (unlikely(nritems == 0)) return 0; /* @@ -1637,7 +1650,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) btrfs_item_key_to_cpu(leaf, &key, slot); /* Make sure the keys are in the right order */ - if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { + if (unlikely(btrfs_comp_cpu_keys(&prev_key, &key) >= 0)) { generic_err(leaf, slot, "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", prev_key.objectid, prev_key.type, @@ -1656,7 +1669,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) else item_end_expected = btrfs_item_offset_nr(leaf, slot - 1); - if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { + if (unlikely(btrfs_item_end_nr(leaf, slot) != item_end_expected)) { generic_err(leaf, slot, "unexpected item end, have %u expect %u", btrfs_item_end_nr(leaf, slot), @@ -1669,8 +1682,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) * just in case all the items are consistent to each other, but * all point outside of the leaf. */ - if (btrfs_item_end_nr(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(fs_info)) { + if (unlikely(btrfs_item_end_nr(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(fs_info))) { generic_err(leaf, slot, "slot end outside of leaf, have %u expect range [0, %u]", btrfs_item_end_nr(leaf, slot), @@ -1679,8 +1692,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) } /* Also check if the item pointer overlaps with btrfs item. 
*/ - if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > - btrfs_item_ptr_offset(leaf, slot)) { + if (unlikely(btrfs_item_ptr_offset(leaf, slot) < + btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item))) { generic_err(leaf, slot, "slot overlaps with its data, item end %lu data start %lu", btrfs_item_nr_offset(slot) + @@ -1695,7 +1708,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) * criteria */ ret = check_leaf_item(leaf, &key, slot, &prev_key); - if (ret < 0) + if (unlikely(ret < 0)) return ret; } @@ -1728,13 +1741,13 @@ int btrfs_check_node(struct extent_buffer *node) u64 bytenr; int ret = 0; - if (level <= 0 || level >= BTRFS_MAX_LEVEL) { + if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) { generic_err(node, 0, "invalid level for node, have %d expect [1, %d]", level, BTRFS_MAX_LEVEL - 1); return -EUCLEAN; } - if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) { + if (unlikely(nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info))) { btrfs_crit(fs_info, "corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", btrfs_header_owner(node), node->start, @@ -1748,13 +1761,13 @@ int btrfs_check_node(struct extent_buffer *node) btrfs_node_key_to_cpu(node, &key, slot); btrfs_node_key_to_cpu(node, &next_key, slot + 1); - if (!bytenr) { + if (unlikely(!bytenr)) { generic_err(node, slot, "invalid NULL node pointer"); ret = -EUCLEAN; goto out; } - if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(bytenr, fs_info->sectorsize))) { generic_err(node, slot, "unaligned pointer, have %llu should be aligned to %u", bytenr, fs_info->sectorsize); @@ -1762,7 +1775,7 @@ int btrfs_check_node(struct extent_buffer *node) goto out; } - if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { + if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) { generic_err(node, slot, "bad key order, current (%llu %u %llu) next (%llu %u %llu)", key.objectid, key.type, key.offset, -- cgit v1.2.3 From c65ca98f9e687196a840bd8b71d28d32ffe91170 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Nov 2020 11:00:17 +0000 Subject: btrfs: unlock path before checking if extent is shared during nocow writeback When we are attempting to start writeback for an existing extent in NOCOW mode, at run_delalloc_nocow(), we must check if the extent is shared, and if it is, fallback to a COW write. However we do such check while still holding a read lock on the leaf that contains the file extent item, and that check, the call to btrfs_cross_ref_exist(), can take some time because: 1) It needs to do a search on the extent tree, which obviously takes some time, specially if delayed references are being run at the moment, as we can block when trying to lock currently write locked btree nodes; 2) It needs to check the delayed references for any existing reference for our data extent, this requires acquiring the delayed references' spinlock and maybe block on the mutex of a delayed reference head in the case where there is a delayed reference for our data extent, in the worst case it makes us release the path on the extent tree and retry the whole process again (going back to step 1). There are other operations we do while holding the leaf locked that can take some significant time as well (specially all together): * btrfs_extent_readonly() - to check if the block group containing the extent is currently in RO mode. 
This requires taking a spinlock and searching for the block group in a rbtree that can be big on large filesystems; * csum_exist_in_range() - to search if there are any checksums in the csum tree for the extent. Like before, this can take some time if we are in a filesystem that has both COW and NOCOW files, in which case the csum tree is not empty; * btrfs_inc_nocow_writers() - increment the number of nocow writers in the block group that contains the data extent. Needs to acquire a spinlock and search for the block group in a rbtree that can be big on large filesystems. So just unlock the leaf (release the path) before doing all those checks, since we do not need it anymore. In case we can not do a NOCOW write for the extent, due to any of those checks failing, and the writeback range goes beyond that extents' length, we will do another btree search for the next file extent item. The following script that calls dbench was used to measure the impact of this change on a VM with 8 CPUs, 16Gb of ram, using a raw NVMe device directly (no intermediary filesystem on the host) and using a non-debug kernel (default configuration on Debian): $ cat test-dbench.sh #!/bin/bash DEV=/dev/sdk MNT=/mnt/sdk MOUNT_OPTIONS="-o ssd -o nodatacow" MKFS_OPTIONS="-m single -d single" mkfs.btrfs -f $MKFS_OPTIONS $DEV mount $MOUNT_OPTIONS $DEV $MNT dbench -D $MNT -t 300 64 umount $MNT Before this change: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 9326331 0.317 399.957 Close 6851198 0.002 6.402 Rename 394894 2.621 402.819 Unlink 1883131 0.931 398.082 Deltree 256 19.160 303.580 Mkdir 128 0.003 0.016 Qpathinfo 8452314 0.068 116.133 Qfileinfo 1481921 0.001 5.081 Qfsinfo 1549963 0.002 4.444 Sfileinfo 759679 0.084 17.079 Find 3268168 0.396 118.196 WriteX 4653310 0.056 110.993 ReadX 14618818 0.005 23.314 LockX 30364 0.003 0.497 UnlockX 30364 0.002 1.720 Flush 653619 16.954 569.299 Throughput 966.651 MB/sec 64 clients 64 procs max_latency=569.377 ms After this change: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 9710433 0.302 232.449 Close 7132948 0.002 11.496 Rename 411144 2.452 131.805 Unlink 1960961 0.893 230.383 Deltree 256 14.858 198.646 Mkdir 128 0.002 0.005 Qpathinfo 8800890 0.066 111.588 Qfileinfo 1542556 0.001 3.852 Qfsinfo 1613835 0.002 5.483 Sfileinfo 790871 0.081 19.492 Find 3402743 0.386 120.185 WriteX 4842918 0.054 179.312 ReadX 15220407 0.005 32.435 LockX 31612 0.003 1.533 UnlockX 31612 0.002 1.047 Flush 680567 16.320 463.323 Throughput 1016.59 MB/sec 64 clients 64 procs max_latency=463.327 ms +5.0% throughput, -20.5% max latency Also, the following test using fio was run: $ cat test-fio.sh #!/bin/bash DEV=/dev/sdk MNT=/mnt/sdk MOUNT_OPTIONS="-o ssd -o nodatacow" MKFS_OPTIONS="-d single -m single" if [ $# -ne 4 ]; then echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ BLOCK_SIZE" exit 1 fi NUM_JOBS=$1 FILE_SIZE=$2 FSYNC_FREQ=$3 BLOCK_SIZE=$4 cat < /tmp/fio-job.ini [writers] rw=randwrite fsync=$FSYNC_FREQ fallocate=none group_reporting=1 direct=0 bs=$BLOCK_SIZE ioengine=sync size=$FILE_SIZE directory=$MNT numjobs=$NUM_JOBS EOF echo echo "Using fio config:" echo cat /tmp/fio-job.ini echo echo "mount options: $MOUNT_OPTIONS" echo mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null mount $MOUNT_OPTIONS $DEV $MNT echo "Creating nodatacow files before fio runs..." 
for ((i = 0; i < $NUM_JOBS; i++)); do xfs_io -f -c "pwrite -b 128M 0 $FILE_SIZE" "$MNT/writers.$i.0" done sync fio /tmp/fio-job.ini umount $MNT Before this change: $ ./test-fio.sh 16 512M 2 4K (...) WRITE: bw=28.3MiB/s (29.6MB/s), 28.3MiB/s-28.3MiB/s (29.6MB/s-29.6MB/s), io=8192MiB (8590MB), run=289800-289800msec After this change: $ ./test-fio.sh 16 512M 2 4K (...) WRITE: bw=31.2MiB/s (32.7MB/s), 31.2MiB/s-31.2MiB/s (32.7MB/s-32.7MB/s), io=8192MiB (8590MB), run=262845-262845msec +9.7% throughput, -9.8% runtime Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index abc0fd162f6c..dda181098b4c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1649,6 +1649,15 @@ next_slot: goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) goto out_check; + + /* + * The following checks can be expensive, as they need to + * take other locks and do btree or rbtree searches, so + * release the path to avoid blocking other tasks for too + * long. + */ + btrfs_release_path(path); + /* If extent is RO, we must COW it */ if (btrfs_extent_readonly(fs_info, disk_bytenr)) goto out_check; @@ -1724,12 +1733,12 @@ out_check: cur_offset = extent_end; if (cur_offset > end) break; + if (!path->nodes[0]) + continue; path->slots[0]++; goto next_slot; } - btrfs_release_path(path); - /* * COW range from cow_start to found_key.offset - 1. As the key * will contain the beginning of the first extent that can be -- cgit v1.2.3 From ee0d904fd9c5662c58a737c77384f8959fdc8d12 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 24 Nov 2020 17:49:30 +0200 Subject: btrfs: remove err variable from btrfs_delete_subvolume Use only a single 'ret' to control whether we should abort the transaction or not. That's fine, because if we abort a transaction then btrfs_end_transaction will return the same value as passed to btrfs_abort_transaction. No semantic changes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dda181098b4c..347076f1da6e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4107,7 +4107,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) struct btrfs_block_rsv block_rsv; u64 root_flags; int ret; - int err; /* * Don't allow to delete a subvolume with send in progress. This is @@ -4129,8 +4128,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) down_write(&fs_info->subvol_sem); - err = may_destroy_subvol(dest); - if (err) + ret = may_destroy_subvol(dest); + if (ret) goto out_up_write; btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); @@ -4139,13 +4138,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) * two for dir entries, * two for root ref/backref. 
*/ - err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); - if (err) + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); + if (ret) goto out_up_write; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_release; } trans->block_rsv = &block_rsv; @@ -4155,7 +4154,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) ret = btrfs_unlink_subvol(trans, dir, dentry); if (ret) { - err = ret; btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4173,7 +4171,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) dest->root_key.objectid); if (ret) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } } @@ -4183,7 +4180,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) dest->root_key.objectid); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { @@ -4193,7 +4189,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) dest->root_key.objectid); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } } @@ -4204,14 +4199,12 @@ out_end_trans: trans->block_rsv = NULL; trans->bytes_reserved = 0; ret = btrfs_end_transaction(trans); - if (ret && !err) - err = ret; inode->i_flags |= S_DEAD; out_release: btrfs_subvolume_release_metadata(root, &block_rsv); out_up_write: up_write(&fs_info->subvol_sem); - if (err) { + if (ret) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, @@ -4229,7 +4222,7 @@ out_up_write: } } - return err; + return ret; } static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) -- cgit v1.2.3 From c6a592f2e2093e6a90d651b073746c7950d6420d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 24 Nov 2020 17:49:31 +0200 Subject: btrfs: eliminate err variable from merge_reloc_root In most cases when an error is returned from a function 'ret' is simply assigned to 'err'. There is only one case where walk_up_reloc_tree can return a positive value - in this case the code breaks from the loop and ret is going to get its return value from btrfs_cow_block - either 0 or negative. This retains the old logic of how 'err' used to be set at this call site. 
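For illustration only, the single status variable pattern this commit converts to can be sketched as a self-contained userspace C program; do_step() and cleanup() below are made-up stand-ins for the transaction and unlink calls, not functions from this patch:

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-in for one step that can fail with a negative errno. */
static int do_step(int n)
{
	return (n == 2) ? -EIO : 0;	/* fail the third step */
}

/* Hypothetical stand-in for the shared cleanup path (out_up_write above). */
static void cleanup(void)
{
	puts("cleanup");
}

static int run(void)
{
	int ret = 0;	/* single status variable, no shadowing 'err' */
	int i;

	for (i = 0; i < 4; i++) {
		ret = do_step(i);
		if (ret)
			goto out;	/* 'ret' already carries the error */
	}
out:
	cleanup();	/* runs on success and failure alike */
	return ret;	/* 0 on success, negative code on failure */
}

int main(void)
{
	printf("run() = %d\n", run());
	return 0;
}

With a single variable there is no window where a later call overwrites the status that should have been returned, which is exactly the mistake the 'err'/'ret' split invites.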
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index c5774a8e6ff7..8fc75db901c8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1630,8 +1630,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, int level; int max_level; int replaced = 0; - int ret; - int err = 0; + int ret = 0; u32 min_reserved; path = btrfs_alloc_path(); @@ -1682,13 +1681,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, while (1) { ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, BTRFS_RESERVE_FLUSH_LIMIT); - if (ret) { - err = ret; + if (ret) goto out; - } trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); trans = NULL; goto out; } @@ -1710,10 +1707,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, max_level = level; ret = walk_down_reloc_tree(reloc_root, path, &level); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } if (ret > 0) break; @@ -1724,11 +1719,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, ret = replace_path(trans, rc, root, reloc_root, path, &next_key, level, max_level); } - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } - if (ret > 0) { level = ret; btrfs_node_key_to_cpu(path->nodes[level], &key, @@ -1767,12 +1759,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BTRFS_NESTING_COW); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); - if (ret < 0) - err = ret; out: btrfs_free_path(path); - if (err == 0) + if (ret == 0) insert_dirty_subvol(trans, rc, root); if (trans) @@ -1783,7 +1773,7 @@ out: if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); - return err; + return ret; } static noinline_for_stack -- cgit v1.2.3 From 8df01fddb77998a46b1b59563e1c5d094dc2586a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 24 Nov 2020 17:49:32 +0200 Subject: btrfs: remove err variable from do_relocation It simply gets assigned to 'ret' in case of errors. The flow of the while loop is not changed by this commit since the few call sites that 'goto next' will simply break from the loop. 
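As a side note, the convention that a positive return value means "not found" and gets folded into -ENOENT by the caller can be shown with a small self-contained sketch; find_slot() here is a hypothetical stand-in, not btrfs_search_slot():

#include <errno.h>
#include <stdio.h>

/* Hypothetical lookup: 0 = found, 1 = not found (insert position),
 * negative errno on a real error. */
static int find_slot(int key)
{
	if (key < 0)
		return -EIO;
	return (key % 2) ? 1 : 0;
}

static int lookup(int key)
{
	int ret = find_slot(key);

	/* For this caller "not found" is itself an error, so map it. */
	if (ret > 0)
		ret = -ENOENT;
	return ret;
}

int main(void)
{
	printf("lookup(4)  = %d\n", lookup(4));		/* 0: found */
	printf("lookup(3)  = %d\n", lookup(3));		/* -ENOENT */
	printf("lookup(-1) = %d\n", lookup(-1));	/* -EIO */
	return 0;
}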
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8fc75db901c8..6e7e5fe2e277 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2181,8 +2181,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, u32 blocksize; u64 bytenr; int slot; - int ret; - int err = 0; + int ret = 0; BUG_ON(lowest && node->eb); @@ -2200,10 +2199,8 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (upper->eb && !upper->locked) { if (!lowest) { ret = btrfs_bin_search(upper->eb, key, &slot); - if (ret < 0) { - err = ret; + if (ret < 0) goto next; - } BUG_ON(ret); bytenr = btrfs_node_blockptr(upper->eb, slot); if (node->eb->start == bytenr) @@ -2215,10 +2212,8 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!upper->eb) { ret = btrfs_search_slot(trans, root, key, path, 0, 1); if (ret) { - if (ret < 0) - err = ret; - else - err = -ENOENT; + if (ret > 0) + ret = -ENOENT; btrfs_release_path(path); break; @@ -2238,10 +2233,8 @@ static int do_relocation(struct btrfs_trans_handle *trans, btrfs_release_path(path); } else { ret = btrfs_bin_search(upper->eb, key, &slot); - if (ret < 0) { - err = ret; + if (ret < 0) goto next; - } BUG_ON(ret); } @@ -2252,7 +2245,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", bytenr, node->bytenr, slot, upper->eb->start); - err = -EIO; + ret = -EIO; goto next; } } else { @@ -2263,7 +2256,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, blocksize = root->fs_info->nodesize; eb = btrfs_read_node_slot(upper->eb, slot); if (IS_ERR(eb)) { - err = PTR_ERR(eb); + ret = PTR_ERR(eb); goto next; } btrfs_tree_lock(eb); @@ -2273,10 +2266,8 @@ static int do_relocation(struct btrfs_trans_handle *trans, slot, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); - if (ret < 0) { - err = ret; + if (ret < 0) goto next; - } BUG_ON(node->eb != eb); } else { btrfs_set_node_blockptr(upper->eb, slot, @@ -2302,19 +2293,19 @@ next: btrfs_backref_drop_node_buffer(upper); else btrfs_backref_unlock_node_buffer(upper); - if (err) + if (ret) break; } - if (!err && node->pending) { + if (!ret && node->pending) { btrfs_backref_drop_node_buffer(node); list_move_tail(&node->list, &rc->backref_cache.changed); node->pending = 0; } path->lowest_level = 0; - BUG_ON(err == -ENOSPC); - return err; + BUG_ON(ret == -ENOSPC); + return ret; } static int link_to_upper(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From 8a8f4deaba79ca4cc0ae01f9a3b8f5594810de5e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 24 Nov 2020 16:49:08 +0200 Subject: btrfs: return bool from should_end_transaction Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0e4063651047..10338363754e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -898,12 +898,12 @@ void btrfs_throttle(struct btrfs_fs_info *fs_info) wait_current_trans(fs_info); } -static int should_end_transaction(struct btrfs_trans_handle *trans) +static bool should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = 
trans->fs_info; if (btrfs_check_space_for_delayed_refs(fs_info)) - return 1; + return true; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); } -- cgit v1.2.3 From a2633b6a29e9c916ef1a0389826a2e4ebbe6b259 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 24 Nov 2020 16:49:25 +0200 Subject: btrfs: return bool from btrfs_should_end_transaction Results in slightly smaller code. add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-11 (-11) Function old new delta btrfs_should_end_transaction 96 85 -11 Total: Before=20070, After=20059, chg -0.05% Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 4 ++-- fs/btrfs/transaction.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 10338363754e..238e5a2c0696 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -908,14 +908,14 @@ static bool should_end_transaction(struct btrfs_trans_handle *trans) return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); } -int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) +bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; smp_mb(); if (cur_trans->state >= TRANS_STATE_COMMIT_START || cur_trans->delayed_refs.flushing) - return 1; + return true; return should_end_transaction(trans); } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 8241c050ba71..31ca81bad822 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -218,7 +218,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans); int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, int wait_for_unblock); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); -int btrfs_should_end_transaction(struct btrfs_trans_handle *trans); +bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); -- cgit v1.2.3 From 7b3d5a90cbb9bc6a48c82fd7c146d24d6fceb0fa Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 10 Nov 2020 20:26:06 +0900 Subject: btrfs: introduce ZONED feature flag This patch introduces the ZONED incompat flag. The flag indicates that the volume management will satisfy the constraints imposed by host-managed zoned block devices (aligned chunk allocation, append-only updates, reset zone after filled). As the zoned support will happen incrementally due to enhancing some core infrastructure like super block writes, tree-log, raid support, the feature will appear in sysfs only on debug builds. It will be enabled once the support is feature complete and applications can reliably check whether zoned support is present or not. 
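As a rough sketch of the kind of check an application could perform once the flag is exported, the following userspace probe looks for the per-module feature file; the exact path is an assumption based on the existing /sys/fs/btrfs/features directory, and per the above the file only appears on debug builds for now:

#include <stdio.h>
#include <unistd.h>

/* Returns 1 if the loaded btrfs module advertises the zoned incompat
 * feature, 0 otherwise. The path is assumed, not guaranteed by this patch. */
static int btrfs_module_supports_zoned(void)
{
	return access("/sys/fs/btrfs/features/zoned", F_OK) == 0;
}

int main(void)
{
	printf("zoned support: %s\n",
	       btrfs_module_supports_zoned() ? "yes" : "no");
	return 0;
}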
Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: Damien Le Moal Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 7 +++++++ include/uapi/linux/btrfs.h | 1 + 2 files changed, 8 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 293338153c20..4522a1c4cd08 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -263,6 +263,10 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); +/* Remove once support for zoned allocation is feature complete */ +#ifdef CONFIG_BTRFS_DEBUG +BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); +#endif static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(mixed_backref), @@ -278,6 +282,9 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_FEAT_ATTR_PTR(zoned), +#endif NULL }; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 2c39d15a2beb..5df73001aad4 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -307,6 +307,7 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) #define BTRFS_FEATURE_INCOMPAT_METADATA_UUID (1ULL << 10) #define BTRFS_FEATURE_INCOMPAT_RAID1C34 (1ULL << 11) +#define BTRFS_FEATURE_INCOMPAT_ZONED (1ULL << 12) struct btrfs_ioctl_feature_flags { __u64 compat_flags; -- cgit v1.2.3 From 5b316468983dfa9473ff0f1c42e4e30b4c267141 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 10 Nov 2020 20:26:07 +0900 Subject: btrfs: get zone information of zoned block devices If a zoned block device is found, get its zone information (number of zones and zone size). To avoid costly run-time zone report commands to test the device zones type during block allocation, attach the seq_zones bitmap to the device structure to indicate if a zone is sequential or accept random writes. Also it attaches the empty_zones bitmap to indicate if a zone is empty or not. This patch also introduces the helper function btrfs_dev_is_sequential() to test if the zone storing a block is a sequential write required zone and btrfs_dev_is_empty_zone() to test if the zone is a empty zone. 
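Those helpers amount to a shift plus a bit test on the per-device bitmaps; the following simplified, self-contained sketch mirrors the idea with a fixed-width bitmap and an invented zone_info layout (it is not the kernel structure):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct zone_info {
	uint8_t zone_size_shift;	/* log2 of the zone size in bytes */
	uint64_t seq_zones;		/* one bit per zone, set = sequential */
};

/* Same idea as btrfs_dev_is_sequential(): map a byte offset to a zone
 * number, then test the corresponding bit. */
static bool zone_is_sequential(const struct zone_info *zi, uint64_t pos)
{
	uint64_t zno = pos >> zi->zone_size_shift;

	return (zi->seq_zones >> zno) & 1;
}

int main(void)
{
	/* 256 MiB zones; zones 2 and 3 are sequential write required. */
	struct zone_info zi = { .zone_size_shift = 28, .seq_zones = 0xc };

	printf("%d %d\n",
	       zone_is_sequential(&zi, 0),		/* zone 0 -> 0 */
	       zone_is_sequential(&zi, 3ULL << 28));	/* zone 3 -> 1 */
	return 0;
}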
Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Damien Le Moal Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 1 + fs/btrfs/dev-replace.c | 5 ++ fs/btrfs/super.c | 5 ++ fs/btrfs/volumes.c | 20 +++++- fs/btrfs/volumes.h | 4 ++ fs/btrfs/zoned.c | 168 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 87 +++++++++++++++++++++++++ 7 files changed, 287 insertions(+), 3 deletions(-) create mode 100644 fs/btrfs/zoned.c create mode 100644 fs/btrfs/zoned.h (limited to 'fs/btrfs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index e738f6206ea5..0497fdc37f90 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -16,6 +16,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o +btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index dc3481014f16..71a1bddcbc76 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -21,6 +21,7 @@ #include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" +#include "zoned.h" /* * Device replace overview @@ -313,6 +314,10 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_info->fs_devices; + ret = btrfs_get_dev_zone_info(device); + if (ret) + goto error; + mutex_lock(&fs_info->fs_devices->device_list_mutex); list_add(&device->dev_list, &fs_info->fs_devices->devices); fs_info->fs_devices->num_devices++; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 348f8899f4f4..3c39e363aec5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2513,6 +2513,11 @@ static void __init btrfs_print_mod_info(void) #endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY ", ref-verify=on" +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ", zoned=yes" +#else + ", zoned=no" #endif ; pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7132af058a95..f250bf4dc91f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -31,6 +31,7 @@ #include "space-info.h" #include "block-group.h" #include "discard.h" +#include "zoned.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -374,6 +375,7 @@ void btrfs_free_device(struct btrfs_device *device) rcu_string_free(device->name); extent_io_tree_release(&device->alloc_state); bio_put(device->flush_bio); + btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -667,6 +669,10 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); device->mode = flags; + ret = btrfs_get_dev_zone_info(device); + if (ret != 0) + goto error_free_page; + fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -1137,6 +1143,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) device->bdev = NULL; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + btrfs_destroy_dev_zone_info(device); device->fs_info = NULL; atomic_set(&device->dev_stats_ccnt, 0); @@ -2541,10 +2548,17 @@ int btrfs_init_new_device(struct btrfs_fs_info 
*fs_info, const char *device_path } rcu_assign_pointer(device->name, name); + device->fs_info = fs_info; + device->bdev = bdev; + + ret = btrfs_get_dev_zone_info(device); + if (ret) + goto error_free_device; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto error_free_device; + goto error_free_zone; } q = bdev_get_queue(bdev); @@ -2557,8 +2571,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; - device->fs_info = fs_info; - device->bdev = bdev; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->mode = FMODE_EXCL; @@ -2705,6 +2717,8 @@ error_trans: sb->s_flags |= SB_RDONLY; if (trans) btrfs_end_transaction(trans); +error_free_zone: + btrfs_destroy_dev_zone_info(device); error_free_device: btrfs_free_device(device); error: diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5e08274d1252..1997a4649a66 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -52,6 +52,8 @@ struct btrfs_io_geometry { #define BTRFS_DEV_STATE_FLUSH_SENT (4) #define BTRFS_DEV_STATE_NO_READA (5) +struct btrfs_zoned_device_info; + struct btrfs_device { struct list_head dev_list; /* device_list_mutex */ struct list_head dev_alloc_list; /* chunk mutex */ @@ -65,6 +67,8 @@ struct btrfs_device { struct block_device *bdev; + struct btrfs_zoned_device_info *zone_info; + /* the mode sent to blkdev_get */ fmode_t mode; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c new file mode 100644 index 000000000000..38d7bf45b83f --- /dev/null +++ b/fs/btrfs/zoned.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "ctree.h" +#include "volumes.h" +#include "zoned.h" +#include "rcu-string.h" + +/* Maximum number of zones to report per blkdev_report_zones() call */ +#define BTRFS_REPORT_NR_ZONES 4096 + +static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) +{ + struct blk_zone *zones = data; + + memcpy(&zones[idx], zone, sizeof(*zone)); + + return 0; +} + +static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, + struct blk_zone *zones, unsigned int *nr_zones) +{ + int ret; + + if (!*nr_zones) + return 0; + + ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, + copy_zone_info_cb, zones); + if (ret < 0) { + btrfs_err_in_rcu(device->fs_info, + "zoned: failed to read zone %llu on %s (devid %llu)", + pos, rcu_str_deref(device->name), + device->devid); + return ret; + } + *nr_zones = ret; + if (!ret) + return -EIO; + + return 0; +} + +int btrfs_get_dev_zone_info(struct btrfs_device *device) +{ + struct btrfs_zoned_device_info *zone_info = NULL; + struct block_device *bdev = device->bdev; + sector_t nr_sectors; + sector_t sector = 0; + struct blk_zone *zones = NULL; + unsigned int i, nreported = 0, nr_zones; + unsigned int zone_sectors; + int ret; + + if (!bdev_is_zoned(bdev)) + return 0; + + if (device->zone_info) + return 0; + + zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL); + if (!zone_info) + return -ENOMEM; + + nr_sectors = bdev->bd_part->nr_sects; + zone_sectors = bdev_zone_sectors(bdev); + /* Check if it's power of 2 (see is_power_of_2) */ + ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); + zone_info->zone_size = zone_sectors << SECTOR_SHIFT; + zone_info->zone_size_shift = ilog2(zone_info->zone_size); + zone_info->nr_zones = 
nr_sectors >> ilog2(zone_sectors); + if (!IS_ALIGNED(nr_sectors, zone_sectors)) + zone_info->nr_zones++; + + zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->seq_zones) { + ret = -ENOMEM; + goto out; + } + + zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->empty_zones) { + ret = -ENOMEM; + goto out; + } + + zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); + if (!zones) { + ret = -ENOMEM; + goto out; + } + + /* Get zones type */ + while (sector < nr_sectors) { + nr_zones = BTRFS_REPORT_NR_ZONES; + ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones, + &nr_zones); + if (ret) + goto out; + + for (i = 0; i < nr_zones; i++) { + if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ) + __set_bit(nreported, zone_info->seq_zones); + if (zones[i].cond == BLK_ZONE_COND_EMPTY) + __set_bit(nreported, zone_info->empty_zones); + nreported++; + } + sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; + } + + if (nreported != zone_info->nr_zones) { + btrfs_err_in_rcu(device->fs_info, + "inconsistent number of zones on %s (%u/%u)", + rcu_str_deref(device->name), nreported, + zone_info->nr_zones); + ret = -EIO; + goto out; + } + + kfree(zones); + + device->zone_info = zone_info; + + /* device->fs_info is not safe to use for printing messages */ + btrfs_info_in_rcu(NULL, + "host-%s zoned block device %s, %u zones of %llu bytes", + bdev_zoned_model(bdev) == BLK_ZONED_HM ? "managed" : "aware", + rcu_str_deref(device->name), zone_info->nr_zones, + zone_info->zone_size); + + return 0; + +out: + kfree(zones); + bitmap_free(zone_info->empty_zones); + bitmap_free(zone_info->seq_zones); + kfree(zone_info); + + return ret; +} + +void btrfs_destroy_dev_zone_info(struct btrfs_device *device) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + + if (!zone_info) + return; + + bitmap_free(zone_info->seq_zones); + bitmap_free(zone_info->empty_zones); + kfree(zone_info); + device->zone_info = NULL; +} + +int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone) +{ + unsigned int nr_zones = 1; + int ret; + + ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones); + if (ret != 0 || !nr_zones) + return ret ? ret : -EIO; + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h new file mode 100644 index 000000000000..00c871f09d03 --- /dev/null +++ b/fs/btrfs/zoned.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ZONED_H +#define BTRFS_ZONED_H + +#include + +struct btrfs_zoned_device_info { + /* + * Number of zones, zone size and types of zones if bdev is a + * zoned block device. 
+ */ + u64 zone_size; + u8 zone_size_shift; + u32 nr_zones; + unsigned long *seq_zones; + unsigned long *empty_zones; +}; + +#ifdef CONFIG_BLK_DEV_ZONED +int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone); +int btrfs_get_dev_zone_info(struct btrfs_device *device); +void btrfs_destroy_dev_zone_info(struct btrfs_device *device); +#else /* CONFIG_BLK_DEV_ZONED */ +static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone) +{ + return 0; +} + +static inline int btrfs_get_dev_zone_info(struct btrfs_device *device) +{ + return 0; +} + +static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { } + +#endif + +static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + + if (!zone_info) + return false; + + return test_bit(pos >> zone_info->zone_size_shift, zone_info->seq_zones); +} + +static inline bool btrfs_dev_is_empty_zone(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + + if (!zone_info) + return true; + + return test_bit(pos >> zone_info->zone_size_shift, zone_info->empty_zones); +} + +static inline void btrfs_dev_set_empty_zone_bit(struct btrfs_device *device, + u64 pos, bool set) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + unsigned int zno; + + if (!zone_info) + return; + + zno = pos >> zone_info->zone_size_shift; + if (set) + set_bit(zno, zone_info->empty_zones); + else + clear_bit(zno, zone_info->empty_zones); +} + +static inline void btrfs_dev_set_zone_empty(struct btrfs_device *device, u64 pos) +{ + btrfs_dev_set_empty_zone_bit(device, pos, true); +} + +static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 pos) +{ + btrfs_dev_set_empty_zone_bit(device, pos, false); +} + +#endif -- cgit v1.2.3 From b70f509774ad4b75d4253ad23b65c35d89402026 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 10 Nov 2020 20:26:08 +0900 Subject: btrfs: check and enable ZONED mode Introduce function btrfs_check_zoned_mode() to check if ZONED flag is enabled on the file system and if the file system consists of zoned devices with equal zone size. 
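The equal-zone-size requirement boils down to a "first value wins, later values must match" loop; a minimal userspace sketch of that validation, with invented device values:

#include <errno.h>
#include <stdio.h>

/* Check that every device reports the same zone size, mirroring the
 * loop in btrfs_check_zoned_mode(); the sizes are example data only. */
static int check_equal_zone_sizes(const unsigned long long *sizes, int n)
{
	unsigned long long zone_size = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (!zone_size)
			zone_size = sizes[i];	/* first device sets the value */
		else if (sizes[i] != zone_size)
			return -EINVAL;		/* later devices must match */
	}
	return 0;
}

int main(void)
{
	unsigned long long ok[]  = { 1ULL << 28, 1ULL << 28 };
	unsigned long long bad[] = { 1ULL << 28, 1ULL << 30 };

	printf("%d %d\n", check_equal_zone_sizes(ok, 2),
	       check_equal_zone_sizes(bad, 2));	/* 0 -EINVAL */
	return 0;
}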
Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Damien Le Moal Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 14 +++++++++ fs/btrfs/dev-replace.c | 7 +++++ fs/btrfs/disk-io.c | 10 +++++++ fs/btrfs/super.c | 1 + fs/btrfs/volumes.c | 5 ++++ fs/btrfs/zoned.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 26 ++++++++++++++++ 7 files changed, 143 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c0c6e79c43f9..8f20219e2caa 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -956,6 +956,15 @@ struct btrfs_fs_info { /* Type of exclusive operation running */ unsigned long exclusive_operation; + /* + * Zone size > 0 when in ZONED mode, otherwise it's used for a check + * if the mode is enabled + */ + union { + u64 zone_size; + u64 zoned; + }; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -3677,4 +3686,9 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) } #endif +static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) +{ + return fs_info->zoned != 0; +} + #endif diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 71a1bddcbc76..a98e33f232d5 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -260,6 +260,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return PTR_ERR(bdev); } + if (!btrfs_check_device_zone_type(fs_info, bdev)) { + btrfs_err(fs_info, + "dev-replace: zoned type of target device mismatch with filesystem"); + ret = -EINVAL; + goto error; + } + sync_blockdev(bdev); list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 37b2e0aad162..32e29e2bc99a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -42,6 +42,7 @@ #include "block-group.h" #include "discard.h" #include "space-info.h" +#include "zoned.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -3046,6 +3047,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents"); + fs_info->zoned = (features & BTRFS_FEATURE_INCOMPAT_ZONED); + /* * flag our filesystem as having big metadata blocks if * they are bigger than the page size @@ -3202,6 +3205,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_block_groups; } + ret = btrfs_check_zoned_mode(fs_info); + if (ret) { + btrfs_err(fs_info, "failed to initialize zoned mode: %d", + ret); + goto fail_block_groups; + } + ret = btrfs_sysfs_add_fsid(fs_devices); if (ret) { btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3c39e363aec5..b754205f21d0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -44,6 +44,7 @@ #include "backref.h" #include "space-info.h" #include "sysfs.h" +#include "zoned.h" #include "tests/btrfs-tests.h" #include "block-group.h" #include "discard.h" diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f250bf4dc91f..cf3f4975107f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2515,6 +2515,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (IS_ERR(bdev)) return PTR_ERR(bdev); + if (!btrfs_check_device_zone_type(fs_info, bdev)) { + ret = -EINVAL; + goto error; + } + if 
(fs_devices->seeding) { seeding_dev = 1; down_write(&sb->s_umount); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 38d7bf45b83f..56bb701e3b53 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -166,3 +166,83 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, return 0; } + +int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 zoned_devices = 0; + u64 nr_devices = 0; + u64 zone_size = 0; + const bool incompat_zoned = btrfs_is_zoned(fs_info); + int ret = 0; + + /* Count zoned devices */ + list_for_each_entry(device, &fs_devices->devices, dev_list) { + enum blk_zoned_model model; + + if (!device->bdev) + continue; + + model = bdev_zoned_model(device->bdev); + if (model == BLK_ZONED_HM || + (model == BLK_ZONED_HA && incompat_zoned)) { + zoned_devices++; + if (!zone_size) { + zone_size = device->zone_info->zone_size; + } else if (device->zone_info->zone_size != zone_size) { + btrfs_err(fs_info, + "zoned: unequal block device zone sizes: have %llu found %llu", + device->zone_info->zone_size, + zone_size); + ret = -EINVAL; + goto out; + } + } + nr_devices++; + } + + if (!zoned_devices && !incompat_zoned) + goto out; + + if (!zoned_devices && incompat_zoned) { + /* No zoned block device found on ZONED filesystem */ + btrfs_err(fs_info, + "zoned: no zoned devices found on a zoned filesystem"); + ret = -EINVAL; + goto out; + } + + if (zoned_devices && !incompat_zoned) { + btrfs_err(fs_info, + "zoned: mode not enabled but zoned device found"); + ret = -EINVAL; + goto out; + } + + if (zoned_devices != nr_devices) { + btrfs_err(fs_info, + "zoned: cannot mix zoned and regular devices"); + ret = -EINVAL; + goto out; + } + + /* + * stripe_size is always aligned to BTRFS_STRIPE_LEN in + * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size, + * check the alignment here. 
+ */ + if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) { + btrfs_err(fs_info, + "zoned: zone size %llu not aligned to stripe %u", + zone_size, BTRFS_STRIPE_LEN); + ret = -EINVAL; + goto out; + } + + fs_info->zone_size = zone_size; + + btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); +out: + return ret; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 00c871f09d03..e4ee1a50be2d 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -4,6 +4,7 @@ #define BTRFS_ZONED_H #include +#include struct btrfs_zoned_device_info { /* @@ -22,6 +23,7 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone); int btrfs_get_dev_zone_info(struct btrfs_device *device); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); +int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -36,6 +38,15 @@ static inline int btrfs_get_dev_zone_info(struct btrfs_device *device) static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { } +static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info) +{ + if (!btrfs_is_zoned(fs_info)) + return 0; + + btrfs_err(fs_info, "zoned block devices support is not enabled"); + return -EOPNOTSUPP; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) @@ -84,4 +95,19 @@ static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 p btrfs_dev_set_empty_zone_bit(device, pos, false); } +static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info, + struct block_device *bdev) +{ + u64 zone_size; + + if (btrfs_is_zoned(fs_info)) { + zone_size = bdev_zone_sectors(bdev) << SECTOR_SHIFT; + /* Do not allow non-zoned device */ + return bdev_is_zoned(bdev) && fs_info->zone_size == zone_size; + } + + /* Do not allow Host Manged zoned device */ + return bdev_zoned_model(bdev) != BLK_ZONED_HM; +} + #endif -- cgit v1.2.3 From 862931c76327e54d49c30d80c333f552dca18489 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 10 Nov 2020 20:26:09 +0900 Subject: btrfs: introduce max_zone_append_size The zone append write command has a maximum IO size restriction it accepts. This is because a zone append write command cannot be split, as we ask the device to place the data into a specific target zone and the device responds with the actual written location of the data. Introduce max_zone_append_size to zone_info and fs_info to track the value, so we can limit all I/O to a zoned block device that we want to write using the zone append command to the device's limits. 
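Combining the per-device limits into the filesystem-wide value amounts to keeping the smallest non-zero one; a self-contained sketch of that reduction follows (the device values are invented, and this is not the code added later in the series):

#include <stdio.h>

/* Reduce per-device limits to one filesystem-wide limit by keeping the
 * smallest non-zero value, matching the update rule used above. */
static unsigned long long fs_max_zone_append(const unsigned long long *dev, int n)
{
	unsigned long long max = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (!max || (dev[i] && dev[i] < max))
			max = dev[i];
	}
	return max;
}

int main(void)
{
	unsigned long long devs[] = { 1ULL << 20, 512ULL << 10, 0 };

	printf("%llu\n", fs_max_zone_append(devs, 3));	/* 524288 */
	return 0;
}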
Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/zoned.c | 17 +++++++++++++++-- fs/btrfs/zoned.h | 1 + 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8f20219e2caa..7fe74f6a2486 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -965,6 +965,9 @@ struct btrfs_fs_info { u64 zoned; }; + /* Max size to emit ZONE_APPEND write command */ + u64 max_zone_append_size; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 56bb701e3b53..f984cf9063ee 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -47,6 +47,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) { struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; + struct request_queue *queue = bdev_get_queue(bdev); sector_t nr_sectors; sector_t sector = 0; struct blk_zone *zones = NULL; @@ -70,6 +71,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; zone_info->zone_size_shift = ilog2(zone_info->zone_size); + zone_info->max_zone_append_size = + (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT; zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; @@ -174,6 +177,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) u64 zoned_devices = 0; u64 nr_devices = 0; u64 zone_size = 0; + u64 max_zone_append_size = 0; const bool incompat_zoned = btrfs_is_zoned(fs_info); int ret = 0; @@ -187,10 +191,13 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) model = bdev_zoned_model(device->bdev); if (model == BLK_ZONED_HM || (model == BLK_ZONED_HA && incompat_zoned)) { + struct btrfs_zoned_device_info *zone_info; + + zone_info = device->zone_info; zoned_devices++; if (!zone_size) { - zone_size = device->zone_info->zone_size; - } else if (device->zone_info->zone_size != zone_size) { + zone_size = zone_info->zone_size; + } else if (zone_info->zone_size != zone_size) { btrfs_err(fs_info, "zoned: unequal block device zone sizes: have %llu found %llu", device->zone_info->zone_size, @@ -198,6 +205,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) ret = -EINVAL; goto out; } + if (!max_zone_append_size || + (zone_info->max_zone_append_size && + zone_info->max_zone_append_size < max_zone_append_size)) + max_zone_append_size = + zone_info->max_zone_append_size; } nr_devices++; } @@ -241,6 +253,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) } fs_info->zone_size = zone_size; + fs_info->max_zone_append_size = max_zone_append_size; btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); out: diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index e4ee1a50be2d..bb2afc7320a4 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -13,6 +13,7 @@ struct btrfs_zoned_device_info { */ u64 zone_size; u8 zone_size_shift; + u64 max_zone_append_size; u32 nr_zones; unsigned long *seq_zones; unsigned long *empty_zones; -- cgit v1.2.3 From 5d1ab66c56fed152acbbac1933b16d33ebd47d7f Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 10 Nov 2020 20:26:10 +0900 Subject: btrfs: disallow space_cache in ZONED mode As updates to the space cache v1 are in-place, the space cache cannot be located over 
sequential zones and there is no guarantee that the device will have enough conventional zones to store this cache. Resolve this problem by completely disabling space cache v1. This does not introduce any problems with sequential block groups: all the free space is located after the allocation pointer and there is no free space before the pointer, so there is no need for such a cache. Note: we could technically use the free-space-tree (space cache v2) in ZONED mode. But since ZONED mode now always allocates extents in a block group sequentially regardless of the underlying device zone type, there is no point in enabling and maintaining the tree. For the same reason, NODATACOW is also disabled. In summary, ZONED will disable: | Disabled features | Reason | |-------------------+-----------------------------------------------------| | RAID/DUP | Cannot handle two zone append writes to different | | | zones | |-------------------+-----------------------------------------------------| | space_cache (v1) | In-place updating | | NODATACOW | In-place updating | |-------------------+-----------------------------------------------------| | fallocate | Reserved extent will be a write hole | |-------------------+-----------------------------------------------------| | MIXED_BG | Allocated metadata region will be write holes for | | | data writes | Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 13 +++++++++++-- fs/btrfs/zoned.c | 17 +++++++++++++++++ fs/btrfs/zoned.h | 6 ++++++ 3 files changed, 34 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b754205f21d0..7b2970c90ebe 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -565,8 +565,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, cache_gen = btrfs_super_cache_generation(info->super_copy); if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); - else if (cache_gen) - btrfs_set_opt(info->mount_opt, SPACE_CACHE); + else if (cache_gen) { + if (btrfs_is_zoned(info)) { + btrfs_info(info, + "zoned: clearing existing space cache"); + btrfs_set_super_cache_generation(info->super_copy, 0); + } else { + btrfs_set_opt(info->mount_opt, SPACE_CACHE); + } + } /* * Even the options are empty, we still need to do extra check @@ -1025,6 +1032,8 @@ out: ret = -EINVAL; } + if (!ret) + ret = btrfs_check_mountopts_zoned(info); if (!ret && btrfs_test_opt(info, SPACE_CACHE)) btrfs_info(info, "disk space caching is enabled"); if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE)) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index f984cf9063ee..3bc6a5e3a61c 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -259,3 +259,20 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) out: return ret; } + +int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) +{ + if (!btrfs_is_zoned(info)) + return 0; + + /* + * Space cache writing is not COWed. Disable that to avoid write errors + * in sequential zones.
+ */ + if (btrfs_test_opt(info, SPACE_CACHE)) { + btrfs_err(info, "zoned: space cache v1 is not supported"); + return -EINVAL; + } + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index bb2afc7320a4..af01070e185b 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -25,6 +25,7 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, int btrfs_get_dev_zone_info(struct btrfs_device *device); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); +int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -48,6 +49,11 @@ static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info) return -EOPNOTSUPP; } +static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) +{ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) -- cgit v1.2.3 From d206e9c9c576a0de2e6d1fdf17551e2a548955c0 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 10 Nov 2020 20:26:11 +0900 Subject: btrfs: disallow NODATACOW in ZONED mode NODATACOW implies overwriting the file data on a device, which is impossible in sequential required zones. Disable NODATACOW globally with mount option and per-file NODATACOW attribute by masking FS_NOCOW_FL. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 13 +++++++++++++ fs/btrfs/zoned.c | 5 +++++ 2 files changed, 18 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a5dc7cc5d705..0b5e1df24906 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -193,6 +193,15 @@ static int check_fsflags(unsigned int old_flags, unsigned int flags) return 0; } +static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, + unsigned int flags) +{ + if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL)) + return -EPERM; + + return 0; +} + static int btrfs_ioctl_setflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); @@ -230,6 +239,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (ret) goto out_unlock; + ret = check_fsflags_compatible(fs_info, fsflags); + if (ret) + goto out_unlock; + binode_flags = binode->flags; if (fsflags & FS_SYNC_FL) binode_flags |= BTRFS_INODE_SYNC; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 3bc6a5e3a61c..6ef97996e185 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -274,5 +274,10 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) return -EINVAL; } + if (btrfs_test_opt(info, NODATACOW)) { + btrfs_err(info, "zoned: NODATACOW not supported"); + return -EINVAL; + } + return 0; } -- cgit v1.2.3 From f1569c4c10a1e9320b92486d73043c6138859cc5 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 10 Nov 2020 20:26:12 +0900 Subject: btrfs: disable fallocate in ZONED mode fallocate() is implemented by reserving actual extent instead of reservations. This can result in exposing the sequential write constraint of host-managed zoned block devices to the application, which would break the POSIX semantic for the fallocated file. To avoid this, report fallocate() as not supported when in ZONED mode for now. 
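For reference, a small userspace sketch (an assumption about the observable behaviour, not part of the patch) of what an application would see: fallocate() on a file stored on a ZONED btrfs is expected to fail with EOPNOTSUPP instead of preallocating extents. The mount point path is made up.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/zoned-btrfs/testfile", O_CREAT | O_RDWR, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Ask for 1 MiB of preallocated space at offset 0 */
	if (fallocate(fd, 0, 0, 1024 * 1024) < 0)
		printf("fallocate failed: %s\n", strerror(errno));
	else
		printf("fallocate succeeded\n");

	close(fd);
	return 0;
}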
In the future, we may be able to implement "in-memory" fallocate() in ZONED mode by utilizing space_info->bytes_may_use or similar, so this returns EOPNOTSUPP. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d808928dded6..0e41459b8de6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3308,6 +3308,10 @@ static long btrfs_fallocate(struct file *file, int mode, int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); int ret; + /* Do not allow fallocate in ZONED mode */ + if (btrfs_is_zoned(btrfs_sb(inode->i_sb))) + return -EOPNOTSUPP; + alloc_start = round_down(offset, blocksize); alloc_end = round_up(offset + len, blocksize); cur_offset = alloc_start; -- cgit v1.2.3 From a589dde0bc0bf5616e92131d803b6046573449e6 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 10 Nov 2020 20:26:13 +0900 Subject: btrfs: disallow mixed-bg in ZONED mode Placing both data and metadata in a block group is impossible in ZONED mode. For data, we can allocate space for it and write it immediately after the allocation. For metadata, however, we cannot do that, because the logical addresses are recorded in other metadata buffers to build up the trees. As a result, a data buffer can be placed after a metadata buffer, which has not been written yet. Writing out the data buffer will break the sequential write rule. Check and disallow MIXED_BG with ZONED mode. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 6ef97996e185..7814f6b3ff68 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -252,6 +252,12 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) goto out; } + if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { + btrfs_err(fs_info, "zoned: mixed block groups not supported"); + ret = -EINVAL; + goto out; + } + fs_info->zone_size = zone_size; fs_info->max_zone_append_size = max_zone_append_size; -- cgit v1.2.3 From 12659251ca5df05a484eb122c2c34c18d84e797c Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 10 Nov 2020 20:26:14 +0900 Subject: btrfs: implement log-structured superblock for ZONED mode Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place the superblock in such a zone. One easy solution is to limit the superblock and its copies to conventional zones only. However, this method has two downsides. The first is a reduced number of superblock copies: the location of the second copy of the superblock is 256GB, which falls in a sequential write required zone on typical devices on the market today, so the number of superblock copies would be limited to two. The second downside is that we could not support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer for writing updated superblocks. Once the first zone is filled up, writing moves to the second one. Then, when both zones are filled up, the first zone is reset before it is written to again, as sketched below.
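A toy model (a simplified sketch, not the kernel implementation) of that zone rotation: each zone holds a fixed number of superblock slots, writes fill zone 0, then zone 1, and the first zone is reused once both are full; the latest superblock is the slot just behind the current write position. The slot count is an arbitrary illustration value.

#include <stdio.h>

#define SLOTS_PER_ZONE 4	/* made-up capacity per zone */

struct sb_log {
	int zone;		/* zone currently being written: 0 or 1 */
	int slot;		/* next free slot in that zone */
	long generation;	/* superblock generation counter */
};

static void sb_log_append(struct sb_log *log)
{
	if (log->slot == SLOTS_PER_ZONE) {
		/*
		 * Current zone is full: switch to the other zone. In the real
		 * scheme that zone is reset first if it was used before.
		 */
		log->zone ^= 1;
		log->slot = 0;
	}
	log->generation++;
	printf("gen %ld -> zone %d slot %d\n",
	       log->generation, log->zone, log->slot);
	log->slot++;
}

int main(void)
{
	struct sb_log log = { 0, 0, 0 };

	for (int i = 0; i < 10; i++)
		sb_log_append(&log);

	/* The most recent superblock sits just behind the write position */
	printf("latest superblock: zone %d slot %d (gen %ld)\n",
	       log.zone, log.slot - 1, log.generation);
	return 0;
}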
We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - The primary superblock: zones 0 and 1 - The first copy: zones 16 and 17 - The second copy: zones 1024 or zone at 256GB which is minimum, and next to it If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 9 ++ fs/btrfs/disk-io.c | 42 +++++-- fs/btrfs/scrub.c | 3 + fs/btrfs/volumes.c | 20 ++- fs/btrfs/zoned.c | 327 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 40 ++++++ 6 files changed, 429 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ccc3271c20ca..9c3523b6e016 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1679,6 +1679,7 @@ out: static int exclude_super_stripes(struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; + const bool zoned = btrfs_is_zoned(fs_info); u64 bytenr; u64 *logical; int stripe_len; @@ -1700,6 +1701,14 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) if (ret) return ret; + /* Shouldn't have super stripes in sequential zones */ + if (zoned && nr) { + btrfs_err(fs_info, + "zoned: block group %llu must not contain super block", + cache->start); + return -EUCLEAN; + } + while (nr--) { u64 len = min_t(u64, stripe_len, cache->start + cache->length - logical[nr]); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 32e29e2bc99a..aa92c0de0cd6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3488,10 +3488,17 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, { struct btrfs_super_block *super; struct page *page; - u64 bytenr; + u64 bytenr, bytenr_orig; struct address_space *mapping = bdev->bd_inode->i_mapping; + int ret; + + bytenr_orig = btrfs_sb_offset(copy_num); + ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); + if (ret == -ENOENT) + return ERR_PTR(-EINVAL); + else if (ret) + return ERR_PTR(ret); - bytenr = btrfs_sb_offset(copy_num); if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) return ERR_PTR(-EINVAL); @@ -3505,7 +3512,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, return ERR_PTR(-ENODATA); } - if (btrfs_super_bytenr(super) != bytenr) { + if (btrfs_super_bytenr(super) != bytenr_orig) { btrfs_release_disk_super(super); return ERR_PTR(-EINVAL); } @@ -3560,7 +3567,8 @@ static int write_dev_supers(struct btrfs_device *device, SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); int i; int errors = 0; - u64 bytenr; + int ret; + u64 bytenr, bytenr_orig; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; @@ -3572,12 +3580,22 @@ static int write_dev_supers(struct btrfs_device *device, struct bio *bio; struct btrfs_super_block *disk_super; - bytenr = btrfs_sb_offset(i); + bytenr_orig = btrfs_sb_offset(i); + ret = btrfs_sb_log_location(device, i, WRITE, &bytenr); + if (ret == -ENOENT) { + continue; + } else if (ret < 0) { + btrfs_err(device->fs_info, + "couldn't get super block location for mirror %d", + i); + errors++; + continue; + } if (bytenr + BTRFS_SUPER_INFO_SIZE >= 
device->commit_total_bytes) break; - btrfs_set_super_bytenr(sb, bytenr); + btrfs_set_super_bytenr(sb, bytenr_orig); crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, @@ -3622,6 +3640,7 @@ static int write_dev_supers(struct btrfs_device *device, bio->bi_opf |= REQ_FUA; btrfsic_submit_bio(bio); + btrfs_advance_sb_log(device, i); } return errors < i ? 0 : -1; } @@ -3638,6 +3657,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) int i; int errors = 0; bool primary_failed = false; + int ret; u64 bytenr; if (max_mirrors == 0) @@ -3646,7 +3666,15 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) for (i = 0; i < max_mirrors; i++) { struct page *page; - bytenr = btrfs_sb_offset(i); + ret = btrfs_sb_log_location(device, i, READ, &bytenr); + if (ret == -ENOENT) { + break; + } else if (ret < 0) { + errors++; + if (i == 0) + primary_failed = true; + continue; + } if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) break; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index cb50e4cb97d5..78759bc9c980 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -20,6 +20,7 @@ #include "rcu-string.h" #include "raid56.h" #include "block-group.h" +#include "zoned.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -3732,6 +3733,8 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->commit_total_bytes) break; + if (!btrfs_check_super_location(scrub_dev, bytenr)) + continue; ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cf3f4975107f..fd9263402b8a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1276,7 +1276,7 @@ void btrfs_release_disk_super(struct btrfs_super_block *super) } static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, - u64 bytenr) + u64 bytenr, u64 bytenr_orig) { struct btrfs_super_block *disk_super; struct page *page; @@ -1307,7 +1307,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev /* align our pointer to the offset of the super block */ disk_super = p + offset_in_page(bytenr); - if (btrfs_super_bytenr(disk_super) != bytenr || + if (btrfs_super_bytenr(disk_super) != bytenr_orig || btrfs_super_magic(disk_super) != BTRFS_MAGIC) { btrfs_release_disk_super(p); return ERR_PTR(-EINVAL); @@ -1342,7 +1342,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct block_device *bdev; - u64 bytenr; + u64 bytenr, bytenr_orig; + int ret; lockdep_assert_held(&uuid_mutex); @@ -1352,14 +1353,18 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, * So, we need to add a special mount option to scan for * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ - bytenr = btrfs_sb_offset(0); flags |= FMODE_EXCL; bdev = blkdev_get_by_path(path, flags, holder); if (IS_ERR(bdev)) return ERR_CAST(bdev); - disk_super = btrfs_read_disk_super(bdev, bytenr); + bytenr_orig = btrfs_sb_offset(0); + ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); + if (ret) + return ERR_PTR(ret); + + disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; @@ -2023,6 +2028,11 @@ void btrfs_scratch_superblocks(struct 
btrfs_fs_info *fs_info, if (IS_ERR(disk_super)) continue; + if (bdev_is_zoned(bdev)) { + btrfs_reset_sb_log_zones(bdev, copy_num); + continue; + } + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); page = virt_to_page(disk_super); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 7814f6b3ff68..155545180046 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -10,6 +10,9 @@ /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 +/* Number of superblock log zones */ +#define BTRFS_NR_SB_LOG_ZONES 2 + static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct blk_zone *zones = data; @@ -19,6 +22,103 @@ static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data return 0; } +static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, + u64 *wp_ret) +{ + bool empty[BTRFS_NR_SB_LOG_ZONES]; + bool full[BTRFS_NR_SB_LOG_ZONES]; + sector_t sector; + + ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL && + zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL); + + empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY); + empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY); + full[0] = (zones[0].cond == BLK_ZONE_COND_FULL); + full[1] = (zones[1].cond == BLK_ZONE_COND_FULL); + + /* + * Possible states of log buffer zones + * + * Empty[0] In use[0] Full[0] + * Empty[1] * x 0 + * In use[1] 0 x 0 + * Full[1] 1 1 C + * + * Log position: + * *: Special case, no superblock is written + * 0: Use write pointer of zones[0] + * 1: Use write pointer of zones[1] + * C: Compare super blcoks from zones[0] and zones[1], use the latest + * one determined by generation + * x: Invalid state + */ + + if (empty[0] && empty[1]) { + /* Special case to distinguish no superblock to read */ + *wp_ret = zones[0].start << SECTOR_SHIFT; + return -ENOENT; + } else if (full[0] && full[1]) { + /* Compare two super blocks */ + struct address_space *mapping = bdev->bd_inode->i_mapping; + struct page *page[BTRFS_NR_SB_LOG_ZONES]; + struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES]; + int i; + + for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + u64 bytenr; + + bytenr = ((zones[i].start + zones[i].len) + << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE; + + page[i] = read_cache_page_gfp(mapping, + bytenr >> PAGE_SHIFT, GFP_NOFS); + if (IS_ERR(page[i])) { + if (i == 1) + btrfs_release_disk_super(super[0]); + return PTR_ERR(page[i]); + } + super[i] = page_address(page[i]); + } + + if (super[0]->generation > super[1]->generation) + sector = zones[1].start; + else + sector = zones[0].start; + + for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) + btrfs_release_disk_super(super[i]); + } else if (!full[0] && (empty[1] || full[1])) { + sector = zones[0].wp; + } else if (full[0]) { + sector = zones[1].wp; + } else { + return -EUCLEAN; + } + *wp_ret = sector << SECTOR_SHIFT; + return 0; +} + +/* + * The following zones are reserved as the circular buffer on ZONED btrfs. 
+ * - The primary superblock: zones 0 and 1 + * - The first copy: zones 16 and 17 + * - The second copy: zones 1024 or zone at 256GB which is minimum, and + * the following one + */ +static inline u32 sb_zone_number(int shift, int mirror) +{ + ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); + + switch (mirror) { + case 0: return 0; + case 1: return 16; + case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024); + } + + return 0; +} + static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { @@ -122,6 +222,52 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) goto out; } + /* Validate superblock log */ + nr_zones = BTRFS_NR_SB_LOG_ZONES; + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + u32 sb_zone; + u64 sb_wp; + int sb_pos = BTRFS_NR_SB_LOG_ZONES * i; + + sb_zone = sb_zone_number(zone_info->zone_size_shift, i); + if (sb_zone + 1 >= zone_info->nr_zones) + continue; + + sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT); + ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, + &zone_info->sb_zones[sb_pos], + &nr_zones); + if (ret) + goto out; + + if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { + btrfs_err_in_rcu(device->fs_info, + "zoned: failed to read super block log zone info at devid %llu zone %u", + device->devid, sb_zone); + ret = -EUCLEAN; + goto out; + } + + /* + * If zones[0] is conventional, always use the beggining of the + * zone to record superblock. No need to validate in that case. + */ + if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type == + BLK_ZONE_TYPE_CONVENTIONAL) + continue; + + ret = sb_write_pointer(device->bdev, + &zone_info->sb_zones[sb_pos], &sb_wp); + if (ret != -ENOENT && ret) { + btrfs_err_in_rcu(device->fs_info, + "zoned: super block log zone corrupted devid %llu zone %u", + device->devid, sb_zone); + ret = -EUCLEAN; + goto out; + } + } + + kfree(zones); device->zone_info = zone_info; @@ -287,3 +433,184 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) return 0; } + +static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, + int rw, u64 *bytenr_ret) +{ + u64 wp; + int ret; + + if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) { + *bytenr_ret = zones[0].start << SECTOR_SHIFT; + return 0; + } + + ret = sb_write_pointer(bdev, zones, &wp); + if (ret != -ENOENT && ret < 0) + return ret; + + if (rw == WRITE) { + struct blk_zone *reset = NULL; + + if (wp == zones[0].start << SECTOR_SHIFT) + reset = &zones[0]; + else if (wp == zones[1].start << SECTOR_SHIFT) + reset = &zones[1]; + + if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { + ASSERT(reset->cond == BLK_ZONE_COND_FULL); + + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + reset->start, reset->len, + GFP_NOFS); + if (ret) + return ret; + + reset->cond = BLK_ZONE_COND_EMPTY; + reset->wp = reset->start; + } + } else if (ret != -ENOENT) { + /* For READ, we want the precious one */ + if (wp == zones[0].start << SECTOR_SHIFT) + wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT; + wp -= BTRFS_SUPER_INFO_SIZE; + } + + *bytenr_ret = wp; + return 0; + +} + +int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, + u64 *bytenr_ret) +{ + struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES]; + unsigned int zone_sectors; + u32 sb_zone; + int ret; + u64 zone_size; + u8 zone_sectors_shift; + sector_t nr_sectors; + u32 nr_zones; + + if (!bdev_is_zoned(bdev)) { + *bytenr_ret = btrfs_sb_offset(mirror); + return 0; + } + + ASSERT(rw == READ || rw == WRITE); + + zone_sectors = 
bdev_zone_sectors(bdev); + if (!is_power_of_2(zone_sectors)) + return -EINVAL; + zone_size = zone_sectors << SECTOR_SHIFT; + zone_sectors_shift = ilog2(zone_sectors); + nr_sectors = bdev->bd_part->nr_sects; + nr_zones = nr_sectors >> zone_sectors_shift; + + sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); + if (sb_zone + 1 >= nr_zones) + return -ENOENT; + + ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift, + BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb, + zones); + if (ret < 0) + return ret; + if (ret != BTRFS_NR_SB_LOG_ZONES) + return -EIO; + + return sb_log_location(bdev, zones, rw, bytenr_ret); +} + +int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, + u64 *bytenr_ret) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + u32 zone_num; + + if (!zinfo) { + *bytenr_ret = btrfs_sb_offset(mirror); + return 0; + } + + zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); + if (zone_num + 1 >= zinfo->nr_zones) + return -ENOENT; + + return sb_log_location(device->bdev, + &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror], + rw, bytenr_ret); +} + +static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo, + int mirror) +{ + u32 zone_num; + + if (!zinfo) + return false; + + zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); + if (zone_num + 1 >= zinfo->nr_zones) + return false; + + if (!test_bit(zone_num, zinfo->seq_zones)) + return false; + + return true; +} + +void btrfs_advance_sb_log(struct btrfs_device *device, int mirror) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + struct blk_zone *zone; + + if (!is_sb_log_zone(zinfo, mirror)) + return; + + zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror]; + if (zone->cond != BLK_ZONE_COND_FULL) { + if (zone->cond == BLK_ZONE_COND_EMPTY) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT); + + if (zone->wp == zone->start + zone->len) + zone->cond = BLK_ZONE_COND_FULL; + + return; + } + + zone++; + ASSERT(zone->cond != BLK_ZONE_COND_FULL); + if (zone->cond == BLK_ZONE_COND_EMPTY) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT); + + if (zone->wp == zone->start + zone->len) + zone->cond = BLK_ZONE_COND_FULL; +} + +int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) +{ + sector_t zone_sectors; + sector_t nr_sectors; + u8 zone_sectors_shift; + u32 sb_zone; + u32 nr_zones; + + zone_sectors = bdev_zone_sectors(bdev); + zone_sectors_shift = ilog2(zone_sectors); + nr_sectors = bdev->bd_part->nr_sects; + nr_zones = nr_sectors >> zone_sectors_shift; + + sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); + if (sb_zone + 1 >= nr_zones) + return -ENOENT; + + return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + sb_zone << zone_sectors_shift, + zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index af01070e185b..8abe2f83272b 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -5,6 +5,8 @@ #include #include +#include "volumes.h" +#include "disk-io.h" struct btrfs_zoned_device_info { /* @@ -17,6 +19,7 @@ struct btrfs_zoned_device_info { u32 nr_zones; unsigned long *seq_zones; unsigned long *empty_zones; + struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; }; #ifdef CONFIG_BLK_DEV_ZONED @@ -26,6 +29,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); int btrfs_check_zoned_mode(struct btrfs_fs_info 
*fs_info); int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); +int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, + u64 *bytenr_ret); +int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, + u64 *bytenr_ret); +void btrfs_advance_sb_log(struct btrfs_device *device, int mirror); +int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -54,6 +63,28 @@ static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) return 0; } +static inline int btrfs_sb_log_location_bdev(struct block_device *bdev, + int mirror, int rw, u64 *bytenr_ret) +{ + *bytenr_ret = btrfs_sb_offset(mirror); + return 0; +} + +static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror, + int rw, u64 *bytenr_ret) +{ + *bytenr_ret = btrfs_sb_offset(mirror); + return 0; +} + +static inline void btrfs_advance_sb_log(struct btrfs_device *device, int mirror) +{ } + +static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) +{ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) @@ -117,4 +148,13 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i return bdev_zoned_model(bdev) != BLK_ZONED_HM; } +static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos) +{ + /* + * On a non-zoned device, any address is OK. On a zoned device, + * non-SEQUENTIAL WRITE REQUIRED zones are capable. + */ + return device->zone_info == NULL || !btrfs_dev_is_sequential(device, pos); +} + #endif -- cgit v1.2.3 From 1201b58b67b3642fd8cafa3604402bee40df1a6d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 26 Nov 2020 15:41:27 +0100 Subject: btrfs: drop casts of bio bi_sector Since commit 72deb455b5ec ("block: remove CONFIG_LBDAF") (5.2) the sector_t type is u64 on all arches and configs so we don't need to typecast it. It used to be unsigned long and the result of sector size shifts were not guaranteed to fit in the type. 
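A short standalone demonstration (independent of the patch) of why the cast used to matter: when sector_t could be 32 bits wide, the shift was evaluated in the narrow type first, so large sector numbers wrapped before the result was stored in a u64; with a 64-bit sector_t the cast is redundant.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t sector = 0x00800000;	/* 8M sectors, i.e. a 4 GiB byte offset */

	uint64_t truncated = sector << 9;	  /* shifted as 32-bit: wraps to 0 */
	uint64_t correct = (uint64_t)sector << 9; /* widened first, then shifted */

	printf("without cast: %llu\n", (unsigned long long)truncated);
	printf("with cast:    %llu\n", (unsigned long long)correct);
	return 0;
}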
Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 3 +-- fs/btrfs/compression.c | 4 ++-- fs/btrfs/extent_io.c | 2 +- fs/btrfs/file-item.c | 6 +++--- fs/btrfs/inode.c | 7 +++---- fs/btrfs/raid56.c | 8 ++++---- fs/btrfs/volumes.c | 4 ++-- 7 files changed, 16 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index e90e92e4d0b8..6ff44e53814c 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2692,8 +2692,7 @@ static void __btrfsic_submit_bio(struct bio *bio) BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_disk=%p)\n", bio_op(bio), bio->bi_opf, segs, - (unsigned long long)bio->bi_iter.bi_sector, - dev_bytenr, bio->bi_disk); + bio->bi_iter.bi_sector, dev_bytenr, bio->bi_disk); mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 4e022ed72d2f..12d50f1cdc58 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -218,7 +218,7 @@ static void end_compressed_bio_read(struct bio *bio) inode = cb->inode; ret = check_compressed_csum(BTRFS_I(inode), bio, - (u64)bio->bi_iter.bi_sector << 9); + bio->bi_iter.bi_sector << 9); if (ret) goto csum_failed; @@ -620,7 +620,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, unsigned long pg_index; struct page *page; struct bio *comp_bio; - u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9; + u64 cur_disk_byte = bio->bi_iter.bi_sector << 9; u64 em_len; u64 em_start; struct extent_map *em; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f514304b23bf..569d50ccf78a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2886,7 +2886,7 @@ static void end_bio_extent_readpage(struct bio *bio) btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", - (u64)bio->bi_iter.bi_sector, bio->bi_status, + bio->bi_iter.bi_sector, bio->bi_status, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; failure_tree = &BTRFS_I(inode)->io_failure_tree; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6086e4cff203..8fa98d55fcfd 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -315,7 +315,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, path->skip_locking = 1; } - disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; + disk_bytenr = bio->bi_iter.bi_sector << 9; bio_for_each_segment(bvec, bio, iter) { page_bytes_left = bvec.bv_len; @@ -560,7 +560,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, else offset = 0; /* shut up gcc */ - sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; + sums->bytenr = bio->bi_iter.bi_sector << 9; index = 0; shash->tfm = fs_info->csum_shash; @@ -599,7 +599,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, ordered = btrfs_lookup_ordered_extent(inode, offset); ASSERT(ordered); /* Logic error */ - sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + sums->bytenr = (bio->bi_iter.bi_sector << 9) + total_bytes; index = 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 347076f1da6e..d003f806f02a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2183,7 +2183,7 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 logical = 
(u64)bio->bi_iter.bi_sector << 9; + u64 logical = bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; int ret; @@ -7820,8 +7820,7 @@ static void btrfs_end_dio_bio(struct bio *bio) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio), - bio->bi_opf, - (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_opf, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); if (bio_op(bio) == REQ_OP_READ) { @@ -7913,7 +7912,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, dip->inode = inode; dip->logical_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; - dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; + dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; refcount_set(&dip->refs, 1); return dip; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 255490f42b5d..93fbf87bdc8d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1097,7 +1097,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, /* see if we can add this page onto our existing bio */ if (last) { - u64 last_end = (u64)last->bi_iter.bi_sector << 9; + u64 last_end = last->bi_iter.bi_sector << 9; last_end += last->bi_iter.bi_size; /* @@ -1163,7 +1163,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) struct bvec_iter iter; int i = 0; - start = (u64)bio->bi_iter.bi_sector << 9; + start = bio->bi_iter.bi_sector << 9; stripe_offset = start - rbio->bbio->raid_map[0]; page_index = stripe_offset >> PAGE_SHIFT; @@ -1374,7 +1374,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { - u64 logical = (u64)bio->bi_iter.bi_sector << 9; + u64 logical = bio->bi_iter.bi_sector << 9; int i; for (i = 0; i < rbio->nr_data; i++) { @@ -2150,7 +2150,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, if (rbio->faila == -1) { btrfs_warn(fs_info, "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)", - __func__, (u64)bio->bi_iter.bi_sector << 9, + __func__, bio->bi_iter.bi_sector << 9, (u64)bio->bi_iter.bi_size, bbio->map_type); if (generic_io) btrfs_put_bbio(bbio); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fd9263402b8a..7930e1c78c45 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6376,7 +6376,7 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, bio->bi_iter.bi_sector = physical >> 9; btrfs_debug_in_rcu(fs_info, "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", - bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, + bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, bio->bi_iter.bi_size); bio_set_dev(bio, dev->bdev); @@ -6408,7 +6408,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, { struct btrfs_device *dev; struct bio *first_bio = bio; - u64 logical = (u64)bio->bi_iter.bi_sector << 9; + u64 logical = bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; int ret; -- cgit v1.2.3 From ec7d6dfd73b2de1c6bc36f832542061b0ca0e0ff Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 26 Nov 2020 15:10:37 +0200 Subject: btrfs: move btrfs_find_highest_objectid/btrfs_find_free_objectid to disk-io.c Those functions are going to be used even after inode cache is removed so moved 
them to a more appropriate place. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/disk-io.h | 2 ++ fs/btrfs/inode-map.c | 55 ---------------------------------------------------- fs/btrfs/inode-map.h | 3 --- 4 files changed, 57 insertions(+), 58 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index aa92c0de0cd6..3b456c3331a2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4735,3 +4735,58 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } + +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *l; + struct btrfs_key search_key; + struct btrfs_key found_key; + int slot; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + search_key.objectid = BTRFS_LAST_FREE_OBJECTID; + search_key.type = -1; + search_key.offset = (u64)-1; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); /* Corruption */ + if (path->slots[0] > 0) { + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + *objectid = max_t(u64, found_key.objectid, + BTRFS_FIRST_FREE_OBJECTID - 1); + } else { + *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) +{ + int ret; + mutex_lock(&root->objectid_mutex); + + if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + btrfs_warn(root->fs_info, + "the objectid of root %llu reaches its highest value", + root->root_key.objectid); + ret = -ENOSPC; + goto out; + } + + *objectid = ++root->highest_objectid; + ret = 0; +out: + mutex_unlock(&root->objectid_mutex); + return ret; +} diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 40e81d6e481e..984410144a97 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -131,6 +131,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); +int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 8cf39402b227..2bc342fc5d7e 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -525,58 +525,3 @@ out: extent_changeset_free(data_reserved); return ret; } - -int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) -{ - struct btrfs_path *path; - int ret; - struct extent_buffer *l; - struct btrfs_key search_key; - struct btrfs_key found_key; - int slot; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - search_key.objectid = BTRFS_LAST_FREE_OBJECTID; - search_key.type = -1; - search_key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret < 0) - goto error; - BUG_ON(ret == 0); /* Corruption */ - if (path->slots[0] > 0) { - slot = path->slots[0] - 1; - l = path->nodes[0]; - btrfs_item_key_to_cpu(l, &found_key, slot); - *objectid = max_t(u64, found_key.objectid, - BTRFS_FIRST_FREE_OBJECTID - 1); - } else { - *objectid = BTRFS_FIRST_FREE_OBJECTID - 
1; - } - ret = 0; -error: - btrfs_free_path(path); - return ret; -} - -int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) -{ - int ret; - mutex_lock(&root->objectid_mutex); - - if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { - btrfs_warn(root->fs_info, - "the objectid of root %llu reaches its highest value", - root->root_key.objectid); - ret = -ENOSPC; - goto out; - } - - *objectid = ++root->highest_objectid; - ret = 0; -out: - mutex_unlock(&root->objectid_mutex); - return ret; -} diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h index 7a962811dffe..629baf9aefb1 100644 --- a/fs/btrfs/inode-map.h +++ b/fs/btrfs/inode-map.h @@ -10,7 +10,4 @@ int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid); int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans); -int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); -int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid); - #endif -- cgit v1.2.3 From abadc1fcd72e887a8f875dabe4a07aa8c28ac8af Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 26 Nov 2020 15:10:38 +0200 Subject: btrfs: replace calls to btrfs_find_free_ino with btrfs_find_free_objectid The former is going away as part of the inode map removal so switch callers to btrfs_find_free_objectid. No functional changes since with INODE_MAP disabled (default) find_free_objectid was called anyway. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d003f806f02a..a2947a00d442 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6364,7 +6364,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6428,7 +6428,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6572,7 +6572,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_fail; @@ -9071,7 +9071,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, u64 objectid; u64 index; - ret = btrfs_find_free_ino(root, &objectid); + ret = btrfs_find_free_objectid(root, &objectid); if (ret) return ret; @@ -9534,7 +9534,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -9868,7 +9868,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_find_free_ino(root, &objectid); + ret = btrfs_find_free_objectid(root, &objectid); if (ret) goto out; -- cgit v1.2.3 From 5297199a8bca12b8b96afcbf2341605efb6005de Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 26 Nov 2020 15:10:39 +0200 Subject: btrfs: remove inode number cache feature It's been deprecated since commit b547a88ea577 ("btrfs: start deprecation of 
mount option inode_cache") which enumerates the reasons. A filesystem that uses the feature (mount -o inode_cache) tracks the inode numbers in bitmaps, that data stay on the filesystem after this patch. The size is roughly 5MiB for 1M inodes [1], which is considered small enough to be left there. Removal of the change can be implemented in btrfs-progs if needed. [1] https://lore.kernel.org/linux-btrfs/20201127145836.GZ6430@twin.jikos.cz/ Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.h | 15 +- fs/btrfs/disk-io.c | 23 -- fs/btrfs/free-space-cache.c | 177 --------------- fs/btrfs/free-space-cache.h | 12 - fs/btrfs/inode-map.c | 527 -------------------------------------------- fs/btrfs/inode-map.h | 13 -- fs/btrfs/inode.c | 11 - fs/btrfs/ioctl.c | 1 - fs/btrfs/relocation.c | 1 - fs/btrfs/super.c | 13 +- fs/btrfs/transaction.c | 19 -- fs/btrfs/tree-log.c | 1 - 13 files changed, 6 insertions(+), 809 deletions(-) delete mode 100644 fs/btrfs/inode-map.c delete mode 100644 fs/btrfs/inode-map.h (limited to 'fs/btrfs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 0497fdc37f90..9f1b1a88e317 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ - file-item.o inode-item.o inode-map.o disk-io.o \ + file-item.o inode-item.o disk-io.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7fe74f6a2486..a0c588e5fc89 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1077,15 +1077,6 @@ struct btrfs_root { spinlock_t accounting_lock; struct btrfs_block_rsv *block_rsv; - /* free ino cache stuff */ - struct btrfs_free_space_ctl *free_ino_ctl; - enum btrfs_caching_type ino_cache_state; - spinlock_t ino_cache_lock; - wait_queue_head_t ino_cache_wait; - struct btrfs_free_space_ctl *free_ino_pinned; - u64 ino_cache_progress; - struct inode *ino_cache_inode; - struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; @@ -1359,7 +1350,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) -#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) +/* bit 17 is free */ #define BTRFS_MOUNT_USEBACKUPROOT (1 << 18) #define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) #define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) @@ -1406,9 +1397,7 @@ do { \ * transaction commit) */ -#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0) -#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1) -#define BTRFS_PENDING_COMMIT (2) +#define BTRFS_PENDING_COMMIT (0) #define btrfs_test_pending(info, opt) \ test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3b456c3331a2..2d8bcd075c77 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -29,7 +29,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "inode-map.h" #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" @@ -1336,14 +1335,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) int ret; unsigned int nofs_flag; - root->free_ino_ctl = 
kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); - root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), - GFP_NOFS); - if (!root->free_ino_pinned || !root->free_ino_ctl) { - ret = -ENOMEM; - goto fail; - } - /* * We might be called under a transaction (e.g. indirect backref * resolution) which could deadlock if it triggers memory reclaim @@ -1360,10 +1351,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) btrfs_check_and_init_root_item(&root->root_item); } - btrfs_init_free_ino_ctl(root); - spin_lock_init(&root->ino_cache_lock); - init_waitqueue_head(&root->ino_cache_wait); - /* * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M @@ -2033,8 +2020,6 @@ void btrfs_put_root(struct btrfs_root *root) free_anon_bdev(root->anon_dev); btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); #ifdef CONFIG_BTRFS_DEBUG spin_lock(&root->fs_info->fs_roots_radix_lock); list_del_init(&root->leak_list); @@ -3988,14 +3973,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, } } - if (root->free_ino_pinned) - __btrfs_remove_free_space_cache(root->free_ino_pinned); - if (root->free_ino_ctl) - __btrfs_remove_free_space_cache(root->free_ino_ctl); - if (root->ino_cache_inode) { - iput(root->ino_cache_inode); - root->ino_cache_inode = NULL; - } if (drop_ref) btrfs_put_root(root); } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 572c75d2169b..7d1ceeeb3067 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -16,7 +16,6 @@ #include "transaction.h" #include "disk-io.h" #include "extent_io.h" -#include "inode-map.h" #include "volumes.h" #include "space-info.h" #include "delalloc-space.h" @@ -37,10 +36,6 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); -static int btrfs_wait_cache_io_root(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_io_ctl *io_ctl, - struct btrfs_path *path); static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes, bool for_alloc); @@ -1222,14 +1217,6 @@ out: } -static int btrfs_wait_cache_io_root(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_io_ctl *io_ctl, - struct btrfs_path *path) -{ - return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0); -} - int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) @@ -3807,170 +3794,6 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, return ret; } -/* - * Find the left-most item in the cache tree, and then return the - * smallest inode number in the item. - * - * Note: the returned inode number may not be the smallest one in - * the tree, if the left-most item is a bitmap. 
- */ -u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) -{ - struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl; - struct btrfs_free_space *entry = NULL; - u64 ino = 0; - - spin_lock(&ctl->tree_lock); - - if (RB_EMPTY_ROOT(&ctl->free_space_offset)) - goto out; - - entry = rb_entry(rb_first(&ctl->free_space_offset), - struct btrfs_free_space, offset_index); - - if (!entry->bitmap) { - ino = entry->offset; - - unlink_free_space(ctl, entry); - entry->offset++; - entry->bytes--; - if (!entry->bytes) - kmem_cache_free(btrfs_free_space_cachep, entry); - else - link_free_space(ctl, entry); - } else { - u64 offset = 0; - u64 count = 1; - int ret; - - ret = search_bitmap(ctl, entry, &offset, &count, true); - /* Logic error; Should be empty if it can't find anything */ - ASSERT(!ret); - - ino = offset; - bitmap_clear_bits(ctl, entry, offset, 1); - if (entry->bytes == 0) - free_bitmap(ctl, entry); - } -out: - spin_unlock(&ctl->tree_lock); - - return ino; -} - -struct inode *lookup_free_ino_inode(struct btrfs_root *root, - struct btrfs_path *path) -{ - struct inode *inode = NULL; - - spin_lock(&root->ino_cache_lock); - if (root->ino_cache_inode) - inode = igrab(root->ino_cache_inode); - spin_unlock(&root->ino_cache_lock); - if (inode) - return inode; - - inode = __lookup_free_space_inode(root, path, 0); - if (IS_ERR(inode)) - return inode; - - spin_lock(&root->ino_cache_lock); - if (!btrfs_fs_closing(root->fs_info)) - root->ino_cache_inode = igrab(inode); - spin_unlock(&root->ino_cache_lock); - - return inode; -} - -int create_free_ino_inode(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path) -{ - return __create_free_space_inode(root, trans, path, - BTRFS_FREE_INO_OBJECTID, 0); -} - -int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_path *path; - struct inode *inode; - int ret = 0; - u64 root_gen = btrfs_root_generation(&root->root_item); - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return 0; - - /* - * If we're unmounting then just return, since this does a search on the - * normal root and not the commit root and we could deadlock. - */ - if (btrfs_fs_closing(fs_info)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return 0; - - inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode)) - goto out; - - if (root_gen != BTRFS_I(inode)->generation) - goto out_put; - - ret = __load_free_space_cache(root, inode, ctl, path, 0); - - if (ret < 0) - btrfs_err(fs_info, - "failed to load free ino cache for root %llu", - root->root_key.objectid); -out_put: - iput(inode); -out: - btrfs_free_path(path); - return ret; -} - -int btrfs_write_out_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - int ret; - struct btrfs_io_ctl io_ctl; - bool release_metadata = true; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return 0; - - memset(&io_ctl, 0, sizeof(io_ctl)); - ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans); - if (!ret) { - /* - * At this point writepages() didn't error out, so our metadata - * reservation is released when the writeback finishes, at - * inode.c:btrfs_finish_ordered_io(), regardless of it finishing - * with or without an error. 
- */ - release_metadata = false; - ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path); - } - - if (ret) { - if (release_metadata) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - inode->i_size, true); - btrfs_debug(fs_info, - "failed to write free ino cache for root %llu error %d", - root->root_key.objectid, ret); - } - - return ret; -} - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * Use this if you need to make a bitmap or extent entry specifically, it diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index bf8d127d2407..66a87242690e 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -97,17 +97,6 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, int btrfs_write_out_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path); -struct inode *lookup_free_ino_inode(struct btrfs_root *root, - struct btrfs_path *path); -int create_free_ino_inode(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path); -int load_free_ino_cache(struct btrfs_fs_info *fs_info, - struct btrfs_root *root); -int btrfs_write_out_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode); void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl); @@ -127,7 +116,6 @@ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group); u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size); -u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root); void btrfs_dump_free_space(struct btrfs_block_group *block_group, u64 bytes); int btrfs_find_space_cluster(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c deleted file mode 100644 index 2bc342fc5d7e..000000000000 --- a/fs/btrfs/inode-map.c +++ /dev/null @@ -1,527 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Oracle. All rights reserved. - */ - -#include -#include - -#include "ctree.h" -#include "disk-io.h" -#include "free-space-cache.h" -#include "inode-map.h" -#include "transaction.h" -#include "delalloc-space.h" - -static void fail_caching_thread(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - - btrfs_warn(fs_info, "failed to start inode caching task"); - btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE, - "disabling inode map caching"); - spin_lock(&root->ino_cache_lock); - root->ino_cache_state = BTRFS_CACHE_ERROR; - spin_unlock(&root->ino_cache_lock); - wake_up(&root->ino_cache_wait); -} - -static int caching_kthread(void *data) -{ - struct btrfs_root *root = data; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_key key; - struct btrfs_path *path; - struct extent_buffer *leaf; - u64 last = (u64)-1; - int slot; - int ret; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return 0; - - path = btrfs_alloc_path(); - if (!path) { - fail_caching_thread(root); - return -ENOMEM; - } - - /* Since the commit root is read-only, we can safely skip locking. 
*/ - path->skip_locking = 1; - path->search_commit_root = 1; - path->reada = READA_FORWARD; - - key.objectid = BTRFS_FIRST_FREE_OBJECTID; - key.offset = 0; - key.type = BTRFS_INODE_ITEM_KEY; -again: - /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->commit_root_sem); - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - if (btrfs_fs_closing(fs_info)) - goto out; - - leaf = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - - if (need_resched() || - btrfs_transaction_in_commit(fs_info)) { - leaf = path->nodes[0]; - - if (WARN_ON(btrfs_header_nritems(leaf) == 0)) - break; - - /* - * Save the key so we can advances forward - * in the next search. - */ - btrfs_item_key_to_cpu(leaf, &key, 0); - btrfs_release_path(path); - root->ino_cache_progress = last; - up_read(&fs_info->commit_root_sem); - schedule_timeout(1); - goto again; - } else - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, slot); - - if (key.type != BTRFS_INODE_ITEM_KEY) - goto next; - - if (key.objectid >= root->highest_objectid) - break; - - if (last != (u64)-1 && last + 1 != key.objectid) { - __btrfs_add_free_space(fs_info, ctl, last + 1, - key.objectid - last - 1, 0); - wake_up(&root->ino_cache_wait); - } - - last = key.objectid; -next: - path->slots[0]++; - } - - if (last < root->highest_objectid - 1) { - __btrfs_add_free_space(fs_info, ctl, last + 1, - root->highest_objectid - last - 1, 0); - } - - spin_lock(&root->ino_cache_lock); - root->ino_cache_state = BTRFS_CACHE_FINISHED; - spin_unlock(&root->ino_cache_lock); - - root->ino_cache_progress = (u64)-1; - btrfs_unpin_free_ino(root); -out: - wake_up(&root->ino_cache_wait); - up_read(&fs_info->commit_root_sem); - - btrfs_free_path(path); - - return ret; -} - -static void start_caching(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct task_struct *tsk; - int ret; - u64 objectid; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return; - - spin_lock(&root->ino_cache_lock); - if (root->ino_cache_state != BTRFS_CACHE_NO) { - spin_unlock(&root->ino_cache_lock); - return; - } - - root->ino_cache_state = BTRFS_CACHE_STARTED; - spin_unlock(&root->ino_cache_lock); - - ret = load_free_ino_cache(fs_info, root); - if (ret == 1) { - spin_lock(&root->ino_cache_lock); - root->ino_cache_state = BTRFS_CACHE_FINISHED; - spin_unlock(&root->ino_cache_lock); - wake_up(&root->ino_cache_wait); - return; - } - - /* - * It can be quite time-consuming to fill the cache by searching - * through the extent tree, and this can keep ino allocation path - * waiting. Therefore at start we quickly find out the highest - * inode number and we know we can use inode numbers which fall in - * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID]. 
- */ - ret = btrfs_find_free_objectid(root, &objectid); - if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) { - __btrfs_add_free_space(fs_info, ctl, objectid, - BTRFS_LAST_FREE_OBJECTID - objectid + 1, - 0); - wake_up(&root->ino_cache_wait); - } - - tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu", - root->root_key.objectid); - if (IS_ERR(tsk)) - fail_caching_thread(root); -} - -int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) -{ - if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) - return btrfs_find_free_objectid(root, objectid); - -again: - *objectid = btrfs_find_ino_for_alloc(root); - - if (*objectid != 0) - return 0; - - start_caching(root); - - wait_event(root->ino_cache_wait, - root->ino_cache_state == BTRFS_CACHE_FINISHED || - root->ino_cache_state == BTRFS_CACHE_ERROR || - root->free_ino_ctl->free_space > 0); - - if (root->ino_cache_state == BTRFS_CACHE_FINISHED && - root->free_ino_ctl->free_space == 0) - return -ENOSPC; - else if (root->ino_cache_state == BTRFS_CACHE_ERROR) - return btrfs_find_free_objectid(root, objectid); - else - goto again; -} - -void btrfs_return_ino(struct btrfs_root *root, u64 objectid) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return; -again: - if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { - __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0); - } else { - down_write(&fs_info->commit_root_sem); - spin_lock(&root->ino_cache_lock); - if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { - spin_unlock(&root->ino_cache_lock); - up_write(&fs_info->commit_root_sem); - goto again; - } - spin_unlock(&root->ino_cache_lock); - - start_caching(root); - - __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0); - - up_write(&fs_info->commit_root_sem); - } -} - -/* - * When a transaction is committed, we'll move those inode numbers which are - * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and - * others will just be dropped, because the commit root we were searching has - * changed. - * - * Must be called with root->fs_info->commit_root_sem held - */ -void btrfs_unpin_free_ino(struct btrfs_root *root) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset; - spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock; - struct btrfs_free_space *info; - struct rb_node *n; - u64 count; - - if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) - return; - - while (1) { - spin_lock(rbroot_lock); - n = rb_first(rbroot); - if (!n) { - spin_unlock(rbroot_lock); - break; - } - - info = rb_entry(n, struct btrfs_free_space, offset_index); - BUG_ON(info->bitmap); /* Logic error */ - - if (info->offset > root->ino_cache_progress) - count = 0; - else - count = min(root->ino_cache_progress - info->offset + 1, - info->bytes); - - rb_erase(&info->offset_index, rbroot); - spin_unlock(rbroot_lock); - if (count) - __btrfs_add_free_space(root->fs_info, ctl, - info->offset, count, 0); - kmem_cache_free(btrfs_free_space_cachep, info); - } -} - -#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space)) -#define INODES_PER_BITMAP (PAGE_SIZE * 8) - -/* - * The goal is to keep the memory used by the free_ino tree won't - * exceed the memory if we use bitmaps only. 
- */ -static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) -{ - struct btrfs_free_space *info; - struct rb_node *n; - int max_ino; - int max_bitmaps; - - n = rb_last(&ctl->free_space_offset); - if (!n) { - ctl->extents_thresh = INIT_THRESHOLD; - return; - } - info = rb_entry(n, struct btrfs_free_space, offset_index); - - /* - * Find the maximum inode number in the filesystem. Note we - * ignore the fact that this can be a bitmap, because we are - * not doing precise calculation. - */ - max_ino = info->bytes - 1; - - max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP; - if (max_bitmaps <= ctl->total_bitmaps) { - ctl->extents_thresh = 0; - return; - } - - ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) * - PAGE_SIZE / sizeof(*info); -} - -/* - * We don't fall back to bitmap, if we are below the extents threshold - * or this chunk of inode numbers is a big one. - */ -static bool use_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - if (ctl->free_extents < ctl->extents_thresh || - info->bytes > INODES_PER_BITMAP / 10) - return false; - - return true; -} - -static const struct btrfs_free_space_op free_ino_op = { - .recalc_thresholds = recalculate_thresholds, - .use_bitmap = use_bitmap, -}; - -static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl) -{ -} - -static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - /* - * We always use extents for two reasons: - * - * - The pinned tree is only used during the process of caching - * work. - * - Make code simpler. See btrfs_unpin_free_ino(). - */ - return false; -} - -static const struct btrfs_free_space_op pinned_free_ino_op = { - .recalc_thresholds = pinned_recalc_thresholds, - .use_bitmap = pinned_use_bitmap, -}; - -void btrfs_init_free_ino_ctl(struct btrfs_root *root) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; - - spin_lock_init(&ctl->tree_lock); - ctl->unit = 1; - ctl->start = 0; - ctl->private = NULL; - ctl->op = &free_ino_op; - INIT_LIST_HEAD(&ctl->trimming_ranges); - mutex_init(&ctl->cache_writeout_mutex); - - /* - * Initially we allow to use 16K of ram to cache chunks of - * inode numbers before we resort to bitmaps. This is somewhat - * arbitrary, but it will be adjusted in runtime. 
- */ - ctl->extents_thresh = INIT_THRESHOLD; - - spin_lock_init(&pinned->tree_lock); - pinned->unit = 1; - pinned->start = 0; - pinned->private = NULL; - pinned->extents_thresh = 0; - pinned->op = &pinned_free_ino_op; -} - -int btrfs_save_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_path *path; - struct inode *inode; - struct btrfs_block_rsv *rsv; - struct extent_changeset *data_reserved = NULL; - u64 num_bytes; - u64 alloc_hint = 0; - int ret; - int prealloc; - bool retry = false; - - /* only fs tree and subvol/snap needs ino cache */ - if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID && - (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID || - root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID)) - return 0; - - /* Don't save inode cache if we are deleting this root */ - if (btrfs_root_refs(&root->root_item) == 0) - return 0; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - rsv = trans->block_rsv; - trans->block_rsv = &fs_info->trans_block_rsv; - - num_bytes = trans->bytes_reserved; - /* - * 1 item for inode item insertion if need - * 4 items for inode item update (in the worst case) - * 1 items for slack space if we need do truncation - * 1 item for free space object - * 3 items for pre-allocation - */ - trans->bytes_reserved = btrfs_calc_insert_metadata_size(fs_info, 10); - ret = btrfs_block_rsv_add(root, trans->block_rsv, - trans->bytes_reserved, - BTRFS_RESERVE_NO_FLUSH); - if (ret) - goto out; - trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid, - trans->bytes_reserved, 1); -again: - inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) { - ret = PTR_ERR(inode); - goto out_release; - } - - if (IS_ERR(inode)) { - BUG_ON(retry); /* Logic error */ - retry = true; - - ret = create_free_ino_inode(root, trans, path); - if (ret) - goto out_release; - goto again; - } - - BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_put; - } - - if (i_size_read(inode) > 0) { - ret = btrfs_truncate_free_space_cache(trans, NULL, inode); - if (ret) { - if (ret != -ENOSPC) - btrfs_abort_transaction(trans, ret); - goto out_put; - } - } - - spin_lock(&root->ino_cache_lock); - if (root->ino_cache_state != BTRFS_CACHE_FINISHED) { - ret = -1; - spin_unlock(&root->ino_cache_lock); - goto out_put; - } - spin_unlock(&root->ino_cache_lock); - - spin_lock(&ctl->tree_lock); - prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; - prealloc = ALIGN(prealloc, PAGE_SIZE); - prealloc += ctl->total_bitmaps * PAGE_SIZE; - spin_unlock(&ctl->tree_lock); - - /* Just to make sure we have enough space */ - prealloc += 8 * PAGE_SIZE; - - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 0, - prealloc); - if (ret) - goto out_put; - - ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, - prealloc, prealloc, &alloc_hint); - if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); - btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true); - goto out_put; - } - - ret = btrfs_write_out_ino_cache(root, trans, path, inode); - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); -out_put: - iput(inode); -out_release: - trace_btrfs_space_reservation(fs_info, "ino_cache", 
trans->transid, - trans->bytes_reserved, 0); - btrfs_block_rsv_release(fs_info, trans->block_rsv, - trans->bytes_reserved, NULL); -out: - trans->block_rsv = rsv; - trans->bytes_reserved = num_bytes; - - btrfs_free_path(path); - extent_changeset_free(data_reserved); - return ret; -} diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h deleted file mode 100644 index 629baf9aefb1..000000000000 --- a/fs/btrfs/inode-map.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef BTRFS_INODE_MAP_H -#define BTRFS_INODE_MAP_H - -void btrfs_init_free_ino_ctl(struct btrfs_root *root); -void btrfs_unpin_free_ino(struct btrfs_root *root); -void btrfs_return_ino(struct btrfs_root *root, u64 objectid); -int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid); -int btrfs_save_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans); - -#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a2947a00d442..0ce42d52d53e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -45,7 +45,6 @@ #include "compression.h" #include "locking.h" #include "free-space-cache.h" -#include "inode-map.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" @@ -4214,12 +4213,6 @@ out_up_write: d_invalidate(dentry); btrfs_prune_dentries(dest); ASSERT(dest->send_in_progress == 0); - - /* the last ref */ - if (dest->ino_cache_inode) { - iput(dest->ino_cache_inode); - dest->ino_cache_inode = NULL; - } } return ret; @@ -5295,10 +5288,6 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans); } - if (!(root == fs_info->tree_root || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) - btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode))); - free_rsv: btrfs_free_block_rsv(fs_info, rsv); no_delete: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0b5e1df24906..703212ff50a5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -34,7 +34,6 @@ #include "print-tree.h" #include "volumes.h" #include "locking.h" -#include "inode-map.h" #include "backref.h" #include "rcu-string.h" #include "send.h" diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 6e7e5fe2e277..19b7db8b2117 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -18,7 +18,6 @@ #include "btrfs_inode.h" #include "async-thread.h" #include "free-space-cache.h" -#include "inode-map.h" #include "qgroup.h" #include "print-tree.h" #include "delalloc-space.h" diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7b2970c90ebe..455d924b8d62 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -338,7 +338,6 @@ enum { Opt_device, Opt_fatal_errors, Opt_flushoncommit, Opt_noflushoncommit, - Opt_inode_cache, Opt_noinode_cache, Opt_max_inline, Opt_barrier, Opt_nobarrier, Opt_datacow, Opt_nodatacow, @@ -371,6 +370,7 @@ enum { /* Deprecated options */ Opt_recovery, + Opt_inode_cache, Opt_noinode_cache, /* Debugging options */ Opt_check_integrity, @@ -880,14 +880,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } break; case Opt_inode_cache: - btrfs_warn(info, - "the 'inode_cache' option is deprecated and will have no effect from 5.11"); - btrfs_set_pending_and_info(info, INODE_MAP_CACHE, - "enabling inode map caching"); - break; case Opt_noinode_cache: - btrfs_clear_pending_and_info(info, INODE_MAP_CACHE, - "disabling inode map caching"); + btrfs_warn(info, + "the 'inode_cache' option is deprecated and has no effect since 5.11"); break; case Opt_clear_cache: btrfs_set_and_info(info, CLEAR_CACHE, @@ -1506,8 +1501,6 @@ static int 
btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",enospc_debug"); if (btrfs_test_opt(info, AUTO_DEFRAG)) seq_puts(seq, ",autodefrag"); - if (btrfs_test_opt(info, INODE_MAP_CACHE)) - seq_puts(seq, ",inode_cache"); if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 238e5a2c0696..6b7156712486 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -16,7 +16,6 @@ #include "transaction.h" #include "locking.h" #include "tree-log.h" -#include "inode-map.h" #include "volumes.h" #include "dev-replace.h" #include "qgroup.h" @@ -163,8 +162,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) list_del_init(&root->dirty_list); free_extent_buffer(root->commit_root); root->commit_root = btrfs_root_node(root); - if (is_fstree(root->root_key.objectid)) - btrfs_unpin_free_ino(root); extent_io_tree_release(&root->dirty_log_pages); btrfs_qgroup_clean_swapped_blocks(root); } @@ -1342,8 +1339,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) btrfs_free_log(trans, root); btrfs_update_reloc_root(trans, root); - btrfs_save_ino_cache(root, trans); - /* see comments in should_cow_block() */ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_mb__after_atomic(); @@ -2436,10 +2431,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); btrfs_kill_all_delayed_nodes(root); - if (root->ino_cache_inode) { - iput(root->ino_cache_inode); - root->ino_cache_inode = NULL; - } if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) @@ -2460,16 +2451,6 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) if (!prev) return; - bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE; - if (prev & bit) - btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE); - prev &= ~bit; - - bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE; - if (prev & bit) - btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE); - prev &= ~bit; - bit = 1 << BTRFS_PENDING_COMMIT; if (prev & bit) btrfs_debug(fs_info, "pending commit done"); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 80cf496226c2..485267ee5bc6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -17,7 +17,6 @@ #include "backref.h" #include "compression.h" #include "qgroup.h" -#include "inode-map.h" #include "block-group.h" #include "space-info.h" -- cgit v1.2.3 From 7dbdb443a7b49f66d9c4da0d810e2c54e0727d82 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 3 Dec 2020 10:09:48 +0200 Subject: btrfs: remove crc_check logic from free space Following removal of the ino cache io_ctl_init will be called only on behalf of the freespace inode. In this case we always want to check CRCs so conditional code that depended on io_ctl::check_crc can be removed. 
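As an illustration, after this change the generation header handling always reserves room for the per-page CRCs; a simplified excerpt (mirroring the diff below, not the complete io_ctl_set_generation()):

    /* Always skip the CRC area: one u32 per page plus the generation field. */
    io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
    io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
    put_unaligned_le64(generation, io_ctl->cur);
    io_ctl->cur += sizeof(u64);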
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 37 +++++-------------------------------- fs/btrfs/free-space-cache.h | 1 - 2 files changed, 5 insertions(+), 33 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 7d1ceeeb3067..53df853b93df 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -305,16 +305,11 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, int write) { int num_pages; - int check_crcs = 0; num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID) - check_crcs = 1; - /* Make sure we can fit our crcs and generation into the first page */ - if (write && check_crcs && - (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) + if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) return -ENOSPC; memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); @@ -325,7 +320,6 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, io_ctl->num_pages = num_pages; io_ctl->fs_info = btrfs_sb(inode->i_sb); - io_ctl->check_crcs = check_crcs; io_ctl->inode = inode; return 0; @@ -420,13 +414,8 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) * Skip the csum areas. If we don't check crcs then we just have a * 64bit chunk at the front of the first page. */ - if (io_ctl->check_crcs) { - io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); - io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); - } else { - io_ctl->cur += sizeof(u64); - io_ctl->size -= sizeof(u64) * 2; - } + io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); put_unaligned_le64(generation, io_ctl->cur); io_ctl->cur += sizeof(u64); @@ -440,14 +429,8 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) * Skip the crc area. If we don't check crcs then we just have a 64bit * chunk at the front of the first page. 
*/ - if (io_ctl->check_crcs) { - io_ctl->cur += sizeof(u32) * io_ctl->num_pages; - io_ctl->size -= sizeof(u64) + - (sizeof(u32) * io_ctl->num_pages); - } else { - io_ctl->cur += sizeof(u64); - io_ctl->size -= sizeof(u64) * 2; - } + io_ctl->cur += sizeof(u32) * io_ctl->num_pages; + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); cache_gen = get_unaligned_le64(io_ctl->cur); if (cache_gen != generation) { @@ -467,11 +450,6 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) u32 crc = ~(u32)0; unsigned offset = 0; - if (!io_ctl->check_crcs) { - io_ctl_unmap_page(io_ctl); - return; - } - if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; @@ -489,11 +467,6 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) u32 crc = ~(u32)0; unsigned offset = 0; - if (!io_ctl->check_crcs) { - io_ctl_map_page(io_ctl, 0); - return 0; - } - if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 66a87242690e..7b18d359a849 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -76,7 +76,6 @@ struct btrfs_io_ctl { int num_pages; int entries; int bitmaps; - unsigned check_crcs:1; }; struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, -- cgit v1.2.3 From f0d1219def15ef14a2ba2f6b7a612773295b3b5c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 3 Dec 2020 10:09:49 +0200 Subject: btrfs: always set NODATASUM/NODATACOW in __create_free_space_inode Since it's being used solely for the freespace cache unconditionally set the flags required for it. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 53df853b93df..bc29317978f4 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -142,17 +142,15 @@ static int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header *header; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; - u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; + /* We inline CRCs for the free disk space cache */ + const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC | + BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; int ret; ret = btrfs_insert_empty_inode(trans, root, path, ino); if (ret) return ret; - /* We inline crc's for the free disk space cache */ - if (ino != BTRFS_FREE_INO_OBJECTID) - flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; - leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); -- cgit v1.2.3 From fa598b0696409e3522022a1dddd47a1adc2b994d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 3 Dec 2020 17:18:38 +0100 Subject: btrfs: remove recalc_thresholds from free space ops After removing the inode number cache that was using the free space cache code, we can remove at least the recalc_thresholds callback from the ops. Both code and tests use the same callback function. It's moved before its first use. The use_bitmaps callback is still needed by tests to create some extents/bitmap setup. 
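A minimal sketch of the resulting call pattern (mirroring the diff below, not verbatim kernel code):

    /* Before: indirect call through the free space ops table */
    ctl->op->recalc_thresholds(ctl);

    /* After: the helper is static in free-space-cache.c and called directly */
    recalculate_thresholds(ctl);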
Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 86 +++++++++++++++++++-------------------- fs/btrfs/free-space-cache.h | 1 - fs/btrfs/tests/free-space-tests.c | 1 - 3 files changed, 42 insertions(+), 46 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bc29317978f4..fe4b2176a6e8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -597,6 +597,44 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, return 0; } +static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_block_group *block_group = ctl->private; + u64 max_bytes; + u64 bitmap_bytes; + u64 extent_bytes; + u64 size = block_group->length; + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); + + max_bitmaps = max_t(u64, max_bitmaps, 1); + + ASSERT(ctl->total_bitmaps <= max_bitmaps); + + /* + * We are trying to keep the total amount of memory used per 1GiB of + * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation + * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of + * bitmaps, we may end up using more memory than this. + */ + if (size < SZ_1G) + max_bytes = MAX_CACHE_BYTES_PER_GIG; + else + max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); + + bitmap_bytes = ctl->total_bitmaps * ctl->unit; + + /* + * we want the extent entry threshold to always be at most 1/2 the max + * bytes we can have, or whatever is less than that. + */ + extent_bytes = max_bytes - bitmap_bytes; + extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); + + ctl->extents_thresh = + div_u64(extent_bytes, sizeof(struct btrfs_free_space)); +} + static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_path *path, u64 offset) @@ -715,7 +753,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, spin_lock(&ctl->tree_lock); ret = link_free_space(ctl, e); ctl->total_bitmaps++; - ctl->op->recalc_thresholds(ctl); + recalculate_thresholds(ctl); spin_unlock(&ctl->tree_lock); if (ret) { btrfs_err(fs_info, @@ -1632,44 +1670,6 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, return ret; } -static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) -{ - struct btrfs_block_group *block_group = ctl->private; - u64 max_bytes; - u64 bitmap_bytes; - u64 extent_bytes; - u64 size = block_group->length; - u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; - u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); - - max_bitmaps = max_t(u64, max_bitmaps, 1); - - ASSERT(ctl->total_bitmaps <= max_bitmaps); - - /* - * We are trying to keep the total amount of memory used per 1GiB of - * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation - * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of - * bitmaps, we may end up using more memory than this. - */ - if (size < SZ_1G) - max_bytes = MAX_CACHE_BYTES_PER_GIG; - else - max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); - - bitmap_bytes = ctl->total_bitmaps * ctl->unit; - - /* - * we want the extent entry threshold to always be at most 1/2 the max - * bytes we can have, or whatever is less than that. 
- */ - extent_bytes = max_bytes - bitmap_bytes; - extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); - - ctl->extents_thresh = - div_u64(extent_bytes, sizeof(struct btrfs_free_space)); -} - static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes) @@ -1881,8 +1881,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, INIT_LIST_HEAD(&info->list); link_free_space(ctl, info); ctl->total_bitmaps++; - - ctl->op->recalc_thresholds(ctl); + recalculate_thresholds(ctl); } static void free_bitmap(struct btrfs_free_space_ctl *ctl, @@ -1904,7 +1903,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl, kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); kmem_cache_free(btrfs_free_space_cachep, bitmap_info); ctl->total_bitmaps--; - ctl->op->recalc_thresholds(ctl); + recalculate_thresholds(ctl); } static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, @@ -2071,7 +2070,6 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, } static const struct btrfs_free_space_op free_space_op = { - .recalc_thresholds = recalculate_thresholds, .use_bitmap = use_bitmap, }; @@ -2991,7 +2989,7 @@ out: kmem_cache_free(btrfs_free_space_bitmap_cachep, entry->bitmap); ctl->total_bitmaps--; - ctl->op->recalc_thresholds(ctl); + recalculate_thresholds(ctl); } else if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 7b18d359a849..91d495e31a08 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -60,7 +60,6 @@ struct btrfs_free_space_ctl { }; struct btrfs_free_space_op { - void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl); bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); }; diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index aebdf23f0cdd..8f05c1eb833f 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -399,7 +399,6 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, u64 offset; u64 max_extent_size; const struct btrfs_free_space_op test_free_space_ops = { - .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds, .use_bitmap = test_use_bitmap, }; const struct btrfs_free_space_op *orig_free_space_ops; -- cgit v1.2.3 From de53d892e5c51dfa0a158e812575a75a6c991f39 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 25 Nov 2020 12:19:23 +0000 Subject: btrfs: fix race causing unnecessary inode logging during link and rename When we are doing a rename or a link operation for an inode that was logged in the previous transaction and that transaction is still committing, we have a time window where we incorrectly consider that the inode was logged previously in the current transaction and therefore decide to log it to update it in the log. 
The following steps give an example on how this happens during a link operation: 1) Inode X is logged in transaction 1000, so its logged_trans field is set to 1000; 2) Task A starts to commit transaction 1000; 3) The state of transaction 1000 is changed to TRANS_STATE_UNBLOCKED; 4) Task B starts a link operation for inode X, and as a consequence it starts transaction 1001; 5) Task A is still committing transaction 1000, therefore the value stored at fs_info->last_trans_committed is still 999; 6) Task B calls btrfs_log_new_name(), it reads a value of 999 from fs_info->last_trans_committed and because the logged_trans field of inode X has a value of 1000, the function does not return immediately, instead it proceeds to logging the inode, which should not happen because the inode was logged in the previous transaction (1000) and not in the current one (1001). This is not a functional problem, just wasted time and space logging an inode that does not need to be logged, contributing to higher latency for link and rename operations. So fix this by comparing the inodes' logged_trans field with the generation of the current transaction instead of comparing with the value stored in fs_info->last_trans_committed. This case is often hit when running dbench for a long enough duration, as it does lots of rename operations. This patch belongs to a patch set that is comprised of the following patches: btrfs: fix race causing unnecessary inode logging during link and rename btrfs: fix race that results in logging old extents during a fast fsync btrfs: fix race that causes unnecessary logging of ancestor inodes btrfs: fix race that makes inode logging fallback to transaction commit btrfs: fix race leading to unnecessary transaction commit when logging inode btrfs: do not block inode logging for so long during transaction commit Performance results are mentioned in the change log of the last patch. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 485267ee5bc6..9ab05ebd8854 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6442,7 +6442,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, struct dentry *parent) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_log_ctx ctx; /* @@ -6456,8 +6455,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * if this inode hasn't been logged and directory we're renaming it * from hasn't been logged, we don't need to log it */ - if (inode->logged_trans <= fs_info->last_trans_committed && - (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) + if (inode->logged_trans < trans->transid && + (!old_dir || old_dir->logged_trans < trans->transid)) return; btrfs_init_log_ctx(&ctx, &inode->vfs_inode); -- cgit v1.2.3 From 5f96bfb7633c55b578c6b32f32624061f25010db Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 25 Nov 2020 12:19:24 +0000 Subject: btrfs: fix race that results in logging old extents during a fast fsync When logging the extents of an inode during a fast fsync, we have a time window where we can log extents that are from the previous transaction and already persisted. This only makes us waste time unnecessarily. 
The following sequence of steps shows how this can happen: 1) We are at transaction 1000; 2) An ordered extent E from inode I completes, that is it has gone through btrfs_finish_ordered_io(), and it set the extent maps' generation to 1000 when we unpin the extent, which is the generation of the current transaction; 3) The commit for transaction 1000 starts by task A; 4) The task committing transaction 1000 sets the transaction state to unblocked, writes the dirty extent buffers and the super blocks, then unlocks tree_log_mutex; 5) Some change is made to inode I, resulting in creation of a new transaction with a generation of 1001; 6) The transaction 1000 commit starts unpinning extents. At this point fs_info->last_trans_committed still has a value of 999; 7) Task B starts an fsync on inode I, and when it gets to btrfs_log_changed_extents() sees the extent map for extent E in the list of modified extents. It sees the extent map has a generation of 1000 and fs_info->last_trans_committed has a value of 999, so it proceeds to logging the respective file extent item and all the checksums covering its range. So we end up wasting time since the extent was already persisted and is reachable through the trees pointed to by the super block committed by transaction 1000. So just fix this by comparing the extent maps generation against the generation of the transaction handle - if it is smaller then the id in the handle, we know the extent was already persisted and we do not need to log it. This patch belongs to a patch set that is comprised of the following patches: btrfs: fix race causing unnecessary inode logging during link and rename btrfs: fix race that results in logging old extents during a fast fsync btrfs: fix race that causes unnecessary logging of ancestor inodes btrfs: fix race that makes inode logging fallback to transaction commit btrfs: fix race leading to unnecessary transaction commit when logging inode btrfs: do not block inode logging for so long during transaction commit Performance results are mentioned in the change log of the last patch. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9ab05ebd8854..6ea7df1beee2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4415,14 +4415,12 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; - u64 test_gen; int ret = 0; int num = 0; INIT_LIST_HEAD(&extents); write_lock(&tree->lock); - test_gen = root->fs_info->last_trans_committed; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { list_del_init(&em->list); @@ -4438,7 +4436,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, goto process; } - if (em->generation <= test_gen) + if (em->generation < trans->transid) continue; /* We log prealloc extents beyond eof later. */ -- cgit v1.2.3 From 4d6221d7d83141d58ece6560e9cfd4cc92eab044 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 25 Nov 2020 12:19:25 +0000 Subject: btrfs: fix race that causes unnecessary logging of ancestor inodes When logging an inode and we are checking if we need to log ancestors that are new, if the previous transaction is still committing we have a time window where we can unnecessarily log ancestor inodes that were created in the previous transaction. 
The race is described by the following steps: 1) We are at transaction 1000; 2) Directory inode X is created, its generation is set to 1000; 3) The commit for transaction 1000 is started by task A; 4) The task committing transaction 1000 sets the transaction state to unblocked, writes the dirty extent buffers and the super blocks, then unlocks tree_log_mutex; 5) Inode Y, a regular file, is created under directory inode X, this results in starting a new transaction with a generation of 1001; 6) The transaction 1000 commit is unpinning extents. At this point fs_info->last_trans_committed still has a value of 999; 7) Task B calls fsync on inode Y, and gets a handle for transaction 1001; 8) Task B ends up at log_all_new_ancestors() and then because inode Y has only one hard link, ends up at log_new_ancestors_fast(). There it reads a value of 999 from fs_info->last_trans_committed, and sees that the parent inode X has a generation of 1000, so we end up logging inode X: if (inode->generation > fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, LOG_INODE_EXISTS, ctx); (...) which is not necessary since it was created in the past transaction, with a generation of 1000, and that transaction has already committed its super blocks - it's still unpinning extents so it has not yet updated fs_info->last_trans_committed from 999 to 1000. So this just causes us to spend more time logging and allocating and writing more tree blocks for the log tree. So fix this by comparing an inode's generation with the generation of the transaction our transaction handle refers to - if the inode's generation matches the generation of the current transaction then we know it is a new inode we need to log, otherwise don't log it. This case is often hit when running dbench for a long enough duration. This patch belongs to a patch set that is comprised of the following patches: btrfs: fix race causing unnecessary inode logging during link and rename btrfs: fix race that results in logging old extents during a fast fsync btrfs: fix race that causes unnecessary logging of ancestor inodes btrfs: fix race that makes inode logging fallback to transaction commit btrfs: fix race leading to unnecessary transaction commit when logging inode btrfs: do not block inode logging for so long during transaction commit Performance results are mentioned in the change log of the last patch.
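In short, the ancestor checks now compare against the id of the transaction the handle refers to; a simplified excerpt of the new condition (see the diff below):

    if (inode->generation >= trans->transid)
            ret = btrfs_log_inode(trans, root, inode,
                                  LOG_INODE_EXISTS, ctx);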
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6ea7df1beee2..f21bef62e349 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5834,7 +5834,6 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, while (true) { struct btrfs_fs_info *fs_info = root->fs_info; - const u64 last_committed = fs_info->last_trans_committed; struct extent_buffer *leaf = path->nodes[0]; int slot = path->slots[0]; struct btrfs_key search_key; @@ -5853,7 +5852,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation > last_committed) + if (BTRFS_I(inode)->generation >= trans->transid) ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); @@ -5894,7 +5893,6 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; struct dentry *old_parent = NULL; struct super_block *sb = inode->vfs_inode.i_sb; int ret = 0; @@ -5908,7 +5906,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (root != inode->root) break; - if (inode->generation > fs_info->last_trans_committed) { + if (inode->generation >= trans->transid) { ret = btrfs_log_inode(trans, root, inode, LOG_INODE_EXISTS, ctx); if (ret) -- cgit v1.2.3 From 47d3db41e190ca4a9c6e4a848052f4c5ca633db1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 25 Nov 2020 12:19:26 +0000 Subject: btrfs: fix race that makes inode logging fallback to transaction commit When logging an inode and the previous transaction is still committing, we have a time window where we can end up incorrectly think an inode has its last_unlink_trans field with a value greater than the last transaction committed, which results in the logging to fallback to a full transaction commit, which is usually much more expensive than doing a log commit. The race is described by the following steps: 1) We are at transaction 1000; 2) We modify an inode X (a directory) using transaction 1000 and set its last_unlink_trans field to 1000, because for example we removed one of its subdirectories; 3) We create a new inode Y with a dentry in inode X using transaction 1000, so its generation field is set to 1000; 4) The commit for transaction 1000 is started by task A; 5) The task committing transaction 1000 sets the transaction state to unblocked, writes the dirty extent buffers and the super blocks, then unlocks tree_log_mutex; 6) Some task starts a new transaction with a generation of 1001; 7) We do some modification to inode Y (using transaction 1001); 8) The transaction 1000 commit starts unpinning extents. At this point fs_info->last_trans_committed still has a value of 999; 9) Task B starts an fsync on inode Y, and gets a handle for transaction 1001. 
When it gets to check_parent_dirs_for_sync() it does the checking of the ancestor dentries because the following check does not evaluate to true: if (S_ISREG(inode->vfs_inode.i_mode) && inode->generation <= last_committed && inode->last_unlink_trans <= last_committed) goto out; The generation value for inode Y is 1000 and last_committed, which has the value read from fs_info->last_trans_committed, has a value of 999, so that check evaluates to false and we proceed to check the ancestor inodes. Once we get to the first ancestor, inode X, we call btrfs_must_commit_transaction() on it, which evaluates to true: static bool btrfs_must_commit_transaction(...) { struct btrfs_fs_info *fs_info = inode->root->fs_info; bool ret = false; mutex_lock(&inode->log_mutex); if (inode->last_unlink_trans > fs_info->last_trans_committed) { /* * Make sure any commits to the log are forced to be full * commits. */ btrfs_set_log_full_commit(trans); ret = true; } (...) because inode X's last_unlink_trans has a value of 1000 and fs_info->last_trans_committed still has a value of 999, it returns true to check_parent_dirs_for_sync(), making it return 1 which is propagated up to btrfs_sync_file(), causing it to fall back to a full transaction commit of transaction 1001. We should not have fallen back to commit transaction 1001, since inode X had last_unlink_trans set to 1000 and the super blocks for transaction 1000 were already written. So while not resulting in a functional problem, it leads to a lot more work and higher latencies for an fsync since committing a transaction is usually more expensive than committing a log (if other filesystem changes happened under that transaction). A similar problem happens when logging directories, for the same reason as btrfs_must_commit_transaction() returns true on an inode with its last_unlink_trans having the generation of the previous transaction and that transaction is still committing, unpinning its freed extents. So fix this by comparing last_unlink_trans with the id of the current transaction instead of fs_info->last_trans_committed. This case is often hit when running dbench for a long enough duration, as it does lots of rename and rmdir operations (both update the field last_unlink_trans of an inode) and fsyncs of files and directories. This patch belongs to a patch set that is comprised of the following patches: btrfs: fix race causing unnecessary inode logging during link and rename btrfs: fix race that results in logging old extents during a fast fsync btrfs: fix race that causes unnecessary logging of ancestor inodes btrfs: fix race that makes inode logging fallback to transaction commit btrfs: fix race leading to unnecessary transaction commit when logging inode btrfs: do not block inode logging for so long during transaction commit Performance results are mentioned in the change log of the last patch.
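For reference, the condition in btrfs_must_commit_transaction() after this change reads as follows (a simplified excerpt, see the diff below):

    if (inode->last_unlink_trans >= trans->transid) {
            /* Make sure any commits to the log are forced to be full commits. */
            btrfs_set_log_full_commit(trans);
            ret = true;
    }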
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f21bef62e349..312704b20158 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5448,11 +5448,10 @@ out_unlock: static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; bool ret = false; mutex_lock(&inode->log_mutex); - if (inode->last_unlink_trans > fs_info->last_trans_committed) { + if (inode->last_unlink_trans >= trans->transid) { /* * Make sure any commits to the log are forced to be full * commits. @@ -5474,8 +5473,7 @@ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct dentry *parent, - struct super_block *sb, - u64 last_committed) + struct super_block *sb) { int ret = 0; struct dentry *old_parent = NULL; @@ -5487,8 +5485,8 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, * and other fun in this file. */ if (S_ISREG(inode->vfs_inode.i_mode) && - inode->generation <= last_committed && - inode->last_unlink_trans <= last_committed) + inode->generation < trans->transid && + inode->last_unlink_trans < trans->transid) goto out; if (!S_ISDIR(inode->vfs_inode.i_mode)) { @@ -6023,7 +6021,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct super_block *sb; int ret = 0; - u64 last_committed = fs_info->last_trans_committed; bool log_dentries = false; sb = inode->vfs_inode.i_sb; @@ -6048,8 +6045,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - ret = check_parent_dirs_for_sync(trans, inode, parent, sb, - last_committed); + ret = check_parent_dirs_for_sync(trans, inode, parent, sb); if (ret) goto end_no_trans; @@ -6079,8 +6075,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * and other fun in this file. */ if (S_ISREG(inode->vfs_inode.i_mode) && - inode->generation <= last_committed && - inode->last_unlink_trans <= last_committed) { + inode->generation < trans->transid && + inode->last_unlink_trans < trans->transid) { ret = 0; goto end_trans; } @@ -6129,7 +6125,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * but the file inode does not have a matching BTRFS_INODE_REF_KEY item * and has a link count of 2. */ - if (inode->last_unlink_trans > last_committed) { + if (inode->last_unlink_trans >= trans->transid) { ret = btrfs_log_all_parents(trans, inode, ctx); if (ret) goto end_trans; -- cgit v1.2.3 From 639bd575b7c7fa326abadd2ef3e374a5a24eb40b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 25 Nov 2020 12:19:27 +0000 Subject: btrfs: fix race leading to unnecessary transaction commit when logging inode When logging an inode we may often have to fallback to a full transaction commit, either because a new block group was allocated, there is some case we can not deal with without a transaction commit or some error like an ENOMEM happened. However after we fallback to a transaction commit, we have a time window where we can make the next attempt to log any inode commit the next transaction unnecessarily, adding additional overhead and increasing latency. 
A sequence of steps that leads to this issue is the following: 1) The current open transaction has a generation of 1000; 2) A new block group is allocated, and as a consequence we must make sure any attempts to commit a log fallback to a transaction commit, so btrfs_set_log_full_commit() is called from btrfs_make_block_group(). This sets fs_info->last_trans_log_full_commit to 1000; 3) Task A is holding a handle on transaction 1000 and tries to log inode X. Once it gets to start_log_trans(), it calls btrfs_need_log_full_commit() which returns true, since fs_info->last_trans_log_full_commit has a value of 1000. So we end up returning EAGAIN and propagating it up to btrfs_sync_file(), where we commit transaction 1000; 4) The transaction commit task (task A) sets the transaction state to unblocked (TRANS_STATE_UNBLOCKED); 5) Some other task, task B, starts a new transaction with a generation of 1001; 6) Some stuff is done with transaction 1001, some btree blocks COWed, etc; 7) Transaction 1000 has not fully committed yet, we are still writing all the extent buffers it created; 8) Some new task, task C, starts an fsync of inode Y, gets a handle for transaction 1001, and it gets to btrfs_log_inode_parent() which does the following check: if (fs_info->last_trans_log_full_commit > last_committed) { ret = 1; goto end_no_trans; } At that point last_trans_log_full_commit has a value of 1000 and last_committed (value of fs_info->last_trans_committed) has a value of 999, since transaction 1000 has not yet committed - it is either still writing out dirty extent buffers, its super blocks or unpinning extents. As a consequence we return 1, which gets propagated up to btrfs_sync_file(), which will then call btrfs_commit_transaction() for transaction 1001. As a consequence we have an unnecessary second transaction commit, we previously committed transaction 1000 and now commit transaction 1001 as well, resulting in more overhead and increased latency. So fix this double transaction commit issue simply by removing that check, because all we need to do is wait for the previous transaction to finish its commit, which we already do later when starting the log transaction at start_log_trans(), because there we acquire the tree_log_mutex lock, which is held by a transaction commit and only released after the transaction commits its super blocks. Another issue that check has is that it reads last_trans_log_full_commit without using READ_ONCE(), which is incorrect since that member of struct btrfs_fs_info is always updated with WRITE_ONCE() through the helper btrfs_set_log_full_commit(). This double transaction commit issue can actually be triggered quite often in long runs of dbench, since besides the creation of new block groups that force inode logging to fallback to a transaction commit, there are cases where dbench asks to fsync a directory which had files in it that were previously renamed or subdirectories that were removed, resulting in the inode logging to fallback to a full transaction commit. 
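For clarity, this is the block removed from btrfs_log_inode_parent() (quoted in simplified form, see the diff below); waiting for the previous transaction commit is already guaranteed by the tree_log_mutex acquired in start_log_trans():

    /* Removed: the wait is already done when the log transaction starts. */
    if (fs_info->last_trans_log_full_commit > fs_info->last_trans_committed) {
            ret = 1;
            goto end_no_trans;
    }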
This patch belongs to a patch set that is comprised of the following patches: btrfs: fix race causing unnecessary inode logging during link and rename btrfs: fix race that results in logging old extents during a fast fsync btrfs: fix race that causes unnecessary logging of ancestor inodes btrfs: fix race that makes inode logging fallback to transaction commit btrfs: fix race leading to unnecessary transaction commit when logging inode btrfs: do not block inode logging for so long during transaction commit Performance results are mentioned in the change log of the last patch. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 312704b20158..582911a36ee0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6030,16 +6030,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - /* - * The prev transaction commit doesn't complete, we need do - * full commit by ourselves. - */ - if (fs_info->last_trans_log_full_commit > - fs_info->last_trans_committed) { - ret = 1; - goto end_no_trans; - } - if (btrfs_root_refs(&root->root_item) == 0) { ret = 1; goto end_no_trans; -- cgit v1.2.3 From 47876f7ceffa0e6af7476e052b3c061f1f2c1d9f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 25 Nov 2020 12:19:28 +0000 Subject: btrfs: do not block inode logging for so long during transaction commit Early on during a transaction commit we acquire the tree_log_mutex and hold it until after we write the super blocks. But before writing the extent buffers dirtied by the transaction and the super blocks we unblock the transaction by setting its state to TRANS_STATE_UNBLOCKED and setting fs_info->running_transaction to NULL. This means that after that and before writing the super blocks, new transactions can start. However if any transaction wants to log an inode, it will block waiting for the transaction commit to write its dirty extent buffers and the super blocks because the tree_log_mutex is only released after those operations are complete, and starting a new log transaction blocks on that mutex (at start_log_trans()). Writing the dirty extent buffers and the super blocks can take a very significant amount of time to complete, but we could allow the tasks wanting to log an inode to proceed with most of their steps: 1) create the log trees 2) log metadata in the trees 3) write their dirty extent buffers They only need to wait for the previous transaction commit to complete (write its super blocks) before they attempt to write their super blocks, otherwise we could end up with a corrupt filesystem after a crash. So change start_log_trans() to use the root tree's log_mutex to serialize for the creation of the log root tree instead of using the tree_log_mutex, and make btrfs_sync_log() acquire the tree_log_mutex before writing the super blocks. This allows for inode logging to wait much less time when there is a previous transaction that is still committing, often not having to wait at all, as by the time when we try to sync the log the previous transaction already wrote its super blocks. 
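The resulting ordering in btrfs_sync_log() can be summarized as follows (a simplified excerpt mirroring the diff below, not the complete function):

    /*
     * Serialize with a possibly still-running previous transaction commit
     * only at the point where the super blocks are written.
     */
    mutex_lock(&fs_info->tree_log_mutex);
    btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
    btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
    ret = write_all_supers(fs_info, 1);
    mutex_unlock(&fs_info->tree_log_mutex);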
This patch belongs to a patch set that is comprised of the following patches: btrfs: fix race causing unnecessary inode logging during link and rename btrfs: fix race that results in logging old extents during a fast fsync btrfs: fix race that causes unnecessary logging of ancestor inodes btrfs: fix race that makes inode logging fallback to transaction commit btrfs: fix race leading to unnecessary transaction commit when logging inode btrfs: do not block inode logging for so long during transaction commit The following script that uses dbench was used to measure the impact of the whole patchset: $ cat test-dbench.sh #!/bin/bash DEV=/dev/nvme0n1 MNT=/mnt/btrfs MOUNT_OPTIONS="-o ssd" echo "performance" | \ tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor mkfs.btrfs -f -m single -d single $DEV mount $MOUNT_OPTIONS $DEV $MNT dbench -D $MNT -t 300 64 umount $MNT The test was run on a machine with 12 cores, 64G of ram, using a NVMe device and a non-debug kernel configuration (Debian's default). Before patch set: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 11277211 0.250 85.340 Close 8283172 0.002 6.479 Rename 477515 1.935 86.026 Unlink 2277936 0.770 87.071 Deltree 256 15.732 81.379 Mkdir 128 0.003 0.009 Qpathinfo 10221180 0.056 44.404 Qfileinfo 1789967 0.002 4.066 Qfsinfo 1874399 0.003 9.176 Sfileinfo 918589 0.061 10.247 Find 3951758 0.341 54.040 WriteX 5616547 0.047 85.079 ReadX 17676028 0.005 9.704 LockX 36704 0.003 1.800 UnlockX 36704 0.002 0.687 Flush 790541 14.115 676.236 Throughput 1179.19 MB/sec 64 clients 64 procs max_latency=676.240 ms After patch set: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 12687926 0.171 86.526 Close 9320780 0.002 8.063 Rename 537253 1.444 78.576 Unlink 2561827 0.559 87.228 Deltree 374 11.499 73.549 Mkdir 187 0.003 0.005 Qpathinfo 11500300 0.061 36.801 Qfileinfo 2017118 0.002 7.189 Qfsinfo 2108641 0.003 4.825 Sfileinfo 1033574 0.008 8.065 Find 4446553 0.408 47.835 WriteX 6335667 0.045 84.388 ReadX 19887312 0.003 9.215 LockX 41312 0.003 1.394 UnlockX 41312 0.002 1.425 Flush 889233 13.014 623.259 Throughput 1339.32 MB/sec 64 clients 64 procs max_latency=623.265 ms +12.7% throughput, -8.2% max latency Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/tree-log.c | 56 +++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 40 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a0c588e5fc89..331554fd6fd1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1038,7 +1038,7 @@ enum { BTRFS_ROOT_DEAD_RELOC_TREE, /* Mark dead root stored on device whose cleanup needs to be resumed */ BTRFS_ROOT_DEAD_TREE, - /* The root has a log tree. Used only for subvolume roots. */ + /* The root has a log tree. Used for subvolume roots and the tree root. */ BTRFS_ROOT_HAS_LOG_TREE, /* Qgroup flushing is in progress */ BTRFS_ROOT_QGROUP_FLUSHING, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 582911a36ee0..254c2ee43aae 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -138,8 +138,25 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *tree_root = fs_info->tree_root; int ret = 0; + /* + * First check if the log root tree was already created. If not, create + * it before locking the root's log_mutex, just to keep lockdep happy. 
+ */ + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) { + mutex_lock(&tree_root->log_mutex); + if (!fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, fs_info); + if (!ret) + set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); + } + mutex_unlock(&tree_root->log_mutex); + if (ret) + return ret; + } + mutex_lock(&root->log_mutex); if (root->log_root) { @@ -155,13 +172,6 @@ static int start_log_trans(struct btrfs_trans_handle *trans, set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } } else { - mutex_lock(&fs_info->tree_log_mutex); - if (!fs_info->log_root_tree) - ret = btrfs_init_log_root_tree(trans, fs_info); - mutex_unlock(&fs_info->tree_log_mutex); - if (ret) - goto out; - ret = btrfs_add_log_tree(trans, root); if (ret) goto out; @@ -3021,6 +3031,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int log_transid = 0; struct btrfs_log_ctx root_log_ctx; struct blk_plug plug; + u64 log_root_start; + u64 log_root_level; mutex_lock(&root->log_mutex); log_transid = ctx->log_transid; @@ -3198,22 +3210,31 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } - btrfs_set_super_log_root(fs_info->super_for_commit, - log_root_tree->node->start); - btrfs_set_super_log_root_level(fs_info->super_for_commit, - btrfs_header_level(log_root_tree->node)); - + log_root_start = log_root_tree->node->start; + log_root_level = btrfs_header_level(log_root_tree->node); log_root_tree->log_transid++; mutex_unlock(&log_root_tree->log_mutex); /* - * Nobody else is going to jump in and write the ctree - * super here because the log_commit atomic below is protecting - * us. We must be called with a transaction handle pinning - * the running transaction open, so a full commit can't hop - * in and cause problems either. + * Here we are guaranteed that nobody is going to write the superblock + * for the current transaction before us and that neither we do write + * our superblock before the previous transaction finishes its commit + * and writes its superblock, because: + * + * 1) We are holding a handle on the current transaction, so no body + * can commit it until we release the handle; + * + * 2) Before writing our superblock we acquire the tree_log_mutex, so + * if the previous transaction is still committing, and hasn't yet + * written its superblock, we wait for it to do it, because a + * transaction commit acquires the tree_log_mutex when the commit + * begins and releases it only after writing its superblock. */ + mutex_lock(&fs_info->tree_log_mutex); + btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); + btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); ret = write_all_supers(fs_info, 1); + mutex_unlock(&fs_info->tree_log_mutex); if (ret) { btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); @@ -3298,6 +3319,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, if (fs_info->log_root_tree) { free_log_tree(trans, fs_info->log_root_tree); fs_info->log_root_tree = NULL; + clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); } return 0; } -- cgit v1.2.3 From 44c0ca211a4da92513fffc545b5374b45b0c4fc5 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 18 Nov 2020 15:06:16 -0800 Subject: btrfs: lift read-write mount setup from mount and remount Mounting rw and remounting from ro to rw naturally share invariants and functionality which result in a correctly setup rw filesystem. Luckily, there is even a strong unity in the code which implements them. 
In mount's open_ctree, these operations mostly happen after an early return for ro file systems, and in remount, they happen in a section devoted to remounting ro->rw, after some remount specific validation passes. However, there are unfortunately a few differences. There are small deviations in the order of some of the operations, remount does not start orphan cleanup in root_tree or fs_tree, remount does not create the free space tree, and remount does not handle "one-shot" mount options like clear_cache and uuid tree rescan. Since we want to add building the free space tree to remount, and also to start the same orphan cleanup process on a filesystem mounted as ro then remounted rw, we would benefit from unifying the logic between the two code paths. This patch only lifts the existing common functionality, and leaves a natural path for fixing the discrepancies. Reviewed-by: Josef Bacik Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 89 ++++++++++++++++++++++++++++++------------------------ fs/btrfs/disk-io.h | 1 + fs/btrfs/super.c | 37 ++++------------------- 3 files changed, 57 insertions(+), 70 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2d8bcd075c77..c7886584ba7d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2870,6 +2870,52 @@ static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) return 0; } +/* + * Mounting logic specific to read-write file systems. Shared by open_ctree + * and btrfs_remount when remounting from read-only to read-write. + */ +int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) +{ + int ret; + + ret = btrfs_cleanup_fs_roots(fs_info); + if (ret) + goto out; + + mutex_lock(&fs_info->cleaner_mutex); + ret = btrfs_recover_relocation(fs_info->tree_root); + mutex_unlock(&fs_info->cleaner_mutex); + if (ret < 0) { + btrfs_warn(fs_info, "failed to recover relocation: %d", ret); + goto out; + } + + ret = btrfs_resume_balance_async(fs_info); + if (ret) + goto out; + + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + btrfs_warn(fs_info, "failed to resume dev_replace"); + goto out; + } + + btrfs_qgroup_rescan_resume(fs_info); + + if (!fs_info->uuid_root) { + btrfs_info(fs_info, "creating UUID tree"); + ret = btrfs_create_uuid_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to create the UUID tree %d", ret); + goto out; + } + } + +out: + return ret; +} + int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) { @@ -3285,22 +3331,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (ret) goto fail_qgroup; - if (!sb_rdonly(sb)) { - ret = btrfs_cleanup_fs_roots(fs_info); - if (ret) - goto fail_qgroup; - - mutex_lock(&fs_info->cleaner_mutex); - ret = btrfs_recover_relocation(tree_root); - mutex_unlock(&fs_info->cleaner_mutex); - if (ret < 0) { - btrfs_warn(fs_info, "failed to recover relocation: %d", - ret); - err = -EINVAL; - goto fail_qgroup; - } - } - fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true); if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); @@ -3353,35 +3383,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } up_read(&fs_info->cleanup_work_sem); - ret = btrfs_resume_balance_async(fs_info); - if (ret) { - btrfs_warn(fs_info, "failed to resume balance: %d", ret); - close_ctree(fs_info); - return ret; - } - - ret = btrfs_resume_dev_replace_async(fs_info); 
+ ret = btrfs_start_pre_rw_mount(fs_info); if (ret) { - btrfs_warn(fs_info, "failed to resume device replace: %d", ret); close_ctree(fs_info); return ret; } - - btrfs_qgroup_rescan_resume(fs_info); btrfs_discard_resume(fs_info); - if (!fs_info->uuid_root) { - btrfs_info(fs_info, "creating UUID tree"); - ret = btrfs_create_uuid_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to create the UUID tree: %d", ret); - close_ctree(fs_info); - return ret; - } - } else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) || - fs_info->generation != - btrfs_super_uuid_tree_generation(disk_super)) { + if (fs_info->uuid_root && + (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) || + fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) { btrfs_info(fs_info, "checking UUID tree"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 984410144a97..b509d1a490e7 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -50,6 +50,7 @@ struct extent_buffer *btrfs_find_create_tree_block( u64 bytenr, u64 owner_root, int level); void btrfs_clean_tree_block(struct extent_buffer *buf); +int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 455d924b8d62..49b4a16488c9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1885,7 +1885,6 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root = fs_info->tree_root; unsigned old_flags = sb->s_flags; unsigned long old_opts = fs_info->mount_opt; unsigned long old_compress_type = fs_info->compress_type; @@ -1978,39 +1977,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } - ret = btrfs_cleanup_fs_roots(fs_info); - if (ret) - goto restore; - - /* recover relocation */ - mutex_lock(&fs_info->cleaner_mutex); - ret = btrfs_recover_relocation(root); - mutex_unlock(&fs_info->cleaner_mutex); - if (ret) - goto restore; - - ret = btrfs_resume_balance_async(fs_info); + /* + * NOTE: when remounting with a change that does writes, don't + * put it anywhere above this point, as we are not sure to be + * safe to write until we pass the above checks. + */ + ret = btrfs_start_pre_rw_mount(fs_info); if (ret) goto restore; - ret = btrfs_resume_dev_replace_async(fs_info); - if (ret) { - btrfs_warn(fs_info, "failed to resume dev_replace"); - goto restore; - } - - btrfs_qgroup_rescan_resume(fs_info); - - if (!fs_info->uuid_root) { - btrfs_info(fs_info, "creating UUID tree"); - ret = btrfs_create_uuid_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to create the UUID tree %d", - ret); - goto restore; - } - } sb->s_flags &= ~SB_RDONLY; set_bit(BTRFS_FS_OPEN, &fs_info->flags); -- cgit v1.2.3 From 8f1c21d7490fc1ac5ef364b7085987ca439fb32f Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 18 Nov 2020 15:06:17 -0800 Subject: btrfs: start orphan cleanup on ro->rw remount When we mount a rw filesystem, we start the orphan cleanup process in tree root and filesystem tree. However, when we remount a ro file system rw, we only clean the former. Move the calls to btrfs_orphan_cleanup() on tree_root and fs_root to the shared rw mount routine to effectively add them on ro->rw remount. 
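An illustrative way to exercise the path being fixed (device and mount point are made up):

  # a fresh rw mount cleans orphans in both the tree root and the fs tree;
  # with this change the ro->rw remount below now does the same
  mount -o ro /dev/sdX /mnt
  mount -o remount,rw /mnt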
Reviewed-by: Josef Bacik Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c7886584ba7d..0d076437b72e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2882,6 +2882,14 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) if (ret) goto out; + down_read(&fs_info->cleanup_work_sem); + if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || + (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { + up_read(&fs_info->cleanup_work_sem); + goto out; + } + up_read(&fs_info->cleanup_work_sem); + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(fs_info->tree_root); mutex_unlock(&fs_info->cleaner_mutex); @@ -3374,15 +3382,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } } - down_read(&fs_info->cleanup_work_sem); - if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || - (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { - up_read(&fs_info->cleanup_work_sem); - close_ctree(fs_info); - return ret; - } - up_read(&fs_info->cleanup_work_sem); - ret = btrfs_start_pre_rw_mount(fs_info); if (ret) { close_ctree(fs_info); -- cgit v1.2.3 From 997e3e2e71b32b31bfab6b299d9db05af285b457 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 18 Nov 2020 15:06:18 -0800 Subject: btrfs: only mark bg->needs_free_space if free space tree is on If we attempt to create a free space tree while any block groups have needs_free_space set, we will double add the new free space item and hit EEXIST. Previously, we only created the free space tree on a new mount, so we never hit the case, but if we try to create it on a remount, such block groups could exist and trip us up. We don't do anything with this field unless the free space tree is enabled, so there is no harm in not setting it. Reviewed-by: Josef Bacik Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9c3523b6e016..614b6b5b3102 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2165,7 +2165,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - cache->needs_free_space = 1; + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + cache->needs_free_space = 1; ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ -- cgit v1.2.3 From 5011139a4718455a6cd6214fd84e6f8500fd3874 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 18 Nov 2020 15:06:19 -0800 Subject: btrfs: create free space tree on ro->rw remount When a user attempts to remount a btrfs filesystem with 'mount -o remount,space_cache=v2', that operation silently succeeds. Unfortunately, this is misleading, because the remount does not create the free space tree. /proc/mounts will incorrectly show space_cache=v2, but on the next mount, the file system will revert to the old space_cache. For now, we handle only the easier case, where the existing mount is read-only and the new mount is read-write. In that case, we can create the free space tree without contending with the block groups changing as we go. 
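With this change a sequence like the following (paths illustrative) actually builds the tree instead of only changing what /proc/mounts reports:

  mount -o ro /dev/sdX /mnt
  mount -o remount,rw,space_cache=v2 /mnt
  # the free space tree is created during the ro->rw transition, so the
  # setting persists across later mounts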
Reviewed-by: Josef Bacik Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0d076437b72e..a3e0065bf635 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2898,6 +2898,17 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) goto out; } + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + btrfs_info(fs_info, "creating free space tree"); + ret = btrfs_create_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to create free space tree: %d", ret); + goto out; + } + } + ret = btrfs_resume_balance_async(fs_info); if (ret) goto out; @@ -3370,18 +3381,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } } - if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) && - !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - btrfs_info(fs_info, "creating free space tree"); - ret = btrfs_create_free_space_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to create free space tree: %d", ret); - close_ctree(fs_info); - return ret; - } - } - ret = btrfs_start_pre_rw_mount(fs_info); if (ret) { close_ctree(fs_info); -- cgit v1.2.3 From 8cd2908846d11af9b33246171f71a923d35eb3c4 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 18 Nov 2020 15:06:20 -0800 Subject: btrfs: clear oneshot options on mount and remount Some options only apply during mount time and are cleared at the end of mount. For now, the example is USEBACKUPROOT, but CLEAR_CACHE also fits the bill, and this is a preparation patch for also clearing that option. One subtlety is that the current code only resets USEBACKUPROOT on rw mounts, but the option is meaningfully "consumed" by a ro mount, so it feels appropriate to clear in that case as well. A subsequent read-write remount would not go through open_ctree, which is the only place that checks the option, so the change should be benign. Reviewed-by: Josef Bacik Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 +++++++++++++- fs/btrfs/disk-io.h | 1 + fs/btrfs/super.c | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a3e0065bf635..292ea49f5ae3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2870,6 +2870,16 @@ static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) return 0; } +/* + * Some options only have meaning at mount time and shouldn't persist across + * remounts, or be displayed. Clear these at the end of mount and remount + * code paths. + */ +void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) +{ + btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); +} + /* * Mounting logic specific to read-write file systems. Shared by open_ctree * and btrfs_remount when remounting from read-only to read-write. 
@@ -3359,7 +3369,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } if (sb_rdonly(sb)) - return 0; + goto clear_oneshot; if (btrfs_test_opt(fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { @@ -3402,6 +3412,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } set_bit(BTRFS_FS_OPEN, &fs_info->flags); +clear_oneshot: + btrfs_clear_oneshot_options(fs_info); return 0; fail_qgroup: diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index b509d1a490e7..7b3ecad88d7e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -50,6 +50,7 @@ struct extent_buffer *btrfs_find_create_tree_block( u64 bytenr, u64 owner_root, int level); void btrfs_clean_tree_block(struct extent_buffer *buf); +void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 49b4a16488c9..93514555453b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1999,6 +1999,7 @@ out: wake_up_process(fs_info->transaction_kthread); btrfs_remount_cleanup(fs_info, old_opts); + btrfs_clear_oneshot_options(fs_info); clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); return 0; -- cgit v1.2.3 From 8b228324a8ce03083a034dfa784bc10696ce7489 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 18 Nov 2020 15:06:21 -0800 Subject: btrfs: clear free space tree on ro->rw remount A user might want to revert to v1 or nospace_cache on a root filesystem, and much like turning on the free space tree, that can only be done remounting from ro->rw. Support clearing the free space tree on such mounts by moving it into the shared remount logic. Since the CLEAR_CACHE option sticks around across remounts, this change would result in clearing the tree for ever on every remount, which is not desirable. To fix that, add CLEAR_CACHE to the oneshot options we clear at mount end, which has the other bonus of not cluttering the /proc/mounts output with clear_cache. 
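An illustrative use of the reverse direction (paths made up):

  # revert a filesystem that cannot be unmounted, e.g. a root filesystem,
  # from the free space tree back to the v1 cache
  mount -o ro /dev/sdX /mnt
  mount -o remount,rw,clear_cache,space_cache=v1 /mnt
  # clear_cache is consumed as a oneshot option, so later remounts neither
  # clear the tree again nor show clear_cache in /proc/mounts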
Reviewed-by: Josef Bacik Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 292ea49f5ae3..77c7013f69d7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2878,6 +2878,7 @@ static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) { btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); + btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); } /* @@ -2887,6 +2888,26 @@ void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) { int ret; + bool clear_free_space_tree = false; + + if (btrfs_test_opt(fs_info, CLEAR_CACHE) && + btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + clear_free_space_tree = true; + } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { + btrfs_warn(fs_info, "free space tree is invalid"); + clear_free_space_tree = true; + } + + if (clear_free_space_tree) { + btrfs_info(fs_info, "clearing free space tree"); + ret = btrfs_clear_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to clear free space tree: %d", ret); + goto out; + } + } ret = btrfs_cleanup_fs_roots(fs_info); if (ret) @@ -2960,7 +2981,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device struct btrfs_root *chunk_root; int ret; int err = -EINVAL; - int clear_free_space_tree = 0; int level; ret = init_mount_fs_info(fs_info, sb); @@ -3371,26 +3391,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (sb_rdonly(sb)) goto clear_oneshot; - if (btrfs_test_opt(fs_info, CLEAR_CACHE) && - btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - clear_free_space_tree = 1; - } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && - !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { - btrfs_warn(fs_info, "free space tree is invalid"); - clear_free_space_tree = 1; - } - - if (clear_free_space_tree) { - btrfs_info(fs_info, "clearing free space tree"); - ret = btrfs_clear_free_space_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to clear free space tree: %d", ret); - close_ctree(fs_info); - return ret; - } - } - ret = btrfs_start_pre_rw_mount(fs_info); if (ret) { close_ctree(fs_info); -- cgit v1.2.3 From 948462294577a3870c407c16d89bb2314f0b0cfb Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 18 Nov 2020 15:06:22 -0800 Subject: btrfs: keep sb cache_generation consistent with space_cache When mounting, btrfs uses the cache_generation in the super block to determine if space cache v1 is in use. However, by mounting with nospace_cache or space_cache=v2, it is possible to disable space cache v1, which does not result in un-setting cache_generation back to 0. In order to base some logic, like mount option printing in /proc/mounts, on the current state of the space cache rather than just the values of the mount option, keep the value of cache_generation consistent with the status of space cache v1. We ensure that cache_generation > 0 iff the file system is using space_cache v1. This requires committing a transaction on any mount which changes whether we are using v1. (v1->nospace_cache, v1->v2, nospace_cache->v1, v2->v1). 
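Stated as a condition, using the helper names this patch introduces (a simplified sketch, not an extra hunk):

  /* invariant: super_copy->cache_generation > 0  <=>  space cache v1 in use */
  bool v1_wanted = btrfs_test_opt(fs_info, SPACE_CACHE);
  bool v1_active = btrfs_super_cache_generation(fs_info->super_copy) != 0;

  if (v1_wanted != v1_active)
          btrfs_set_free_space_cache_v1_active(fs_info, v1_wanted); /* commits */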
Since the mechanism for writing out the cache generation is transaction commit, but we want some finer grained control over when we un-set it, we can't just rely on the SPACE_CACHE mount option, and introduce an fs_info flag that mount can use when it wants to unset the generation. Reviewed-by: Josef Bacik Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/disk-io.c | 8 ++++++++ fs/btrfs/free-space-cache.c | 28 ++++++++++++++++++++++++++++ fs/btrfs/free-space-cache.h | 2 ++ fs/btrfs/super.c | 10 +++++++--- fs/btrfs/transaction.c | 2 ++ 6 files changed, 50 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 331554fd6fd1..2744e13e8eb9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -559,6 +559,9 @@ enum { /* Indicate that the discard workqueue can service discards. */ BTRFS_FS_DISCARD_RUNNING, + + /* Indicate that we need to cleanup space cache v1 */ + BTRFS_FS_CLEANUP_SPACE_CACHE_V1, }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 77c7013f69d7..2dcc5e52140c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2888,6 +2888,7 @@ void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) { int ret; + const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); bool clear_free_space_tree = false; if (btrfs_test_opt(fs_info, CLEAR_CACHE) && @@ -2940,6 +2941,12 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) } } + if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) { + ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); + if (ret) + goto out; + } + ret = btrfs_resume_balance_async(fs_info); if (ret) goto out; @@ -3410,6 +3417,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device return ret; } } + set_bit(BTRFS_FS_OPEN, &fs_info->flags); clear_oneshot: diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index fe4b2176a6e8..e71def319655 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3763,6 +3763,34 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, return ret; } +bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info) +{ + return btrfs_super_cache_generation(fs_info->super_copy); +} + +int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* + * update_super_roots will appropriately set + * super_copy->cache_generation based on the SPACE_CACHE option, so all + * we have to do is trigger a transaction commit. 
+ */ + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + if (!active) + set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); + + ret = btrfs_commit_transaction(trans); + clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); + + return ret; +} + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * Use this if you need to make a bitmap or extent entry specifically, it diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 91d495e31a08..3eff615c30b8 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -135,6 +135,8 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen, u64 maxlen, bool async); +bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info); +int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active); /* Support functions for running our sanity tests */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int test_add_free_space_entry(struct btrfs_block_group *cache, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 93514555453b..b2252184305c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -552,7 +552,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, { substring_t args[MAX_OPT_ARGS]; char *p, *num; - u64 cache_gen; int intarg; int ret = 0; char *compress_type; @@ -562,10 +561,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, bool saved_compress_force; int no_compress = 0; - cache_gen = btrfs_super_cache_generation(info->super_copy); if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); - else if (cache_gen) { + else if (btrfs_free_space_cache_v1_active(info)) { if (btrfs_is_zoned(info)) { btrfs_info(info, "zoned: clearing existing space cache"); @@ -1864,6 +1862,8 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, unsigned long old_opts) { + const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); + /* * We need to cleanup all defragable inodes if the autodefragment is * close or the filesystem is read only. 
@@ -1880,6 +1880,10 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && !btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_discard_cleanup(fs_info); + + /* If we toggled space cache */ + if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) + btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); } static int btrfs_remount(struct super_block *sb, int *flags, char *data) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6b7156712486..8e0f7a1029c6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1796,6 +1796,8 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->root_level = root_item->level; if (btrfs_test_opt(fs_info, SPACE_CACHE)) super->cache_generation = root_item->generation; + else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags)) + super->cache_generation = 0; if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) super->uuid_tree_generation = root_item->generation; } -- cgit v1.2.3 From 04c415596953ec90fdae1ad388fdc8151d5dfdc1 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 18 Nov 2020 15:06:23 -0800 Subject: btrfs: use superblock state to print space_cache mount option To make the contents of /proc/mounts better match the actual state of the filesystem, base the display of the space cache mount options off the contents of the super block rather than the last mount options passed in. Since there are many scenarios where the mount will ignore a space cache option, simply showing the passed in option is misleading. For example, if we mount with -o remount,space_cache=v2 on a read-write file system without an existing free space tree, we won't build a free space tree, but /proc/mounts will read space_cache=v2 (until we mount again and it goes away) cache_generation is set iff space_cache=v1, FREE_SPACE_TREE is set iff space_cache=v2, and if neither is the case, we print nospace_cache. Reviewed-by: Josef Bacik Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b2252184305c..bd3ce2b43838 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1483,9 +1483,9 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",discard=async"); if (!(info->sb->s_flags & SB_POSIXACL)) seq_puts(seq, ",noacl"); - if (btrfs_test_opt(info, SPACE_CACHE)) + if (btrfs_free_space_cache_v1_active(info)) seq_puts(seq, ",space_cache"); - else if (btrfs_test_opt(info, FREE_SPACE_TREE)) + else if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) seq_puts(seq, ",space_cache=v2"); else seq_puts(seq, ",nospace_cache"); -- cgit v1.2.3 From 2838d255cb9b85a845efc3bbd3f6fc66ed883d35 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 18 Nov 2020 15:06:24 -0800 Subject: btrfs: warn when remount will not change the free space tree If the remount is ro->ro, rw->ro, or rw->rw, we will not create or clear the free space tree. This can be surprising, so print a warning to dmesg to make the failure more visible. It is also important to ensure that the space cache options (SPACE_CACHE, FREE_SPACE_TREE) are consistent, so ensure those are set to properly match the current on disk state (which won't be changing). 
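For example (paths made up), a rw->rw remount that asks for the v2 cache is now flagged instead of silently ignored:

  mount /dev/sdX /mnt
  mount -o remount,rw,space_cache=v2 /mnt
  # dmesg: "remount supports changing free space tree only from ro to rw",
  # and the space cache mount options are reset to match the on-disk state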
Reviewed-by: Josef Bacik Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index bd3ce2b43838..022f20810089 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1919,6 +1919,22 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) != + btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + (!sb_rdonly(sb) || (*flags & SB_RDONLY))) { + btrfs_warn(fs_info, + "remount supports changing free space tree only from ro to rw"); + /* Make sure free space cache options match the state on disk */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + } + if (btrfs_free_space_cache_v1_active(fs_info)) { + btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); + } + } + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out; -- cgit v1.2.3 From 36b216c85eb9d7f59ac1cb8b117376e20acc6cbc Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 18 Nov 2020 15:06:25 -0800 Subject: btrfs: remove free space items when disabling space cache v1 When the filesystem transitions from space cache v1 to v2 or to nospace_cache, it removes the old cached data, but does not remove the FREE_SPACE items nor the free space inodes they point to. This doesn't cause any issues besides being a bit inefficient, since these items no longer do anything useful. To fix it, when we are mounting, and plan to disable the space cache, destroy each block group's free space item and free space inode. The code to remove the items is lifted from the existing use case of removing the block group, with a light adaptation to handle whether or not we have already looked up the free space inode. 
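The calling convention of the lifted helper, illustrated from its two users in the diff below (not an additional hunk):

  /* block group removal: the free space inode was already looked up and
     ownership of that reference is handed to the helper */
  ret = btrfs_remove_free_space_inode(trans, inode, block_group);

  /* space cache cleanup on mount: pass NULL, the helper does the lookup
     itself and tolerates a missing inode */
  ret = btrfs_remove_free_space_inode(trans, NULL, block_group);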
Reviewed-by: Josef Bacik Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 39 +----------------- fs/btrfs/free-space-cache.c | 99 +++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/free-space-cache.h | 3 ++ 3 files changed, 100 insertions(+), 41 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 614b6b5b3102..fbc74bb41322 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -848,8 +848,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_block_group *block_group; struct btrfs_free_cluster *cluster; - struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_key key; struct inode *inode; struct kobject *kobj = NULL; int ret; @@ -927,42 +925,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&trans->transaction->dirty_bgs_lock); mutex_unlock(&trans->transaction->cache_write_mutex); - if (!IS_ERR(inode)) { - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (ret) { - btrfs_add_delayed_iput(inode); - goto out; - } - clear_nlink(inode); - /* One for the block groups ref */ - spin_lock(&block_group->lock); - if (block_group->iref) { - block_group->iref = 0; - block_group->inode = NULL; - spin_unlock(&block_group->lock); - iput(inode); - } else { - spin_unlock(&block_group->lock); - } - /* One for our lookup ref */ - btrfs_add_delayed_iput(inode); - } - - key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.type = 0; - key.offset = block_group->start; - - ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - if (ret < 0) + ret = btrfs_remove_free_space_inode(trans, inode, block_group); + if (ret) goto out; - if (ret > 0) - btrfs_release_path(path); - if (ret == 0) { - ret = btrfs_del_item(trans, tree_root, path); - if (ret) - goto out; - btrfs_release_path(path); - } spin_lock(&fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index e71def319655..4d8897879c9c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -206,6 +206,65 @@ int create_free_space_inode(struct btrfs_trans_handle *trans, ino, block_group->start); } +/* + * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode + * handles lookup, otherwise it takes ownership and iputs the inode. + * Don't reuse an inode pointer after passing it into this function. 
+ */ +int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_block_group *block_group) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (!inode) + inode = lookup_free_space_inode(block_group, path); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) != -ENOENT) + ret = PTR_ERR(inode); + goto out; + } + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) { + btrfs_add_delayed_iput(inode); + goto out; + } + clear_nlink(inode); + /* One for the block groups ref */ + spin_lock(&block_group->lock); + if (block_group->iref) { + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + /* One for the lookup ref */ + btrfs_add_delayed_iput(inode); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.type = 0; + key.offset = block_group->start; + ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path, + -1, 1); + if (ret) { + if (ret > 0) + ret = 0; + goto out; + } + ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); +out: + btrfs_free_path(path); + return ret; +} + int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv) { @@ -3768,24 +3827,56 @@ bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info) return btrfs_super_cache_generation(fs_info->super_copy); } +static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) +{ + struct btrfs_block_group *block_group; + struct rb_node *node; + int ret; + + btrfs_info(fs_info, "cleaning free space cache v1"); + + node = rb_first(&fs_info->block_group_cache_tree); + while (node) { + block_group = rb_entry(node, struct btrfs_block_group, cache_node); + ret = btrfs_remove_free_space_inode(trans, NULL, block_group); + if (ret) + goto out; + node = rb_next(node); + } +out: + return ret; +} + int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) { struct btrfs_trans_handle *trans; int ret; /* - * update_super_roots will appropriately set - * super_copy->cache_generation based on the SPACE_CACHE option, so all - * we have to do is trigger a transaction commit. + * update_super_roots will appropriately set or unset + * super_copy->cache_generation based on SPACE_CACHE and + * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a + * transaction commit whether we are enabling space cache v1 and don't + * have any other work to do, or are disabling it and removing free + * space inodes. 
*/ trans = btrfs_start_transaction(fs_info->tree_root, 0); if (IS_ERR(trans)) return PTR_ERR(trans); - if (!active) + if (!active) { set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); + ret = cleanup_free_space_cache_v1(fs_info, trans); + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out; + } + } ret = btrfs_commit_transaction(trans); +out: clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); return ret; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 3eff615c30b8..ecb09a02d544 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -82,6 +82,9 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, int create_free_space_inode(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path); +int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_block_group *block_group); int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); -- cgit v1.2.3 From af456a2c0aaaff15b84f046e2545570bf1bf50ed Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 18 Nov 2020 15:06:26 -0800 Subject: btrfs: skip space_cache v1 setup when not using it If we are not using space cache v1, we should not create the free space object or free space inodes. This comes up when we delete the existing free space objects/inodes when migrating to v2, only to see them get recreated for every dirtied block group. Reviewed-by: Josef Bacik Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index fbc74bb41322..52f2198d44c9 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2340,6 +2340,9 @@ static int cache_save_setup(struct btrfs_block_group *block_group, int retries = 0; int ret = 0; + if (!btrfs_test_opt(fs_info, SPACE_CACHE)) + return 0; + /* * If this block group is smaller than 100 megs don't bother caching the * block group. -- cgit v1.2.3 From 8a6a87cd449b9840f8169e0ece0a8fa11232723d Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 18 Nov 2020 15:06:27 -0800 Subject: btrfs: fix lockdep warning when creating free space tree A lock dependency loop exists between the root tree lock, the extent tree lock, and the free space tree lock. The root tree lock depends on the free space tree lock because btrfs_create_tree holds the new tree's lock while adding it to the root tree. The extent tree lock depends on the root tree lock because during umount, we write out space cache v1, which writes inodes in the root tree, which results in holding the root tree lock while doing a lookup in the extent tree. Finally, the free space tree depends on the extent tree because populate_free_space_tree holds a locked path in the extent tree and then does a lookup in the free space tree to add the new item. The simplest of the three to break is the one during tree creation: we unlock the leaf before inserting the tree node into the root tree, which fixes the lockdep warning. 
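Condensed from the lockdep report below, the cycle is (an arrow means the left lock is held while the right one is acquired):

  free space tree -> root tree:   btrfs_create_tree() keeps the new tree's
                                  leaf locked while inserting its root item
  root tree -> extent tree:       space cache v1 writeout updates inodes in
                                  the root tree and then looks up the extent
                                  tree
  extent tree -> free space tree: populate_free_space_tree() holds a locked
                                  extent tree path while adding the new free
                                  space item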
[30.480136] ====================================================== [30.480830] WARNING: possible circular locking dependency detected [30.481457] 5.9.0-rc8+ #76 Not tainted [30.481897] ------------------------------------------------------ [30.482500] mount/520 is trying to acquire lock: [30.483064] ffff9babebe03908 (btrfs-free-space-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180 [30.484054] but task is already holding lock: [30.484637] ffff9babebe24468 (btrfs-extent-01#2){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180 [30.485581] which lock already depends on the new lock. [30.486397] the existing dependency chain (in reverse order) is: [30.487205] -> #2 (btrfs-extent-01#2){++++}-{3:3}: [30.487825] down_read_nested+0x43/0x150 [30.488306] __btrfs_tree_read_lock+0x39/0x180 [30.488868] __btrfs_read_lock_root_node+0x3a/0x50 [30.489477] btrfs_search_slot+0x464/0x9b0 [30.490009] check_committed_ref+0x59/0x1d0 [30.490603] btrfs_cross_ref_exist+0x65/0xb0 [30.491108] run_delalloc_nocow+0x405/0x930 [30.491651] btrfs_run_delalloc_range+0x60/0x6b0 [30.492203] writepage_delalloc+0xd4/0x150 [30.492688] __extent_writepage+0x18d/0x3a0 [30.493199] extent_write_cache_pages+0x2af/0x450 [30.493743] extent_writepages+0x34/0x70 [30.494231] do_writepages+0x31/0xd0 [30.494642] __filemap_fdatawrite_range+0xad/0xe0 [30.495194] btrfs_fdatawrite_range+0x1b/0x50 [30.495677] __btrfs_write_out_cache+0x40d/0x460 [30.496227] btrfs_write_out_cache+0x8b/0x110 [30.496716] btrfs_start_dirty_block_groups+0x211/0x4e0 [30.497317] btrfs_commit_transaction+0xc0/0xba0 [30.497861] sync_filesystem+0x71/0x90 [30.498303] btrfs_remount+0x81/0x433 [30.498767] reconfigure_super+0x9f/0x210 [30.499261] path_mount+0x9d1/0xa30 [30.499722] do_mount+0x55/0x70 [30.500158] __x64_sys_mount+0xc4/0xe0 [30.500616] do_syscall_64+0x33/0x40 [30.501091] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [30.501629] -> #1 (btrfs-root-00){++++}-{3:3}: [30.502241] down_read_nested+0x43/0x150 [30.502727] __btrfs_tree_read_lock+0x39/0x180 [30.503291] __btrfs_read_lock_root_node+0x3a/0x50 [30.503903] btrfs_search_slot+0x464/0x9b0 [30.504405] btrfs_insert_empty_items+0x60/0xa0 [30.504973] btrfs_insert_item+0x60/0xd0 [30.505412] btrfs_create_tree+0x1b6/0x210 [30.505913] btrfs_create_free_space_tree+0x54/0x110 [30.506460] btrfs_mount_rw+0x15d/0x20f [30.506937] btrfs_remount+0x356/0x433 [30.507369] reconfigure_super+0x9f/0x210 [30.507868] path_mount+0x9d1/0xa30 [30.508264] do_mount+0x55/0x70 [30.508668] __x64_sys_mount+0xc4/0xe0 [30.509186] do_syscall_64+0x33/0x40 [30.509652] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [30.510271] -> #0 (btrfs-free-space-00){++++}-{3:3}: [30.510972] __lock_acquire+0x11ad/0x1b60 [30.511432] lock_acquire+0xa2/0x360 [30.511917] down_read_nested+0x43/0x150 [30.512383] __btrfs_tree_read_lock+0x39/0x180 [30.512947] __btrfs_read_lock_root_node+0x3a/0x50 [30.513455] btrfs_search_slot+0x464/0x9b0 [30.513947] search_free_space_info+0x45/0x90 [30.514465] __add_to_free_space_tree+0x92/0x39d [30.515010] btrfs_create_free_space_tree.cold.22+0x1ee/0x45d [30.515639] btrfs_mount_rw+0x15d/0x20f [30.516142] btrfs_remount+0x356/0x433 [30.516538] reconfigure_super+0x9f/0x210 [30.517065] path_mount+0x9d1/0xa30 [30.517438] do_mount+0x55/0x70 [30.517824] __x64_sys_mount+0xc4/0xe0 [30.518293] do_syscall_64+0x33/0x40 [30.518776] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [30.519335] other info that might help us debug this: [30.520210] Chain exists of: btrfs-free-space-00 --> btrfs-root-00 --> btrfs-extent-01#2 [30.521407] Possible unsafe locking scenario: 
[30.522037] CPU0 CPU1 [30.522456] ---- ---- [30.522941] lock(btrfs-extent-01#2); [30.523311] lock(btrfs-root-00); [30.523952] lock(btrfs-extent-01#2); [30.524620] lock(btrfs-free-space-00); [30.525068] *** DEADLOCK *** [30.525669] 5 locks held by mount/520: [30.526116] #0: ffff9babebc520e0 (&type->s_umount_key#37){+.+.}-{3:3}, at: path_mount+0x7ef/0xa30 [30.527056] #1: ffff9babebc52640 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x3d5/0x5c0 [30.527960] #2: ffff9babeae8f2e8 (&cache->free_space_lock#2){+.+.}-{3:3}, at: btrfs_create_free_space_tree.cold.22+0x101/0x45d [30.529118] #3: ffff9babebe24468 (btrfs-extent-01#2){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180 [30.530113] #4: ffff9babebd52eb8 (btrfs-extent-00){++++}-{3:3}, at: btrfs_try_tree_read_lock+0x16/0x100 [30.531124] stack backtrace: [30.531528] CPU: 0 PID: 520 Comm: mount Not tainted 5.9.0-rc8+ #76 [30.532166] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-4.module_el8.1.0+248+298dec18 04/01/2014 [30.533215] Call Trace: [30.533452] dump_stack+0x8d/0xc0 [30.533797] check_noncircular+0x13c/0x150 [30.534233] __lock_acquire+0x11ad/0x1b60 [30.534667] lock_acquire+0xa2/0x360 [30.535063] ? __btrfs_tree_read_lock+0x39/0x180 [30.535525] down_read_nested+0x43/0x150 [30.535939] ? __btrfs_tree_read_lock+0x39/0x180 [30.536400] __btrfs_tree_read_lock+0x39/0x180 [30.536862] __btrfs_read_lock_root_node+0x3a/0x50 [30.537304] btrfs_search_slot+0x464/0x9b0 [30.537713] ? trace_hardirqs_on+0x1c/0xf0 [30.538148] search_free_space_info+0x45/0x90 [30.538572] __add_to_free_space_tree+0x92/0x39d [30.539071] ? printk+0x48/0x4a [30.539367] btrfs_create_free_space_tree.cold.22+0x1ee/0x45d [30.539972] btrfs_mount_rw+0x15d/0x20f [30.540350] btrfs_remount+0x356/0x433 [30.540773] ? shrink_dcache_sb+0xd9/0x100 [30.541203] reconfigure_super+0x9f/0x210 [30.541642] path_mount+0x9d1/0xa30 [30.542040] do_mount+0x55/0x70 [30.542366] __x64_sys_mount+0xc4/0xe0 [30.542822] do_syscall_64+0x33/0x40 [30.543197] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [30.543691] RIP: 0033:0x7f109f7ab93a [30.546042] RSP: 002b:00007ffc47c4f858 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 [30.546770] RAX: ffffffffffffffda RBX: 00007f109f8cf264 RCX: 00007f109f7ab93a [30.547485] RDX: 0000557e6fc10770 RSI: 0000557e6fc19cf0 RDI: 0000557e6fc19cd0 [30.548185] RBP: 0000557e6fc10520 R08: 0000557e6fc18e30 R09: 0000557e6fc18cb0 [30.548911] R10: 0000000000200020 R11: 0000000000000246 R12: 0000000000000000 [30.549606] R13: 0000557e6fc19cd0 R14: 0000557e6fc10770 R15: 0000557e6fc10520 Reviewed-by: Josef Bacik Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2dcc5e52140c..46dd9e0b077e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1142,7 +1142,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; - goto fail; + goto fail_unlock; } root->node = leaf; @@ -1166,6 +1166,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, export_guid(root->root_item.uuid, &guid_null); btrfs_set_root_drop_level(&root->root_item, 0); + btrfs_tree_unlock(leaf); + key.objectid = objectid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = 0; @@ -1173,13 +1175,12 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (ret) goto fail; - btrfs_tree_unlock(leaf); - return root; -fail: +fail_unlock: if (leaf) 
btrfs_tree_unlock(leaf); +fail: btrfs_put_root(root); return ERR_PTR(ret); -- cgit v1.2.3 From 1941b64b080b45a80796a9f3a2e5c89554e53bdf Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:47:57 +0800 Subject: btrfs: rename bio_offset of extent_submit_bio_start_t to dio_file_offset The parameter bio_offset of extent_submit_bio_start_t is very confusing. If it's really bio_offset (offset to bio), then it should be u32. But in fact, it's only utilized by dio read, and that member is used as file offset, which must be u64. Rename it to dio_file_offset since the only user uses it as file offset, and add comment for who is using it. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 17 ++++++++--------- fs/btrfs/disk-io.h | 2 +- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 10 +++++----- 4 files changed, 15 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 46dd9e0b077e..765deefda92b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -113,11 +113,9 @@ struct async_submit_bio { struct bio *bio; extent_submit_bio_start_t *submit_bio_start; int mirror_num; - /* - * bio_offset is optional, can be used if the pages in the bio - * can't tell us where in the file the bio should go - */ - u64 bio_offset; + + /* Optional parameter for submit_bio_start used by direct io */ + u64 dio_file_offset; struct btrfs_work work; blk_status_t status; }; @@ -697,7 +695,8 @@ static void run_one_async_start(struct btrfs_work *work) blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->bio, async->bio_offset); + ret = async->submit_bio_start(async->inode, async->bio, + async->dio_file_offset); if (ret) async->status = ret; } @@ -749,7 +748,7 @@ static void run_one_async_free(struct btrfs_work *work) blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, - u64 bio_offset, + u64 dio_file_offset, extent_submit_bio_start_t *submit_bio_start) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; @@ -767,7 +766,7 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, run_one_async_free); - async->bio_offset = bio_offset; + async->dio_file_offset = dio_file_offset; async->status = 0; @@ -797,7 +796,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) } static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, - u64 bio_offset) + u64 dio_file_offset) { /* * when we're called for a write, we're already in the async diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 7b3ecad88d7e..e45057c0c016 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -116,7 +116,7 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, - u64 bio_offset, + u64 dio_file_offset, extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 66762c3cdf81..96503bc2bdd6 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -72,7 +72,7 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, unsigned long 
bio_flags); typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, - struct bio *bio, u64 bio_offset); + struct bio *bio, u64 dio_file_offset); #define INLINE_EXTENT_BUFFER_PAGES 16 #define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0ce42d52d53e..4cd09a9d2029 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2212,7 +2212,7 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, * are inserted into the btree */ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - u64 bio_offset) + u64 dio_file_offset) { return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } @@ -7795,9 +7795,10 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, } static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, - struct bio *bio, u64 offset) + struct bio *bio, + u64 dio_file_offset) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1); + return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1); } static void btrfs_end_dio_bio(struct bio *bio) @@ -7846,8 +7847,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, goto map; if (write && async_submit) { - ret = btrfs_wq_submit_bio(inode, bio, 0, 0, - file_offset, + ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset, btrfs_submit_bio_start_direct_io); goto err; } else if (write) { -- cgit v1.2.3 From 7ffd27e378d2541059b9ba49868c32d90ad5ae91 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:47:58 +0800 Subject: btrfs: pass bio_offset to check_data_csum() directly Parameter icsum for check_data_csum() is a little hard to understand. So is the phy_offset for btrfs_verify_data_csum(). Both parameters are calculated values for csum lookup. Instead of some calculated value, just pass bio_offset and let the final and only user, check_data_csum(), calculate whatever it needs. Since we are here, also make the bio_offset parameter and some related variables to be u32 (unsigned int). As bio size is limited by its bi_size, which is unsigned int, and has extra size limit check during various bio operations. Thus we are ensured that bio_offset won't overflow u32. Thus for all involved functions, not only rename the parameter from @phy_offset to @bio_offset, but also reduce its width to u32, so we won't have suspicious "u32 = u64 >> sector_bits;" lines anymore. 
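A small worked example of the lookup the final callee performs, using the computation added in the diff below (sector size 4096 is just an example value):

  /* sectorsize = 4096, so fs_info->sectorsize_bits = 12 */
  u32 bio_offset     = 8192;                 /* third sector of this bio */
  u32 offset_sectors = bio_offset >> fs_info->sectorsize_bits;    /* = 2 */
  u8 *csum_expected  = (u8 *)io_bio->csum +
                       offset_sectors * fs_info->csum_size;

  /* one checksum per sector: index 2 verifies the third sector of the bio */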
Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent_io.c | 32 ++++++++++++++++++++------------ fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 32 ++++++++++++++++++++------------ 4 files changed, 42 insertions(+), 26 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2744e13e8eb9..112c9a2ae47b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3052,7 +3052,7 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); -int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, struct page *page, u64 start, u64 end, int mirror); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 569d50ccf78a..a28b61510265 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2634,7 +2634,7 @@ static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio) } blk_status_t btrfs_submit_read_repair(struct inode *inode, - struct bio *failed_bio, u64 phy_offset, + struct bio *failed_bio, u32 bio_offset, struct page *page, unsigned int pgoff, u64 start, u64 end, int failed_mirror, submit_bio_hook_t *submit_bio_hook) @@ -2644,7 +2644,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); - const int icsum = phy_offset >> fs_info->sectorsize_bits; + const int icsum = bio_offset >> fs_info->sectorsize_bits; bool need_validation; struct bio *repair_bio; struct btrfs_io_bio *repair_io_bio; @@ -2869,10 +2869,11 @@ static void end_bio_extent_readpage(struct bio *bio) struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree, *failure_tree; struct processed_extent processed = { 0 }; - u64 offset = 0; - u64 start; - u64 end; - u64 len; + /* + * The offset to the beginning of a bio, since one bio can never be + * larger than UINT_MAX, u32 here is enough. + */ + u32 bio_offset = 0; int mirror; int ret; struct bvec_iter_all iter_all; @@ -2882,7 +2883,10 @@ static void end_bio_extent_readpage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u32 sectorsize = fs_info->sectorsize; + const u32 sectorsize = fs_info->sectorsize; + u64 start; + u64 end; + u32 len; btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", @@ -2915,8 +2919,9 @@ static void end_bio_extent_readpage(struct bio *bio) mirror = io_bio->mirror_num; if (likely(uptodate)) { if (is_data_inode(inode)) - ret = btrfs_verify_data_csum(io_bio, offset, page, - start, end, mirror); + ret = btrfs_verify_data_csum(io_bio, + bio_offset, page, start, end, + mirror); else ret = btrfs_validate_metadata_buffer(io_bio, page, start, end, mirror); @@ -2944,12 +2949,14 @@ static void end_bio_extent_readpage(struct bio *bio) * If it can't handle the error it will return -EIO and * we remain responsible for that page. 
*/ - if (!btrfs_submit_read_repair(inode, bio, offset, page, + if (!btrfs_submit_read_repair(inode, bio, bio_offset, + page, start - page_offset(page), start, end, mirror, btrfs_submit_data_bio)) { uptodate = !bio->bi_status; - offset += len; + ASSERT(bio_offset + len > bio_offset); + bio_offset += len; continue; } } else { @@ -2974,7 +2981,8 @@ readpage_ok: if (page->index == end_index && off) zero_user_segment(page, off, PAGE_SIZE); } - offset += len; + ASSERT(bio_offset + len > bio_offset); + bio_offset += len; /* Update page status and unlock */ endio_readpage_update_page_status(page, uptodate); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 96503bc2bdd6..16f2ce5cd8ed 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -291,7 +291,7 @@ struct io_failure_record { blk_status_t btrfs_submit_read_repair(struct inode *inode, - struct bio *failed_bio, u64 phy_offset, + struct bio *failed_bio, u32 bio_offset, struct page *page, unsigned int pgoff, u64 start, u64 end, int failed_mirror, submit_bio_hook_t *submit_bio_hook); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4cd09a9d2029..e07bc63749ab 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2942,28 +2942,30 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, /* * check_data_csum - verify checksum of one sector of uncompressed data - * @inode: the inode + * @inode: inode * @io_bio: btrfs_io_bio which contains the csum - * @icsum: checksum index in the io_bio->csum array, size of csum_size + * @bio_offset: offset to the beginning of the bio (in bytes) * @page: page where is the data to be verified * @pgoff: offset inside the page * * The length of such check is always one sector size. */ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, - int icsum, struct page *page, int pgoff) + u32 bio_offset, struct page *page, int pgoff) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; u32 len = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; + unsigned int offset_sectors; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; ASSERT(pgoff + len <= PAGE_SIZE); - csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size; + offset_sectors = bio_offset >> fs_info->sectorsize_bits; + csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size; kaddr = kmap_atomic(page); shash->tfm = fs_info->csum_shash; @@ -2988,11 +2990,16 @@ zeroit: } /* - * when reads are done, we need to check csums to verify the data is correct + * When reads are done, we need to check csums to verify the data is correct. * if there's a match, we allow the bio to finish. If not, the code in * extent_io.c will try to find good copies for us. 
+ * + * @bio_offset: offset to the beginning of the bio (in bytes) + * @start: file offset of the range start + * @end: file offset of the range end (inclusive) + * @mirror: mirror number */ -int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, struct page *page, u64 start, u64 end, int mirror) { size_t offset = start - page_offset(page); @@ -3017,8 +3024,7 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, return 0; } - phy_offset >>= root->fs_info->sectorsize_bits; - return check_data_csum(inode, io_bio, phy_offset, page, offset); + return check_data_csum(inode, io_bio, bio_offset, page, offset); } /* @@ -7712,7 +7718,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, struct bio_vec bvec; struct bvec_iter iter; u64 start = io_bio->logical; - int icsum = 0; + u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) { @@ -7723,8 +7729,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); if (uptodate && - (!csum || !check_data_csum(inode, io_bio, icsum, - bvec.bv_page, pgoff))) { + (!csum || !check_data_csum(inode, io_bio, + bio_offset, bvec.bv_page, pgoff))) { clean_io_failure(fs_info, failure_tree, io_tree, start, bvec.bv_page, btrfs_ino(BTRFS_I(inode)), @@ -7732,6 +7738,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, } else { blk_status_t status; + ASSERT((start - io_bio->logical) < UINT_MAX); status = btrfs_submit_read_repair(inode, &io_bio->bio, start - io_bio->logical, @@ -7744,7 +7751,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, err = status; } start += sectorsize; - icsum++; + ASSERT(bio_offset + sectorsize > bio_offset); + bio_offset += sectorsize; pgoff += sectorsize; } } -- cgit v1.2.3 From f44cf41075b05660d61efa7bfa8350b45286f065 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:47:59 +0800 Subject: btrfs: make btrfs_verify_data_csum follow sector size Currently btrfs_verify_data_csum() just passes the whole page to check_data_csum(), which is fine since we only support sectorsize == PAGE_SIZE. To support subpage, we need to properly honor per-sector checksum verification, just like what we did in dio read path. This patch will do the csum verification in a for loop, starts with pg_off == start - page_offset(page), with sectorsize increase for each loop. For sectorsize == PAGE_SIZE case, the pg_off will always be 0, and we will only loop once. For subpage case, we do the iterate over each sector and if we found any error, we return error. Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e07bc63749ab..af19a9aef1e5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2951,7 +2951,7 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, * The length of such check is always one sector size. 
*/ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, - u32 bio_offset, struct page *page, int pgoff) + u32 bio_offset, struct page *page, u32 pgoff) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); @@ -3002,10 +3002,11 @@ zeroit: int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, struct page *page, u64 start, u64 end, int mirror) { - size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_root *root = BTRFS_I(inode)->root; + const u32 sectorsize = root->fs_info->sectorsize; + u32 pg_off; if (PageChecked(page)) { ClearPageChecked(page); @@ -3024,7 +3025,18 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, return 0; } - return check_data_csum(inode, io_bio, bio_offset, page, offset); + ASSERT(page_offset(page) <= start && + end <= page_offset(page) + PAGE_SIZE - 1); + for (pg_off = offset_in_page(start); + pg_off < offset_in_page(end); + pg_off += sectorsize, bio_offset += sectorsize) { + int ret; + + ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off); + if (ret < 0) + return -EIO; + } + return 0; } /* -- cgit v1.2.3 From f91e0d0c4cd986af54a8b2deb43b9f7b35299a65 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:48:00 +0800 Subject: btrfs: factor out btree page submission code to a helper In btree_write_cache_pages() we have a btree page submission routine buried deeply in a nested loop. This patch will extract that part of code into a helper function, submit_eb_page(), to do the same work. Since submit_eb_page() now can return >0 for successful extent buffer submission, remove the "ASSERT(ret <= 0);" line. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 122 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 75 insertions(+), 47 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a28b61510265..e70d6944d075 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3987,10 +3987,81 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, return ret; } +/* + * Submit all page(s) of one extent buffer. + * + * @page: the page of one extent buffer + * @eb_context: to determine if we need to submit this page, if current page + * belongs to this eb, we don't need to submit + * + * The caller should pass each page in their bytenr order, and here we use + * @eb_context to determine if we have submitted pages of one extent buffer. + * + * If we have, we just skip until we hit a new page that doesn't belong to + * current @eb_context. + * + * If not, we submit all the page(s) of the extent buffer. + * + * Return >0 if we have submitted the extent buffer successfully. + * Return 0 if we don't need to submit the page, as it's already submitted by + * previous call. + * Return <0 for fatal error. 
+ */ +static int submit_eb_page(struct page *page, struct writeback_control *wbc, + struct extent_page_data *epd, + struct extent_buffer **eb_context) +{ + struct address_space *mapping = page->mapping; + struct extent_buffer *eb; + int ret; + + if (!PagePrivate(page)) + return 0; + + spin_lock(&mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&mapping->private_lock); + return 0; + } + + eb = (struct extent_buffer *)page->private; + + /* + * Shouldn't happen and normally this would be a BUG_ON but no point + * crashing the machine for something we can survive anyway. + */ + if (WARN_ON(!eb)) { + spin_unlock(&mapping->private_lock); + return 0; + } + + if (eb == *eb_context) { + spin_unlock(&mapping->private_lock); + return 0; + } + ret = atomic_inc_not_zero(&eb->refs); + spin_unlock(&mapping->private_lock); + if (!ret) + return 0; + + *eb_context = eb; + + ret = lock_extent_buffer_for_io(eb, epd); + if (ret <= 0) { + free_extent_buffer(eb); + return ret; + } + ret = write_one_eb(eb, wbc, epd); + free_extent_buffer(eb); + if (ret < 0) + return ret; + return 1; +} + int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_buffer *eb, *prev_eb = NULL; + struct extent_buffer *eb_context = NULL; struct extent_page_data epd = { .bio = NULL, .extent_locked = 0, @@ -4036,55 +4107,13 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - if (!PagePrivate(page)) - continue; - - spin_lock(&mapping->private_lock); - if (!PagePrivate(page)) { - spin_unlock(&mapping->private_lock); - continue; - } - - eb = (struct extent_buffer *)page->private; - - /* - * Shouldn't happen and normally this would be a BUG_ON - * but no sense in crashing the users box for something - * we can survive anyway. - */ - if (WARN_ON(!eb)) { - spin_unlock(&mapping->private_lock); - continue; - } - - if (eb == prev_eb) { - spin_unlock(&mapping->private_lock); + ret = submit_eb_page(page, wbc, &epd, &eb_context); + if (ret == 0) continue; - } - - ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->private_lock); - if (!ret) - continue; - - prev_eb = eb; - ret = lock_extent_buffer_for_io(eb, &epd); - if (!ret) { - free_extent_buffer(eb); - continue; - } else if (ret < 0) { - done = 1; - free_extent_buffer(eb); - break; - } - - ret = write_one_eb(eb, wbc, &epd); - if (ret) { + if (ret < 0) { done = 1; - free_extent_buffer(eb); break; } - free_extent_buffer(eb); /* * the filesystem may choose to bump up nr_to_write. @@ -4105,7 +4134,6 @@ retry: index = 0; goto retry; } - ASSERT(ret <= 0); if (ret < 0) { end_write_bio(&epd, ret); return ret; -- cgit v1.2.3 From deb678955360ea87605b8aea1f69c45bddc3f867 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:48:01 +0800 Subject: btrfs: calculate inline extent buffer page size based on page size Btrfs only support 64K as maximum node size, thus for 4K page system, we would have at most 16 pages for one extent buffer. For a system using 64K page size, we would really have just one page. While we always use 16 pages for extent_buffer::pages, this means for systems using 64K pages, we are wasting memory for 15 page pointers which will never be used. Calculate the array size based on page size and the node size maximum. - for systems using 4K page size, it will stay 16 pages - for systems using 64K page size, it will be 1 page Move the definition of BTRFS_MAX_METADATA_BLOCKSIZE to btrfs_tree.h, to avoid circular inclusion of ctree.h. 
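The arithmetic behind the new array size can be sanity checked with a small standalone sketch (illustration only; here PAGE_SIZE is passed as a parameter just for the example):

	#include <stdio.h>

	#define BTRFS_MAX_METADATA_BLOCKSIZE	65536

	/* Number of inline page pointers needed for one extent buffer. */
	static unsigned int inline_extent_buffer_pages(unsigned long page_size)
	{
		return BTRFS_MAX_METADATA_BLOCKSIZE / page_size;
	}

	int main(void)
	{
		printf("4K pages:  %u\n", inline_extent_buffer_pages(4096));	/* 16 */
		printf("64K pages: %u\n", inline_extent_buffer_pages(65536));	/* 1 */
		return 0;
	}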
Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ------ fs/btrfs/extent_io.c | 7 +------ fs/btrfs/extent_io.h | 4 ++-- include/uapi/linux/btrfs_tree.h | 3 ++- 4 files changed, 5 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 112c9a2ae47b..c5ef29078954 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -67,12 +67,6 @@ struct btrfs_ref; #define BTRFS_OLDEST_GENERATION 0ULL -/* - * the max metadata block size. This limit is somewhat artificial, - * but the memmove costs go through the roof for larger blocks. - */ -#define BTRFS_MAX_METADATA_BLOCKSIZE 65536 - /* * we can actually store much bigger names, but lets not confuse the rest * of linux diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e70d6944d075..86b2a483c1ab 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5053,12 +5053,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, atomic_set(&eb->refs, 1); atomic_set(&eb->io_pages, 0); - /* - * Sanity checks, currently the maximum is 64k covered by 16x 4k pages - */ - BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE - > MAX_INLINE_EXTENT_BUFFER_SIZE); - BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); + ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); return eb; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 16f2ce5cd8ed..77f2211550e3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "ulist.h" /* @@ -74,8 +75,7 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, struct bio *bio, u64 dio_file_offset); -#define INLINE_EXTENT_BUFFER_PAGES 16 -#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE) +#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) struct extent_buffer { u64 start; unsigned long len; diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 6b885982ece6..58d7cff9afb1 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -299,7 +299,8 @@ */ #define BTRFS_STRING_ITEM_KEY 253 - +/* Maximum metadata block size (nodesize) */ +#define BTRFS_MAX_METADATA_BLOCKSIZE 65536 /* 32 bytes in various csum fields */ #define BTRFS_CSUM_SIZE 32 -- cgit v1.2.3 From 1aaac38c83a23cd31df551b3f84d3c7f5067a7fe Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:48:02 +0800 Subject: btrfs: don't allow tree block to cross page boundary for subpage support As a preparation for subpage sector size support (allowing filesystem with sector size smaller than page size to be mounted) if the sector size is smaller than page size, we don't allow tree block to be read if it crosses 64K(*) boundary. The 64K is selected because: - we are only going to support 64K page size for subpage for now - 64K is also the maximum supported node size This ensures that tree blocks are always contained in one page for a system with 64K page size, which can greatly simplify the handling. Otherwise we would have to do complex multi-page handling of tree blocks. Currently there is no way to create such tree blocks. In kernel we have avoided such tree blocks allocation even on 4K page size, as it can lead to RAID56 stripe scrubbing. 
While btrfs-progs have fixed its chunk allocator since 2016 for convert, and has extra checks to do the same behavior as the kernel. Just add such graceful checks in case of an ancient filesystem. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 86b2a483c1ab..76e0b5ed75dd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5273,6 +5273,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, return ERR_PTR(-EINVAL); } + if (fs_info->sectorsize < PAGE_SIZE && + offset_in_page(start) + len > PAGE_SIZE) { + btrfs_err(fs_info, + "tree block crosses page boundary, start %llu nodesize %lu", + start, len); + return ERR_PTR(-EINVAL); + } + eb = find_extent_buffer(fs_info, start); if (eb) return eb; -- cgit v1.2.3 From 4a3dc93843dd6ee17c68231d6a90c76231cb65fc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:48:03 +0800 Subject: btrfs: update num_extent_pages to support subpage sized extent buffer For subpage sized extent buffer, we have ensured no extent buffer will cross page boundary, thus we would only need one page for any extent buffer. Update function num_extent_pages to handle such case. Now num_extent_pages() returns 1 for subpage sized extent buffer. Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 77f2211550e3..19221095c635 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -203,8 +203,14 @@ void btrfs_readahead_node_child(struct extent_buffer *node, int slot); static inline int num_extent_pages(const struct extent_buffer *eb) { - return (round_up(eb->start + eb->len, PAGE_SIZE) >> PAGE_SHIFT) - - (eb->start >> PAGE_SHIFT); + /* + * For sectorsize == PAGE_SIZE case, since nodesize is always aligned to + * sectorsize, it's just eb->len >> PAGE_SHIFT. + * + * For sectorsize < PAGE_SIZE case, we could have nodesize < PAGE_SIZE, + * thus have to ensure we get at least one page. + */ + return (eb->len >> PAGE_SHIFT) ?: 1; } static inline int extent_buffer_uptodate(const struct extent_buffer *eb) -- cgit v1.2.3 From 884b07d0f4f7e09d8312008fed04e01d9d2270dc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:48:04 +0800 Subject: btrfs: handle sectorsize < PAGE_SIZE case for extent buffer accessors To support sectorsize < PAGE_SIZE case, we need to take extra care of extent buffer accessors. Since sectorsize is smaller than PAGE_SIZE, one page can contain multiple tree blocks, we must use eb->start to determine the real offset to read/write for extent buffer accessors. This patch introduces two helpers to do this: - get_eb_page_index() This is to calculate the index to access extent_buffer::pages. It's just a simple wrapper around "start >> PAGE_SHIFT". For sectorsize == PAGE_SIZE case, nothing is changed. For sectorsize < PAGE_SIZE case, we always get index as 0, and the existing page shift also works. - get_eb_offset_in_page() This is to calculate the offset to access extent_buffer::pages. This needs to take extent_buffer::start into consideration. 
For sectorsize == PAGE_SIZE case, extent_buffer::start is always aligned to PAGE_SIZE, thus adding extent_buffer::start to offset_in_page() won't change the result. For sectorsize < PAGE_SIZE case, adding extent_buffer::start gives us the correct offset to access. This patch will touch the following parts to cover all extent buffer accessors: - BTRFS_SETGET_HEADER_FUNCS() - read_extent_buffer() - read_extent_buffer_to_user() - memcmp_extent_buffer() - write_extent_buffer_chunk_tree_uuid() - write_extent_buffer_fsid() - write_extent_buffer() - memzero_extent_buffer() - copy_extent_buffer_full() - copy_extent_buffer() - memcpy_extent_buffer() - memmove_extent_buffer() - btrfs_get_token_##bits() - btrfs_get_##bits() - btrfs_set_token_##bits() - btrfs_set_##bits() - generic_bin_search() Signed-off-by: Goldwyn Rodrigues Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 3 ++- fs/btrfs/ctree.h | 38 +++++++++++++++++++++++++++-- fs/btrfs/extent_io.c | 64 ++++++++++++++++++++++++++++--------------------- fs/btrfs/struct-funcs.c | 18 +++++++------- 4 files changed, 85 insertions(+), 38 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e5a0941c4bde..07810891e204 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1683,9 +1683,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb, oip = offset_in_page(offset); if (oip + key_size <= PAGE_SIZE) { - const unsigned long idx = offset >> PAGE_SHIFT; + const unsigned long idx = get_eb_page_index(offset); char *kaddr = page_address(eb->pages[idx]); + oip = get_eb_offset_in_page(eb, offset); tmp = (struct btrfs_disk_key *)(kaddr + oip); } else { read_extent_buffer(eb, &unaligned, offset, key_size); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c5ef29078954..98d6871b2295 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1552,13 +1552,14 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ - const type *p = page_address(eb->pages[0]); \ + const type *p = page_address(eb->pages[0]) + \ + offset_in_page(eb->start); \ return get_unaligned_le##bits(&p->member); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ - type *p = page_address(eb->pages[0]); \ + type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ put_unaligned_le##bits(val, &p->member); \ } @@ -3366,6 +3367,39 @@ static inline void assertfail(const char *expr, const char* file, int line) { } #define ASSERT(expr) (void)(expr) #endif +/* + * Get the correct offset inside the page of extent buffer. + * + * @eb: target extent buffer + * @start: offset inside the extent buffer + * + * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. + */ +static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, + unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, eb->start will always be aligned + * to PAGE_SIZE, thus adding it won't cause any difference. + * + * For sectorsize < PAGE_SIZE, we must only read the data that belongs + * to the eb, thus we have to take the eb->start into consideration. + */ + return offset_in_page(offset + eb->start); +} + +static inline unsigned long get_eb_page_index(unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. 
+ * + * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, + * and have ensured that all tree blocks are contained in one page, + * thus we always get index == 0. + */ + return offset >> PAGE_SHIFT; +} + /* * Use that for functions that are conditionally exported for sanity tests but * otherwise static diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 76e0b5ed75dd..6e3b72e63e42 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5703,12 +5703,12 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, struct page *page; char *kaddr; char *dst = (char *)dstv; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); if (check_eb_range(eb, start, len)) return; - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5733,13 +5733,13 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, struct page *page; char *kaddr; char __user *dst = (char __user *)dstv; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5768,13 +5768,13 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, struct page *page; char *kaddr; char *ptr = (char *)ptrv; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); int ret = 0; if (check_eb_range(eb, start, len)) return -EINVAL; - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5800,7 +5800,7 @@ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, char *kaddr; WARN_ON(!PageUptodate(eb->pages[0])); - kaddr = page_address(eb->pages[0]); + kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, BTRFS_FSID_SIZE); } @@ -5810,7 +5810,7 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) char *kaddr; WARN_ON(!PageUptodate(eb->pages[0])); - kaddr = page_address(eb->pages[0]); + kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, BTRFS_FSID_SIZE); } @@ -5823,12 +5823,12 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, struct page *page; char *kaddr; char *src = (char *)srcv; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); if (check_eb_range(eb, start, len)) return; - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5852,12 +5852,12 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, size_t offset; struct page *page; char *kaddr; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); if (check_eb_range(eb, start, len)) return; - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5881,10 +5881,20 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, ASSERT(dst->len == src->len); - num_pages = num_extent_pages(dst); - for (i = 0; i < num_pages; i++) - copy_page(page_address(dst->pages[i]), - page_address(src->pages[i])); + if (dst->fs_info->sectorsize == PAGE_SIZE) { + num_pages 
= num_extent_pages(dst); + for (i = 0; i < num_pages; i++) + copy_page(page_address(dst->pages[i]), + page_address(src->pages[i])); + } else { + size_t src_offset = get_eb_offset_in_page(src, 0); + size_t dst_offset = get_eb_offset_in_page(dst, 0); + + ASSERT(src->fs_info->sectorsize < PAGE_SIZE); + memcpy(page_address(dst->pages[0]) + dst_offset, + page_address(src->pages[0]) + src_offset, + src->len); + } } void copy_extent_buffer(const struct extent_buffer *dst, @@ -5897,7 +5907,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, size_t offset; struct page *page; char *kaddr; - unsigned long i = dst_offset >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(dst_offset); if (check_eb_range(dst, dst_offset, len) || check_eb_range(src, src_offset, len)) @@ -5905,7 +5915,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, WARN_ON(src->len != dst_len); - offset = offset_in_page(dst_offset); + offset = get_eb_offset_in_page(dst, dst_offset); while (len > 0) { page = dst->pages[i]; @@ -5949,7 +5959,7 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. */ - offset = start + byte_offset; + offset = start + offset_in_page(eb->start) + byte_offset; *page_index = offset >> PAGE_SHIFT; *page_offset = offset_in_page(offset); @@ -6103,11 +6113,11 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, return; while (len > 0) { - dst_off_in_page = offset_in_page(dst_offset); - src_off_in_page = offset_in_page(src_offset); + dst_off_in_page = get_eb_offset_in_page(dst, dst_offset); + src_off_in_page = get_eb_offset_in_page(dst, src_offset); - dst_i = dst_offset >> PAGE_SHIFT; - src_i = src_offset >> PAGE_SHIFT; + dst_i = get_eb_page_index(dst_offset); + src_i = get_eb_page_index(src_offset); cur = min(len, (unsigned long)(PAGE_SIZE - src_off_in_page)); @@ -6143,11 +6153,11 @@ void memmove_extent_buffer(const struct extent_buffer *dst, return; } while (len > 0) { - dst_i = dst_end >> PAGE_SHIFT; - src_i = src_end >> PAGE_SHIFT; + dst_i = get_eb_page_index(dst_end); + src_i = get_eb_page_index(src_end); - dst_off_in_page = offset_in_page(dst_end); - src_off_in_page = offset_in_page(src_end); + dst_off_in_page = get_eb_offset_in_page(dst, dst_end); + src_off_in_page = get_eb_offset_in_page(dst, src_end); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index c46be27be700..8260f8bb3ff0 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -57,8 +57,9 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ const void *ptr, unsigned long off) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = member_offset >> PAGE_SHIFT; \ - const unsigned long oip = offset_in_page(member_offset); \ + const unsigned long idx = get_eb_page_index(member_offset); \ + const unsigned long oip = get_eb_offset_in_page(token->eb, \ + member_offset); \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ const int part = PAGE_SIZE - oip; \ @@ -85,8 +86,8 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long oip = offset_in_page(member_offset); \ - const unsigned long idx = member_offset >> PAGE_SHIFT; \ + const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ + const 
unsigned long idx = get_eb_page_index(member_offset); \ char *kaddr = page_address(eb->pages[idx]); \ const int size = sizeof(u##bits); \ const int part = PAGE_SIZE - oip; \ @@ -106,8 +107,9 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ u##bits val) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = member_offset >> PAGE_SHIFT; \ - const unsigned long oip = offset_in_page(member_offset); \ + const unsigned long idx = get_eb_page_index(member_offset); \ + const unsigned long oip = get_eb_offset_in_page(token->eb, \ + member_offset); \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ const int part = PAGE_SIZE - oip; \ @@ -136,8 +138,8 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ unsigned long off, u##bits val) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long oip = offset_in_page(member_offset); \ - const unsigned long idx = member_offset >> PAGE_SHIFT; \ + const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ + const unsigned long idx = get_eb_page_index(member_offset); \ char *kaddr = page_address(eb->pages[idx]); \ const int size = sizeof(u##bits); \ const int part = PAGE_SIZE - oip; \ -- cgit v1.2.3 From 9e46458a7c0056dad98f0684c71be65a380b067b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:48:05 +0800 Subject: btrfs: remove btrfs_find_ordered_sum call from btrfs_lookup_bio_sums The function btrfs_lookup_bio_sums() is only called for read bios. While btrfs_find_ordered_sum() is to search ordered extent sums, which is only for write path. This means to read a page we either: - Submit read bio if it's not uptodate This means we only need to search csum tree for checksums. - The page is already uptodate It can be marked uptodate for previous read, or being marked dirty. As we always mark page uptodate for dirty page. In that case, we don't need to submit read bio at all, thus no need to search any checksums. Remove the btrfs_find_ordered_sum() call in btrfs_lookup_bio_sums(). And since btrfs_lookup_bio_sums() is the only caller for btrfs_find_ordered_sum(), also remove the implementation. Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 16 +++++++++++----- fs/btrfs/ordered-data.c | 44 -------------------------------------------- fs/btrfs/ordered-data.h | 2 -- 3 files changed, 11 insertions(+), 51 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 8fa98d55fcfd..443cc31dc6ce 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -239,7 +239,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, } /** - * btrfs_lookup_bio_sums - Look up checksums for a bio. + * btrfs_lookup_bio_sums - Look up checksums for a read bio. + * * @inode: inode that the bio is for. * @bio: bio to look up. * @offset: Unless (u64)-1, look up checksums for this offset in the file. @@ -274,6 +275,15 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) return BLK_STS_OK; + /* + * This function is only called for read bio. + * + * This means two things: + * - All our csums should only be in csum tree + * No ordered extents csums, as ordered extents are only for write + * path. 
+ */ + ASSERT(bio_op(bio) == REQ_OP_READ); path = btrfs_alloc_path(); if (!path) return BLK_STS_RESOURCE; @@ -324,10 +334,6 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (page_offsets) offset = page_offset(bvec.bv_page) + bvec.bv_offset; - count = btrfs_find_ordered_sum(BTRFS_I(inode), offset, - disk_bytenr, csum, nblocks); - if (count) - goto found; if (!item || disk_bytenr < item_start_offset || disk_bytenr >= item_last_offset) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0d61f9fefc02..79d366a36223 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -854,50 +854,6 @@ out: return entry; } -/* - * search the ordered extents for one corresponding to 'offset' and - * try to find a checksum. This is used because we allow pages to - * be reclaimed before their checksum is actually put into the btree - */ -int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, - u64 disk_bytenr, u8 *sum, int len) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_sum *ordered_sum; - struct btrfs_ordered_extent *ordered; - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; - unsigned long num_sectors; - unsigned long i; - const u32 csum_size = fs_info->csum_size; - int index = 0; - - ordered = btrfs_lookup_ordered_extent(inode, offset); - if (!ordered) - return 0; - - spin_lock_irq(&tree->lock); - list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { - if (disk_bytenr >= ordered_sum->bytenr && - disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { - i = (disk_bytenr - ordered_sum->bytenr) >> - fs_info->sectorsize_bits; - num_sectors = ordered_sum->len >> fs_info->sectorsize_bits; - num_sectors = min_t(int, len - index, num_sectors - i); - memcpy(sum + index, ordered_sum->sums + i * csum_size, - num_sectors * csum_size); - - index += (int)num_sectors * csum_size; - if (index == len) - goto out; - disk_bytenr += num_sectors * fs_info->sectorsize; - } - } -out: - spin_unlock_irq(&tree->lock); - btrfs_put_ordered_extent(ordered); - return index; -} - /* * btrfs_flush_ordered_range - Lock the passed range and ensures all pending * ordered extents in it are run to completion. diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 367269effd6a..0bfa82b58e23 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -183,8 +183,6 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( u64 len); void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct list_head *list); -int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, - u64 disk_bytenr, u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, -- cgit v1.2.3 From 6275193ef19033d0cca88df6209556462bbedee2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:48:06 +0800 Subject: btrfs: refactor btrfs_lookup_bio_sums to handle out-of-order bvecs Refactor btrfs_lookup_bio_sums() by: - Remove the @file_offset parameter There are two factors making the @file_offset parameter useless: * For csum lookup in csum tree, file offset makes no sense We only need disk_bytenr, which is unrelated to file_offset * page_offset (file offset) of each bvec is not contiguous. Pages can be added to the same bio as long as their on-disk bytenr is contiguous, meaning we could have pages at different file offsets in the same bio. 
Thus passing file_offset makes no sense any more. The only user of file_offset is for data reloc inode, we will use a new function, search_file_offset_in_bio(), to handle it. - Extract the csum tree lookup into search_csum_tree() The new function will handle the csum search in csum tree. The return value is the same as btrfs_find_ordered_sum(), returning the number of found sectors which have checksum. - Change how we do the main loop The only needed info from bio is: * the on-disk bytenr * the length After extracting the above info, we can do the search without bio at all, which makes the main loop much simpler: for (cur_disk_bytenr = orig_disk_bytenr; cur_disk_bytenr < orig_disk_bytenr + orig_len; cur_disk_bytenr += count * sectorsize) { /* Lookup csum tree */ count = search_csum_tree(fs_info, path, cur_disk_bytenr, search_len, csum_dst); if (!count) { /* Csum hole handling */ } } - Use single variable as the source to calculate all other offsets Instead of all different type of variables, we use only one main variable, cur_disk_bytenr, which represents the current disk bytenr. All involved values can be calculated from that variable, and all those variable will only be visible in the inner loop. The above refactoring makes btrfs_lookup_bio_sums() way more robust than it used to be, especially related to the file offset lookup. Now file_offset lookup is only related to data reloc inode, otherwise we don't need to bother file_offset at all. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/compression.c | 5 +- fs/btrfs/ctree.h | 3 +- fs/btrfs/file-item.c | 250 ++++++++++++++++++++++++++++++++----------------- fs/btrfs/inode.c | 5 +- 4 files changed, 171 insertions(+), 92 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 12d50f1cdc58..5ae3fa0386b7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -719,8 +719,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ refcount_inc(&cb->pending_bios); - ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, - sums); + ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); BUG_ON(ret); /* -ENOMEM */ nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, @@ -746,7 +745,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums); + ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); BUG_ON(ret); /* -ENOMEM */ ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 98d6871b2295..9dde7707873a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3013,8 +3013,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u64 offset, u8 *dst); +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 443cc31dc6ce..1545c22ef280 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -238,13 +238,117 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } +/* + * Find checksums for 
logical bytenr range [disk_bytenr, disk_bytenr + len) and + * estore the result to @dst. + * + * Return >0 for the number of sectors we found. + * Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum + * for it. Caller may want to try next sector until one range is hit. + * Return <0 for fatal error. + */ +static int search_csum_tree(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 disk_bytenr, + u64 len, u8 *dst) +{ + struct btrfs_csum_item *item = NULL; + struct btrfs_key key; + const u32 sectorsize = fs_info->sectorsize; + const u32 csum_size = fs_info->csum_size; + u32 itemsize; + int ret; + u64 csum_start; + u64 csum_len; + + ASSERT(IS_ALIGNED(disk_bytenr, sectorsize) && + IS_ALIGNED(len, sectorsize)); + + /* Check if the current csum item covers disk_bytenr */ + if (path->nodes[0]) { + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + + csum_start = key.offset; + csum_len = (itemsize / csum_size) * sectorsize; + + if (in_range(disk_bytenr, csum_start, csum_len)) + goto found; + } + + /* Current item doesn't contain the desired range, search again */ + btrfs_release_path(path); + item = btrfs_lookup_csum(NULL, fs_info->csum_root, path, disk_bytenr, 0); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + goto out; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + + csum_start = key.offset; + csum_len = (itemsize / csum_size) * sectorsize; + ASSERT(in_range(disk_bytenr, csum_start, csum_len)); + +found: + ret = (min(csum_start + csum_len, disk_bytenr + len) - + disk_bytenr) >> fs_info->sectorsize_bits; + read_extent_buffer(path->nodes[0], dst, (unsigned long)item, + ret * csum_size); +out: + if (ret == -ENOENT) + ret = 0; + return ret; +} + +/* + * Locate the file_offset of @cur_disk_bytenr of a @bio. + * + * Bio of btrfs represents read range of + * [bi_sector << 9, bi_sector << 9 + bi_size). + * Knowing this, we can iterate through each bvec to locate the page belong to + * @cur_disk_bytenr and get the file offset. + * + * @inode is used to determine if the bvec page really belongs to @inode. + * + * Return 0 if we can't find the file offset + * Return >0 if we find the file offset and restore it to @file_offset_ret + */ +static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, + u64 disk_bytenr, u64 *file_offset_ret) +{ + struct bvec_iter iter; + struct bio_vec bvec; + u64 cur = bio->bi_iter.bi_sector << SECTOR_SHIFT; + int ret = 0; + + bio_for_each_segment(bvec, bio, iter) { + struct page *page = bvec.bv_page; + + if (cur > disk_bytenr) + break; + if (cur + bvec.bv_len <= disk_bytenr) { + cur += bvec.bv_len; + continue; + } + ASSERT(in_range(disk_bytenr, cur, bvec.bv_len)); + if (page->mapping && page->mapping->host && + page->mapping->host == inode) { + ret = 1; + *file_offset_ret = page_offset(page) + bvec.bv_offset + + disk_bytenr - cur; + break; + } + } + return ret; +} + /** - * btrfs_lookup_bio_sums - Look up checksums for a read bio. + * Lookup the checksum for the read bio in csum tree. * * @inode: inode that the bio is for. * @bio: bio to look up. - * @offset: Unless (u64)-1, look up checksums for this offset in the file. - * If (u64)-1, use the page offsets from the bio instead. 
* @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If * NULL, the checksum buffer is allocated and returned in @@ -252,25 +356,19 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u64 offset, u8 *dst) +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct bio_vec bvec; - struct bvec_iter iter; - struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_path *path; - const bool page_offsets = (offset == (u64)-1); + const u32 sectorsize = fs_info->sectorsize; + const u32 csum_size = fs_info->csum_size; + u32 orig_len = bio->bi_iter.bi_size; + u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 cur_disk_bytenr; u8 *csum; - u64 item_start_offset = 0; - u64 item_last_offset = 0; - u64 disk_bytenr; - u64 page_bytes_left; - u32 diff; - int nblocks; + const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; - const u32 csum_size = fs_info->csum_size; if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) return BLK_STS_OK; @@ -282,13 +380,16 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, * - All our csums should only be in csum tree * No ordered extents csums, as ordered extents are only for write * path. + * - No need to bother any other info from bvec + * Since we're looking up csums, the only important info is the + * disk_bytenr and the length, which can be extracted from bi_iter + * directly. */ ASSERT(bio_op(bio) == REQ_OP_READ); path = btrfs_alloc_path(); if (!path) return BLK_STS_RESOURCE; - nblocks = bio->bi_iter.bi_size >> fs_info->sectorsize_bits; if (!dst) { struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); @@ -325,81 +426,62 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, path->skip_locking = 1; } - disk_bytenr = bio->bi_iter.bi_sector << 9; + for (cur_disk_bytenr = orig_disk_bytenr; + cur_disk_bytenr < orig_disk_bytenr + orig_len; + cur_disk_bytenr += (count * sectorsize)) { + u64 search_len = orig_disk_bytenr + orig_len - cur_disk_bytenr; + unsigned int sector_offset; + u8 *csum_dst; - bio_for_each_segment(bvec, bio, iter) { - page_bytes_left = bvec.bv_len; - if (count) - goto next; + /* + * Although both cur_disk_bytenr and orig_disk_bytenr is u64, + * we're calculating the offset to the bio start. + * + * Bio size is limited to UINT_MAX, thus unsigned int is large + * enough to contain the raw result, not to mention the right + * shifted result. + */ + ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); + sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> + fs_info->sectorsize_bits; + csum_dst = csum + sector_offset * csum_size; - if (page_offsets) - offset = page_offset(bvec.bv_page) + bvec.bv_offset; + count = search_csum_tree(fs_info, path, cur_disk_bytenr, + search_len, csum_dst); + if (count <= 0) { + /* + * Either we hit a critical error or we didn't find + * the csum. + * Either way, we put zero into the csums dst, and skip + * to the next sector. 
+ */ + memset(csum_dst, 0, csum_size); + count = 1; - if (!item || disk_bytenr < item_start_offset || - disk_bytenr >= item_last_offset) { - struct btrfs_key found_key; - u32 item_size; - - if (item) - btrfs_release_path(path); - item = btrfs_lookup_csum(NULL, fs_info->csum_root, - path, disk_bytenr, 0); - if (IS_ERR(item)) { - count = 1; - memset(csum, 0, csum_size); - if (BTRFS_I(inode)->root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { - set_extent_bits(io_tree, offset, - offset + fs_info->sectorsize - 1, + /* + * For data reloc inode, we need to mark the range + * NODATASUM so that balance won't report false csum + * error. + */ + if (BTRFS_I(inode)->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + u64 file_offset; + int ret; + + ret = search_file_offset_in_bio(bio, inode, + cur_disk_bytenr, &file_offset); + if (ret) + set_extent_bits(io_tree, file_offset, + file_offset + sectorsize - 1, EXTENT_NODATASUM); - } else { - btrfs_info_rl(fs_info, - "no csum found for inode %llu start %llu", - btrfs_ino(BTRFS_I(inode)), offset); - } - item = NULL; - btrfs_release_path(path); - goto found; + } else { + btrfs_warn_rl(fs_info, + "csum hole found for disk bytenr range [%llu, %llu)", + cur_disk_bytenr, cur_disk_bytenr + sectorsize); } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); - - item_start_offset = found_key.offset; - item_size = btrfs_item_size_nr(path->nodes[0], - path->slots[0]); - item_last_offset = item_start_offset + - (item_size / csum_size) * - fs_info->sectorsize; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_csum_item); - } - /* - * this byte range must be able to fit inside - * a single leaf so it will also fit inside a u32 - */ - diff = disk_bytenr - item_start_offset; - diff = diff >> fs_info->sectorsize_bits; - diff = diff * csum_size; - count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >> - fs_info->sectorsize_bits); - read_extent_buffer(path->nodes[0], csum, - ((unsigned long)item) + diff, - csum_size * count); -found: - csum += count * csum_size; - nblocks -= count; -next: - while (count > 0) { - count--; - disk_bytenr += fs_info->sectorsize; - offset += fs_info->sectorsize; - page_bytes_left -= fs_info->sectorsize; - if (!page_bytes_left) - break; /* move to next bio */ } } - WARN_ON_ONCE(count); btrfs_free_path(path); return BLK_STS_OK; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index af19a9aef1e5..8e23780acfae 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2268,7 +2268,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, * need to csum or not, which is why we ignore skip_sum * here. */ - ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL); + ret = btrfs_lookup_bio_sums(inode, bio, NULL); if (ret) goto out; } @@ -7964,8 +7964,7 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, * * If we have csums disabled this will do nothing. 
*/ - status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset, - dip->csums); + status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); if (status != BLK_STS_OK) goto out_err; } -- cgit v1.2.3 From fa485d21a7ae712fef8e943d1dd3ca7b27cb392e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:48:07 +0800 Subject: btrfs: scrub: reduce width of extent_len/stripe_len from 64 to 32 bits Btrfs on-disk format chose to use u64 for almost everything, but there are a other restrictions that won't let us use more than u32 for things like extent length (the maximum length is 128MiB for non-hole extents), or stripe length (we have device number limit). This means if we don't have extra handling to convert u64 to u32, we will always have some questionable operations like "u32 = u64 >> sectorsize_bits" in the code. This patch will try to address the problem by reducing the width for the following members/parameters: - scrub_parity::stripe_len - @len of scrub_pages() - @extent_len of scrub_remap_extent() - @len of scrub_parity_mark_sectors_error() - @len of scrub_parity_mark_sectors_data() - @len of scrub_extent() - @len of scrub_pages_for_parity() - @len of scrub_extent_for_parity() For members extracted from on-disk structure, like map->stripe_len, they will be kept as is. Since that modification would require on-disk format change. There will be cases like "u32 = u64 - u64" or "u32 = u64", for such call sites, extra ASSERT() is added to be extra safe for debug builds. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 54 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 23 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 78759bc9c980..8026606f7510 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -130,7 +130,7 @@ struct scrub_parity { int nsectors; - u64 stripe_len; + u32 stripe_len; refcount_t refs; @@ -233,7 +233,7 @@ static void scrub_parity_get(struct scrub_parity *sparity); static void scrub_parity_put(struct scrub_parity *sparity); static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, struct scrub_page *spage); -static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, u64 physical_for_dev_replace); @@ -241,7 +241,7 @@ static void scrub_bio_end_io(struct bio *bio); static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); static void scrub_remap_extent(struct btrfs_fs_info *fs_info, - u64 extent_logical, u64 extent_len, + u64 extent_logical, u32 extent_len, u64 *extent_physical, struct btrfs_device **extent_dev, int *extent_mirror_num); @@ -2147,7 +2147,7 @@ bbio_out: spin_unlock(&sctx->stat_lock); } -static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, u64 physical_for_dev_replace) @@ -2171,7 +2171,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, for (index = 0; len > 0; index++) { struct scrub_page *spage; - u64 l = min_t(u64, len, PAGE_SIZE); + u32 l = min_t(u32, len, PAGE_SIZE); spage = kzalloc(sizeof(*spage), GFP_KERNEL); if (!spage) { @@ -2292,10 +2292,9 @@ static void scrub_bio_end_io_worker(struct 
btrfs_work *work) static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, unsigned long *bitmap, - u64 start, u64 len) + u64 start, u32 len) { u64 offset; - u64 nsectors64; u32 nsectors; u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits; @@ -2307,10 +2306,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, start -= sparity->logic_start; start = div64_u64_rem(start, sparity->stripe_len, &offset); offset = offset >> sectorsize_bits; - nsectors64 = len >> sectorsize_bits; - - ASSERT(nsectors64 < UINT_MAX); - nsectors = (u32)nsectors64; + nsectors = len >> sectorsize_bits; if (offset + nsectors <= sparity->nsectors) { bitmap_set(bitmap, offset, nsectors); @@ -2322,13 +2318,13 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, } static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, - u64 start, u64 len) + u64 start, u32 len) { __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len); } static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, - u64 start, u64 len) + u64 start, u32 len) { __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len); } @@ -2356,6 +2352,7 @@ static void scrub_block_complete(struct scrub_block *sblock) u64 end = sblock->pagev[sblock->page_count - 1]->logical + PAGE_SIZE; + ASSERT(end - start <= U32_MAX); scrub_parity_mark_sectors_error(sblock->sparity, start, end - start); } @@ -2425,7 +2422,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) /* scrub extent tries to collect up to 64 kB for each bio */ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, - u64 logical, u64 len, + u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u64 physical_for_dev_replace) { @@ -2457,7 +2454,7 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, } while (len) { - u64 l = min_t(u64, len, blocksize); + u32 l = min(len, blocksize); int have_csum = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) { @@ -2480,7 +2477,7 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, } static int scrub_pages_for_parity(struct scrub_parity *sparity, - u64 logical, u64 len, + u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum) { @@ -2506,7 +2503,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, for (index = 0; len > 0; index++) { struct scrub_page *spage; - u64 l = min_t(u64, len, PAGE_SIZE); + u32 l = min_t(u32, len, PAGE_SIZE); spage = kzalloc(sizeof(*spage), GFP_KERNEL); if (!spage) { @@ -2564,7 +2561,7 @@ leave_nomem: } static int scrub_extent_for_parity(struct scrub_parity *sparity, - u64 logical, u64 len, + u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num) { @@ -2588,7 +2585,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, } while (len) { - u64 l = min_t(u64, len, blocksize); + u32 l = min(len, blocksize); int have_csum = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) { @@ -2792,7 +2789,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, u64 generation; u64 extent_logical; u64 extent_physical; - u64 extent_len; + /* Check the comment in scrub_stripe() for why u32 is enough here */ + u32 extent_len; u64 mapped_length; struct btrfs_device *extent_dev; struct scrub_parity *sparity; @@ -2801,6 +2799,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int 
extent_mirror_num; int stop_loop = 0; + ASSERT(map->stripe_len <= U32_MAX); nsectors = map->stripe_len >> fs_info->sectorsize_bits; bitmap_len = scrub_calc_parity_bitmap_len(nsectors); sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, @@ -2812,6 +2811,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, return -ENOMEM; } + ASSERT(map->stripe_len <= U32_MAX); sparity->stripe_len = map->stripe_len; sparity->nsectors = nsectors; sparity->sctx = sctx; @@ -2906,6 +2906,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, } again: extent_logical = key.objectid; + ASSERT(bytes <= U32_MAX); extent_len = bytes; if (extent_logical < logic_start) { @@ -2984,9 +2985,11 @@ next: logic_start += map->stripe_len; } out: - if (ret < 0) + if (ret < 0) { + ASSERT(logic_end - logic_start <= U32_MAX); scrub_parity_mark_sectors_error(sparity, logic_start, logic_end - logic_start); + } scrub_parity_put(sparity); scrub_submit(sctx); mutex_lock(&sctx->wr_lock); @@ -3028,7 +3031,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 offset; u64 extent_logical; u64 extent_physical; - u64 extent_len; + /* + * Unlike chunk length, extent length should never go beyond + * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here. + */ + u32 extent_len; u64 stripe_logical; u64 stripe_end; struct btrfs_device *extent_dev; @@ -3277,6 +3284,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, again: extent_logical = key.objectid; + ASSERT(bytes <= U32_MAX); extent_len = bytes; /* @@ -4074,7 +4082,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, } static void scrub_remap_extent(struct btrfs_fs_info *fs_info, - u64 extent_logical, u64 extent_len, + u64 extent_logical, u32 extent_len, u64 *extent_physical, struct btrfs_device **extent_dev, int *extent_mirror_num) -- cgit v1.2.3
From d0a7a9c050f3d0e11626ee5b3cebb0e4388ffce6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:48:08 +0800 Subject: btrfs: scrub: always allocate one full page for one sector for RAID56
For scrub_pages() and scrub_pages_for_parity(), we currently allocate one scrub_page structure for one page. This is fine if we only read/write one sector at a time. But for cases like scrubbing RAID56, we need to read/write the full stripe, which is 64K for now. With a subpage sector size we would submit the read in just one page, which is normally a good thing, but the RAID56 code expects to see only one sector, not the full stripe, in its endio function. This could lead to a wrong parity checksum for RAID56 on subpage. To make the existing code work well for the subpage case, we take a shortcut and always allocate a full page for one sector. This should provide the base to make RAID56 work for subpage. The cost is obvious: for one RAID56 stripe we now always need 16 pages. For the subpage case (64K page size, 4K sector size) this means a full megabyte (16 sectors, one 64K page each) to scrub just one RAID56 stripe, and for data scrub each 4K sector will also need one 64K page. This is mostly a workaround; the proper fix is a much larger project: replacing scrub_page with scrub_block and allowing scrub_block to handle multiple pages, csums, and the csum_bitmap, so we avoid allocating one page for each sector.
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'fs/btrfs')
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 8026606f7510..a1bf87958f8e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2153,6 +2153,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, u64 physical_for_dev_replace) { struct scrub_block *sblock; + const u32 sectorsize = sctx->fs_info->sectorsize; int index; sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); @@ -2171,7 +2172,12 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, for (index = 0; len > 0; index++) { struct scrub_page *spage; - u32 l = min_t(u32, len, PAGE_SIZE); + /* + * Here we will allocate one page for one sector to scrub. + * This is fine if PAGE_SIZE == sectorsize, but will cost + * more memory for PAGE_SIZE > sectorsize case. + */ + u32 l = min(sectorsize, len); spage = kzalloc(sizeof(*spage), GFP_KERNEL); if (!spage) { @@ -2483,8 +2489,11 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, { struct scrub_ctx *sctx = sparity->sctx; struct scrub_block *sblock; + const u32 sectorsize = sctx->fs_info->sectorsize; int index; + ASSERT(IS_ALIGNED(len, sectorsize)); + sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); if (!sblock) { spin_lock(&sctx->stat_lock); @@ -2503,7 +2512,6 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, for (index = 0; len > 0; index++) { struct scrub_page *spage; - u32 l = min_t(u32, len, PAGE_SIZE); spage = kzalloc(sizeof(*spage), GFP_KERNEL); if (!spage) { @@ -2538,9 +2546,12 @@ leave_nomem: spage->page = alloc_page(GFP_KERNEL); if (!spage->page) goto leave_nomem; - len -= l; - logical += l; - physical += l; + + + /* Iterate over the stripe range in sectorsize steps */ + len -= sectorsize; + logical += sectorsize; + physical += sectorsize; } WARN_ON(sblock->page_count == 0); -- cgit v1.2.3
From 53f3251d3b82f70c762cb7d963d70fb65f49e22c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:48:09 +0800 Subject: btrfs: scrub: support subpage tree block scrub
To support subpage tree block scrub, scrub_checksum_tree_block() only needs to learn 2 new tricks: - Follow the sector size Now that scrub_page only represents one sector, we need to follow it properly. - Run the checksum on all sectors Since scrub_page only represents one sector, we need to run the checksum on all sectors, not only on (nodesize >> PAGE_SHIFT) pages.
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs/btrfs')
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a1bf87958f8e..5ff6ee70ed2b 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1811,12 +1811,22 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; - const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT; + /* + * This is done in sectorsize steps even for metadata as there's a + * constraint for nodesize to be aligned to sectorsize. This will need + * to change so we don't misuse data and metadata units like that.
+ */ + const u32 sectorsize = sctx->fs_info->sectorsize; + const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits; int i; struct scrub_page *spage; char *kaddr; BUG_ON(sblock->page_count < 1); + + /* Each member in pagev is just one block, not a full page */ + ASSERT(sblock->page_count == num_sectors); + spage = sblock->pagev[0]; kaddr = page_address(spage->page); h = (struct btrfs_header *)kaddr; @@ -1845,11 +1855,11 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, - PAGE_SIZE - BTRFS_CSUM_SIZE); + sectorsize - BTRFS_CSUM_SIZE); - for (i = 1; i < num_pages; i++) { + for (i = 1; i < num_sectors; i++) { kaddr = page_address(sblock->pagev[i]->page); - crypto_shash_update(shash, kaddr, PAGE_SIZE); + crypto_shash_update(shash, kaddr, sectorsize); } crypto_shash_final(shash, calculated_csum); -- cgit v1.2.3
From b29dca44abe216a9c29842593cbc18f9a3fe57d2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:48:10 +0800 Subject: btrfs: scrub: support subpage data scrub
Btrfs scrub is more flexible than the buffered data write path, as we can read unaligned subpage data into page offset 0. This ability makes subpage support much easier: we just need to check each scrub_page::page_len and ensure we only calculate the hash for [0, page_len) of a page. There is a small thing to notice: for the subpage case we still do sector-by-sector scrub. This means we will submit a read bio for each sector to scrub, resulting in the same number of read bios as on 4K page systems. This behavior can be considered a good thing if we want everything to be the same as on 4K page systems, but it also means we're missing the chance to submit larger bios with a 64K page size. This is another problem to consider in the future.
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs/btrfs')
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5ff6ee70ed2b..00429e4e64f4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1795,11 +1795,15 @@ static int scrub_checksum_data(struct scrub_block *sblock) shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum); - if (memcmp(csum, spage->csum, sctx->fs_info->csum_size)) - sblock->checksum_error = 1; + /* + * In scrub_pages() and scrub_pages_for_parity() we ensure each spage + * only contains one sector of data. + */ + crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); + if (memcmp(csum, spage->csum, fs_info->csum_size)) + sblock->checksum_error = 1; return sblock->checksum_error; } -- cgit v1.2.3
From b42fe98c92698d2a10094997e5f4d2dd968fd44f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Dec 2020 14:48:11 +0800 Subject: btrfs: scrub: allow scrub to work with subpage sectorsize
Since btrfs scrub uses its own infrastructure to submit reads/writes, scrub is independent from all other routines. This brings one very neat feature: it allows us to read 4K of data into offset 0 of a 64K page, and the same is true for the writeback routine. This makes scrub on a subpage sector size much easier to implement, and thanks to the previous commits that changed the implementation to always scrub based on sector size, scrub can now handle subpage filesystems without any problem.
This patch will just remove the restriction on (sectorsize != PAGE_SIZE), to make scrub finally work on subpage filesystems. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 00429e4e64f4..5f4f88a4d2c8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3885,14 +3885,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } - if (fs_info->sectorsize != PAGE_SIZE) { - /* not supported for data w/o checksums */ - btrfs_err_rl(fs_info, - "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails", - fs_info->sectorsize, PAGE_SIZE); - return -EINVAL; - } - if (fs_info->nodesize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { -- cgit v1.2.3
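The u64-to-u32 narrowing pattern used throughout the scrub series above (assert that a value fits before assigning it to a narrower type) can be illustrated outside the kernel. Below is a minimal, standalone C sketch of that pattern, not btrfs code: the types, the assert() macro, and the example extent values are plain userspace stand-ins for the kernel's u32/u64 and ASSERT().

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical extent bounds, tracked as 64-bit values. */
	uint64_t extent_start = 16ULL * 1024 * 1024;
	uint64_t extent_end = extent_start + 128ULL * 1024 * 1024;
	uint32_t extent_len;

	/* "u32 = u64 - u64": assert the delta fits before narrowing. */
	assert(extent_end - extent_start <= UINT32_MAX);
	extent_len = (uint32_t)(extent_end - extent_start);

	printf("extent_len = %u bytes\n", (unsigned int)extent_len);
	return 0;
}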