summaryrefslogtreecommitdiff
path: root/fs/btrfs/disk-io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--fs/btrfs/disk-io.c183
1 files changed, 98 insertions, 85 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bbcc3df77646..c6907d533fe8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,20 +74,37 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
- const int num_pages = num_extent_pages(buf);
- const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+ int num_pages;
+ u32 first_page_part;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
int i;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
+
+ if (buf->addr) {
+ /* Pages are contiguous, handle them as a big one. */
+ kaddr = buf->addr;
+ first_page_part = fs_info->nodesize;
+ num_pages = 1;
+ } else {
+ kaddr = folio_address(buf->folios[0]);
+ first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+ num_pages = num_extent_pages(buf);
+ }
+
crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
first_page_part - BTRFS_CSUM_SIZE);
+ /*
+ * Multiple single-page folios case would reach here.
+ *
+ * nodesize <= PAGE_SIZE and large folio all handled by above
+ * crypto_shash_update() already.
+ */
for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
- kaddr = page_address(buf->pages[i]);
+ kaddr = folio_address(buf->folios[i]);
crypto_shash_update(shash, kaddr, PAGE_SIZE);
}
memset(result, 0, BTRFS_CSUM_SIZE);
@@ -166,20 +183,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i, num_pages = num_extent_pages(eb);
+ int num_folios = num_extent_folios(eb);
int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
- u64 start = max_t(u64, eb->start, page_offset(p));
- u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE);
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+ u64 start = max_t(u64, eb->start, folio_pos(folio));
+ u64 end = min_t(u64, eb->start + eb->len,
+ folio_pos(folio) + folio_size(folio));
u32 len = end - start;
ret = btrfs_repair_io_failure(fs_info, 0, start, len,
- start, p, offset_in_page(start), mirror_num);
+ start, folio, offset_in_folio(folio, start),
+ mirror_num);
if (ret)
break;
}
@@ -254,15 +273,20 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
return BLK_STS_IOERR;
- if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
- WARN_ON_ONCE(found_start != 0);
+ /*
+ * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
+ * checksum it but zero-out its content. This is done to preserve
+ * ordering of I/O without unnecessarily writing out data.
+ */
+ if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
+ memzero_extent_buffer(eb, 0, eb->len);
return BLK_STS_OK;
}
if (WARN_ON_ONCE(found_start != eb->start))
return BLK_STS_IOERR;
- if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start,
- eb->len)))
+ if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
+ eb->start, eb->len)))
return BLK_STS_IOERR;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
@@ -371,8 +395,8 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
}
csum_tree_block(eb, result);
- header_csum = page_address(eb->pages[0]) +
- get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
+ header_csum = folio_address(eb->folios[0]) +
+ get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
@@ -639,7 +663,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
- INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+ /* GFP flags are compatible with XA_FLAGS_*. */
+ xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
btrfs_init_root_block_rsv(root);
@@ -650,14 +675,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->reloc_dirty_list);
- INIT_LIST_HEAD(&root->logged_list[0]);
- INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->inode_lock);
spin_lock_init(&root->delalloc_lock);
spin_lock_init(&root->ordered_extent_lock);
spin_lock_init(&root->accounting_lock);
- spin_lock_init(&root->log_extents_lock[0]);
- spin_lock_init(&root->log_extents_lock[1]);
spin_lock_init(&root->qgroup_meta_rsv_lock);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
@@ -2618,9 +2639,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
*/
btrfs_set_super_log_root(sb, 0);
- /* We can't trust the free space cache either */
- btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
-
btrfs_warn(fs_info, "try to load backup roots slot %d", i);
ret = read_backup_root(fs_info, i);
backup_index = ret;
@@ -2724,7 +2742,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->allocated_ebs);
spin_lock_init(&fs_info->eb_leak_lock);
#endif
- extent_map_tree_init(&fs_info->mapping_tree);
+ fs_info->mapping_tree = RB_ROOT_CACHED;
+ rwlock_init(&fs_info->mapping_tree_lock);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
@@ -2794,6 +2813,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->sectorsize_bits = ilog2(4096);
fs_info->stripesize = 4096;
+ /* Default compress algorithm when user does -o compress */
+ fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
spin_lock_init(&fs_info->swapfile_pins_lock);
@@ -2931,17 +2953,6 @@ out:
}
/*
- * Some options only have meaning at mount time and shouldn't persist across
- * remounts, or be displayed. Clear these at the end of mount and remount
- * code paths.
- */
-void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
-{
- btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
- btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
-}
-
-/*
* Mounting logic specific to read-write file systems. Shared by open_ctree
* and btrfs_remount when remounting from read-only to read-write.
*/
@@ -2953,7 +2964,11 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- rebuild_free_space_tree = true;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ btrfs_warn(fs_info,
+ "'clear_cache' option is ignored with extent tree v2");
+ else
+ rebuild_free_space_tree = true;
} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
btrfs_warn(fs_info, "free space tree is invalid");
@@ -3276,13 +3291,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
WRITE_ONCE(fs_info->fs_error, -EUCLEAN);
- /*
- * In the long term, we'll store the compression type in the super
- * block, and it'll be used for per file compression control.
- */
- fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
-
-
/* Set up fs_info before parsing mount options */
nodesize = btrfs_super_nodesize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
@@ -3296,28 +3304,30 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
- ret = btrfs_parse_options(fs_info, options, sb->s_flags);
- if (ret)
+ /*
+ * Handle the space caching options appropriately now that we have the
+ * super block loaded and validated.
+ */
+ btrfs_set_free_space_cache_settings(fs_info);
+
+ if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) {
+ ret = -EINVAL;
goto fail_alloc;
+ }
ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
if (ret < 0)
goto fail_alloc;
+ /*
+ * At this point our mount options are validated, if we set ->max_inline
+ * to something non-standard make sure we truncate it to sectorsize.
+ */
+ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
+
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
- /*
- * V1 space cache has some hardcoded PAGE_SIZE usage, and is
- * going to be deprecated.
- *
- * Force to use v2 cache for subpage case.
- */
- btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
- btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
- "forcing free space tree for sector size %u with page size %lu",
- sectorsize, PAGE_SIZE);
-
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
@@ -3494,29 +3504,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_cleaner;
}
- if (!btrfs_test_opt(fs_info, NOSSD) &&
- !fs_info->fs_devices->rotating) {
- btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
- }
-
- /*
- * For devices supporting discard turn on discard=async automatically,
- * unless it's already set or disabled. This could be turned off by
- * nodiscard for the same mount.
- *
- * The zoned mode piggy backs on the discard functionality for
- * resetting a zone. There is no reason to delay the zone reset as it is
- * fast enough. So, do not enable async discard for zoned mode.
- */
- if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
- btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
- btrfs_test_opt(fs_info, NODISCARD)) &&
- fs_info->fs_devices->discardable &&
- !btrfs_is_zoned(fs_info)) {
- btrfs_set_and_info(fs_info, DISCARD_ASYNC,
- "auto enabling async discard");
- }
-
ret = btrfs_read_qgroup_config(fs_info);
if (ret)
goto fail_trans_kthread;
@@ -3542,7 +3529,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
if (sb_rdonly(sb))
- goto clear_oneshot;
+ return 0;
ret = btrfs_start_pre_rw_mount(fs_info);
if (ret) {
@@ -3570,8 +3557,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
wake_up_process(fs_info->cleaner_kthread);
-clear_oneshot:
- btrfs_clear_oneshot_options(fs_info);
return 0;
fail_qgroup:
@@ -3608,7 +3593,7 @@ fail_sb_buffer:
btrfs_stop_all_workers(fs_info);
btrfs_free_block_groups(fs_info);
fail_alloc:
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
iput(fs_info->btree_inode);
fail:
@@ -4391,7 +4376,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
iput(fs_info->btree_inode);
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
btrfs_close_devices(fs_info->fs_devices);
}
@@ -4799,6 +4784,32 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
}
}
+static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *gang[8];
+ int i;
+ int ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while (1) {
+ ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang),
+ BTRFS_ROOT_TRANS_TAG);
+ if (ret == 0)
+ break;
+ for (i = 0; i < ret; i++) {
+ struct btrfs_root *root = gang[i];
+
+ btrfs_qgroup_free_meta_all_pertrans(root);
+ radix_tree_tag_clear(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ }
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+}
+
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_fs_info *fs_info)
{
@@ -4827,6 +4838,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
EXTENT_DIRTY);
btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
+ btrfs_free_all_qgroup_pertrans(fs_info);
+
cur_trans->state =TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
}