diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-08-31 19:41:22 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-08-31 19:41:22 +0300 |
commit | 87045e6546078dae215d1bd3b2bc82b3ada3ca77 (patch) | |
tree | f3d816b9834ca959514f6e399fb9f505871d2729 /fs/btrfs/block-group.c | |
parent | 9c849ce86e0fa93a218614eac562ace44053d7ce (diff) | |
parent | 0d977e0eba234e01a60bdde27314dc21374201b3 (diff) | |
download | linux-87045e6546078dae215d1bd3b2bc82b3ada3ca77.tar.xz |
Merge tag 'for-5.15-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"The highlights of this round are integrations with fs-verity and
idmapped mounts, the rest is usual mix of minor improvements, speedups
and cleanups.
There are some patches outside of btrfs, namely updating some VFS
interfaces, all straightforward and acked.
Features:
- fs-verity support, using standard ioctls, backward compatible with
read-only limitation on inodes with previously enabled fs-verity
- idmapped mount support
- make mount with rescue=ibadroots more tolerant to partially damaged
trees
- allow raid0 on a single device and raid10 on two devices,
degenerate cases but might be useful as an intermediate step during
conversion to other profiles
- zoned mode block group auto reclaim can be disabled via sysfs knob
Performance improvements:
- continue readahead of node siblings even if target node is in
memory, could speed up full send (on sample test +11%)
- batching of delayed items can speed up creating many files
- fsync/tree-log speedups
- avoid unnecessary work (gains +2% throughput, -2% run time on
sample load)
- reduced lock contention on renames (on dbench +4% throughput,
up to -30% latency)
Fixes:
- various zoned mode fixes
- preemptive flushing threshold tuning, avoid excessive work on
almost full filesystems
Core:
- continued subpage support, preparation for implementing remaining
features like compression and defragmentation; with some
limitations, write is now enabled on 64K page systems with 4K
sectors, still considered experimental
- no readahead on compressed reads
- inline extents disabled
- disabled raid56 profile conversion and mount
- improved flushing logic, fixing early ENOSPC on some workloads
- inode flags have been internally split to read-only and read-write
incompat bit parts, used by fs-verity
- new tree items for fs-verity
- descriptor item
- Merkle tree item
- inode operations extended to be namespace-aware
- cleanups and refactoring
Generic code changes:
- fs: new export filemap_fdatawrite_wbc
- fs: removed sync_inode
- block: bio_trim argument type fixups
- vfs: add namespace-aware lookup"
* tag 'for-5.15-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (114 commits)
btrfs: reset replace target device to allocation state on close
btrfs: zoned: fix ordered extent boundary calculation
btrfs: do not do preemptive flushing if the majority is global rsv
btrfs: reduce the preemptive flushing threshold to 90%
btrfs: tree-log: check btrfs_lookup_data_extent return value
btrfs: avoid unnecessarily logging directories that had no changes
btrfs: allow idmapped mount
btrfs: handle ACLs on idmapped mounts
btrfs: allow idmapped INO_LOOKUP_USER ioctl
btrfs: allow idmapped SUBVOL_SETFLAGS ioctl
btrfs: allow idmapped SET_RECEIVED_SUBVOL ioctls
btrfs: relax restrictions for SNAP_DESTROY_V2 with subvolids
btrfs: allow idmapped SNAP_DESTROY ioctls
btrfs: allow idmapped SNAP_CREATE/SUBVOL_CREATE ioctls
btrfs: check whether fsgid/fsuid are mapped during subvolume creation
btrfs: allow idmapped permission inode op
btrfs: allow idmapped setattr inode op
btrfs: allow idmapped tmpfile inode op
btrfs: allow idmapped symlink inode op
btrfs: allow idmapped mkdir inode op
...
Diffstat (limited to 'fs/btrfs/block-group.c')
-rw-r--r-- | fs/btrfs/block-group.c | 114 |
1 files changed, 111 insertions, 3 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9e7d9d0c763d..a3b830b8410a 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1561,7 +1561,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); ret = btrfs_relocate_chunk(fs_info, bg->start); - if (ret) + if (ret && ret != -EAGAIN) btrfs_err(fs_info, "error relocating chunk %llu", bg->start); @@ -2105,11 +2105,22 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) bg->used = em->len; bg->flags = map->type; ret = btrfs_add_block_group_cache(fs_info, bg); + /* + * We may have some valid block group cache added already, in + * that case we skip to the next one. + */ + if (ret == -EEXIST) { + ret = 0; + btrfs_put_block_group(bg); + continue; + } + if (ret) { btrfs_remove_free_space_cache(bg); btrfs_put_block_group(bg); break; } + btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, 0, 0, &space_info); bg->space_info = space_info; @@ -2212,6 +2223,14 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) ret = check_chunk_block_group_mappings(info); error: btrfs_free_path(path); + /* + * We've hit some error while reading the extent tree, and have + * rescue=ibadroots mount option. + * Try to fill the tree using dummy block groups so that the user can + * continue to mount and grab their data. + */ + if (ret && btrfs_test_opt(info, IGNOREBADROOTS)) + ret = fill_dummy_bgs(info); return ret; } @@ -2244,6 +2263,95 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); } +static int insert_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 chunk_offset, + u64 start, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = device->fs_info; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_path *path; + struct btrfs_dev_extent *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret; + + WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); + WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = start; + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); + if (ret) + goto out; + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); + btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID); + btrfs_set_dev_extent_chunk_objectid(leaf, extent, + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); + + btrfs_set_dev_extent_length(leaf, extent, num_bytes); + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return ret; +} + +/* + * This function belongs to phase 2. + * + * See the comment at btrfs_chunk_alloc() for details about the chunk allocation + * phases. + */ +static int insert_dev_extents(struct btrfs_trans_handle *trans, + u64 chunk_offset, u64 chunk_size) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_device *device; + struct extent_map *em; + struct map_lookup *map; + u64 dev_offset; + u64 stripe_size; + int i; + int ret = 0; + + em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + stripe_size = em->orig_block_len; + + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with the map's stripes, because the device object's id can change + * at any time during that final phase of the device replace operation + * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the + * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, + * resulting in persisting a device extent item with such ID. + */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; + + ret = insert_dev_extent(trans, device, chunk_offset, dev_offset, + stripe_size); + if (ret) + break; + } + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + free_extent_map(em); + return ret; +} + /* * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of * chunk allocation. @@ -2278,8 +2386,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) if (ret) btrfs_abort_transaction(trans, ret); } - ret = btrfs_finish_chunk_alloc(trans, block_group->start, - block_group->length); + ret = insert_dev_extents(trans, block_group->start, + block_group->length); if (ret) btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, block_group); |