summaryrefslogtreecommitdiff
path: root/fs/btrfs/volumes.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-08-31 19:41:22 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2021-08-31 19:41:22 +0300
commit87045e6546078dae215d1bd3b2bc82b3ada3ca77 (patch)
treef3d816b9834ca959514f6e399fb9f505871d2729 /fs/btrfs/volumes.c
parent9c849ce86e0fa93a218614eac562ace44053d7ce (diff)
parent0d977e0eba234e01a60bdde27314dc21374201b3 (diff)
downloadlinux-87045e6546078dae215d1bd3b2bc82b3ada3ca77.tar.xz
Merge tag 'for-5.15-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba: "The highlights of this round are integrations with fs-verity and idmapped mounts, the rest is usual mix of minor improvements, speedups and cleanups. There are some patches outside of btrfs, namely updating some VFS interfaces, all straightforward and acked. Features: - fs-verity support, using standard ioctls, backward compatible with read-only limitation on inodes with previously enabled fs-verity - idmapped mount support - make mount with rescue=ibadroots more tolerant to partially damaged trees - allow raid0 on a single device and raid10 on two devices, degenerate cases but might be useful as an intermediate step during conversion to other profiles - zoned mode block group auto reclaim can be disabled via sysfs knob Performance improvements: - continue readahead of node siblings even if target node is in memory, could speed up full send (on sample test +11%) - batching of delayed items can speed up creating many files - fsync/tree-log speedups - avoid unnecessary work (gains +2% throughput, -2% run time on sample load) - reduced lock contention on renames (on dbench +4% throughput, up to -30% latency) Fixes: - various zoned mode fixes - preemptive flushing threshold tuning, avoid excessive work on almost full filesystems Core: - continued subpage support, preparation for implementing remaining features like compression and defragmentation; with some limitations, write is now enabled on 64K page systems with 4K sectors, still considered experimental - no readahead on compressed reads - inline extents disabled - disabled raid56 profile conversion and mount - improved flushing logic, fixing early ENOSPC on some workloads - inode flags have been internally split to read-only and read-write incompat bit parts, used by fs-verity - new tree items for fs-verity - descriptor item - Merkle tree item - inode operations extended to be namespace-aware - cleanups and refactoring Generic code changes: - fs: new export filemap_fdatawrite_wbc - fs: removed sync_inode - block: bio_trim argument type fixups - vfs: add namespace-aware lookup" * tag 'for-5.15-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (114 commits) btrfs: reset replace target device to allocation state on close btrfs: zoned: fix ordered extent boundary calculation btrfs: do not do preemptive flushing if the majority is global rsv btrfs: reduce the preemptive flushing threshold to 90% btrfs: tree-log: check btrfs_lookup_data_extent return value btrfs: avoid unnecessarily logging directories that had no changes btrfs: allow idmapped mount btrfs: handle ACLs on idmapped mounts btrfs: allow idmapped INO_LOOKUP_USER ioctl btrfs: allow idmapped SUBVOL_SETFLAGS ioctl btrfs: allow idmapped SET_RECEIVED_SUBVOL ioctls btrfs: relax restrictions for SNAP_DESTROY_V2 with subvolids btrfs: allow idmapped SNAP_DESTROY ioctls btrfs: allow idmapped SNAP_CREATE/SUBVOL_CREATE ioctls btrfs: check whether fsgid/fsuid are mapped during subvolume creation btrfs: allow idmapped permission inode op btrfs: allow idmapped setattr inode op btrfs: allow idmapped tmpfile inode op btrfs: allow idmapped symlink inode op btrfs: allow idmapped mkdir inode op ...
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r--fs/btrfs/volumes.c234
1 files changed, 71 insertions, 163 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 70f94b75f25a..ec3a874165de 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -38,7 +38,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.sub_stripes = 2,
.dev_stripes = 1,
.devs_max = 0, /* 0 == as many as possible */
- .devs_min = 4,
+ .devs_min = 2,
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
@@ -103,7 +103,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 0,
- .devs_min = 2,
+ .devs_min = 1,
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
@@ -153,6 +153,32 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
},
};
+/*
+ * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
+ * can be used as index to access btrfs_raid_array[].
+ */
+enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
+{
+ if (flags & BTRFS_BLOCK_GROUP_RAID10)
+ return BTRFS_RAID_RAID10;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1)
+ return BTRFS_RAID_RAID1;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
+ return BTRFS_RAID_RAID1C3;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
+ return BTRFS_RAID_RAID1C4;
+ else if (flags & BTRFS_BLOCK_GROUP_DUP)
+ return BTRFS_RAID_DUP;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID0)
+ return BTRFS_RAID_RAID0;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+ return BTRFS_RAID_RAID5;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+ return BTRFS_RAID_RAID6;
+
+ return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+}
+
const char *btrfs_bg_type_to_raid_name(u64 flags)
{
const int index = btrfs_bg_flags_to_raid_index(flags);
@@ -404,44 +430,6 @@ void __exit btrfs_cleanup_fs_uuids(void)
}
}
-/*
- * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
- * Returned struct is not linked onto any lists and must be destroyed using
- * btrfs_free_device.
- */
-static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_device *dev;
-
- dev = kzalloc(sizeof(*dev), GFP_KERNEL);
- if (!dev)
- return ERR_PTR(-ENOMEM);
-
- /*
- * Preallocate a bio that's always going to be used for flushing device
- * barriers and matches the device lifespan
- */
- dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
- if (!dev->flush_bio) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
- }
-
- INIT_LIST_HEAD(&dev->dev_list);
- INIT_LIST_HEAD(&dev->dev_alloc_list);
- INIT_LIST_HEAD(&dev->post_commit_list);
-
- atomic_set(&dev->reada_in_flight, 0);
- atomic_set(&dev->dev_stats_ccnt, 0);
- btrfs_device_data_ordered_init(dev);
- INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- extent_io_tree_init(fs_info, &dev->alloc_state,
- IO_TREE_DEVICE_ALLOC_STATE, NULL);
-
- return dev;
-}
-
static noinline struct btrfs_fs_devices *find_fsid(
const u8 *fsid, const u8 *metadata_fsid)
{
@@ -1130,6 +1118,9 @@ static void btrfs_close_one_device(struct btrfs_device *device)
fs_devices->rw_devices--;
}
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
fs_devices->missing_devices--;
@@ -1228,7 +1219,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
static int devid_cmp(void *priv, const struct list_head *a,
const struct list_head *b)
{
- struct btrfs_device *dev1, *dev2;
+ const struct btrfs_device *dev1, *dev2;
dev1 = list_entry(a, struct btrfs_device, dev_list);
dev2 = list_entry(b, struct btrfs_device, dev_list);
@@ -1598,14 +1589,9 @@ again:
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
goto out;
- if (ret > 0) {
- ret = btrfs_previous_item(root, path, key.objectid, key.type);
- if (ret < 0)
- goto out;
- }
while (1) {
l = path->nodes[0];
@@ -1759,48 +1745,6 @@ out:
return ret;
}
-static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device,
- u64 chunk_offset, u64 start, u64 num_bytes)
-{
- int ret;
- struct btrfs_path *path;
- struct btrfs_fs_info *fs_info = device->fs_info;
- struct btrfs_root *root = fs_info->dev_root;
- struct btrfs_dev_extent *extent;
- struct extent_buffer *leaf;
- struct btrfs_key key;
-
- WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
- WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = device->devid;
- key.offset = start;
- key.type = BTRFS_DEV_EXTENT_KEY;
- ret = btrfs_insert_empty_item(trans, root, path, &key,
- sizeof(*extent));
- if (ret)
- goto out;
-
- leaf = path->nodes[0];
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_dev_extent);
- btrfs_set_dev_extent_chunk_tree(leaf, extent,
- BTRFS_CHUNK_TREE_OBJECTID);
- btrfs_set_dev_extent_chunk_objectid(leaf, extent,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
- btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
-
- btrfs_set_dev_extent_length(leaf, extent, num_bytes);
- btrfs_mark_buffer_dirty(leaf);
-out:
- btrfs_free_path(path);
- return ret;
-}
-
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree;
@@ -2003,12 +1947,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
if (!(all_avail & btrfs_raid_array[i].bg_flag))
continue;
- if (num_devices < btrfs_raid_array[i].devs_min) {
- int ret = btrfs_raid_array[i].mindev_error;
-
- if (ret)
- return ret;
- }
+ if (num_devices < btrfs_raid_array[i].devs_min)
+ return btrfs_raid_array[i].mindev_error;
}
return 0;
@@ -2137,7 +2077,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (IS_ERR(device)) {
if (PTR_ERR(device) == -ENOENT &&
- strcmp(device_path, "missing") == 0)
+ device_path && strcmp(device_path, "missing") == 0)
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
ret = PTR_ERR(device);
@@ -3622,10 +3562,7 @@ static u64 calc_data_stripes(u64 type, int num_stripes)
const int ncopies = btrfs_raid_array[index].ncopies;
const int nparity = btrfs_raid_array[index].nparity;
- if (nparity)
- return num_stripes - nparity;
- else
- return num_stripes / ncopies;
+ return (num_stripes - nparity) / ncopies;
}
/* [pstart, pend) */
@@ -4025,6 +3962,13 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return true;
+ if (fs_info->sectorsize < PAGE_SIZE &&
+ bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ btrfs_err(fs_info,
+ "RAID56 is not yet supported for sectorsize %u with page size %lu",
+ fs_info->sectorsize, PAGE_SIZE);
+ return false;
+ }
/* Profile is valid and does not have bits outside of the allowed set */
if (alloc_profile_is_valid(bargs->target, 1) &&
(bargs->target & ~allowed) == 0)
@@ -5464,56 +5408,6 @@ out:
}
/*
- * This function, btrfs_finish_chunk_alloc(), belongs to phase 2.
- *
- * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
- * phases.
- */
-int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
- u64 chunk_offset, u64 chunk_size)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_device *device;
- struct extent_map *em;
- struct map_lookup *map;
- u64 dev_offset;
- u64 stripe_size;
- int i;
- int ret = 0;
-
- em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
- map = em->map_lookup;
- stripe_size = em->orig_block_len;
-
- /*
- * Take the device list mutex to prevent races with the final phase of
- * a device replace operation that replaces the device object associated
- * with the map's stripes, because the device object's id can change
- * at any time during that final phase of the device replace operation
- * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
- * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
- * resulting in persisting a device extent item with such ID.
- */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- for (i = 0; i < map->num_stripes; i++) {
- device = map->stripes[i].dev;
- dev_offset = map->stripes[i].physical;
-
- ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
- dev_offset, stripe_size);
- if (ret)
- break;
- }
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-
- free_extent_map(em);
- return ret;
-}
-
-/*
* This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
* phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
* chunks.
@@ -6923,9 +6817,31 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);
- dev = __alloc_device(fs_info);
- if (IS_ERR(dev))
- return dev;
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Preallocate a bio that's always going to be used for flushing device
+ * barriers and matches the device lifespan
+ */
+ dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
+ if (!dev->flush_bio) {
+ kfree(dev);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ INIT_LIST_HEAD(&dev->dev_list);
+ INIT_LIST_HEAD(&dev->dev_alloc_list);
+ INIT_LIST_HEAD(&dev->post_commit_list);
+
+ atomic_set(&dev->reada_in_flight, 0);
+ atomic_set(&dev->dev_stats_ccnt, 0);
+ btrfs_device_data_ordered_init(dev);
+ INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ extent_io_tree_init(fs_info, &dev->alloc_state,
+ IO_TREE_DEVICE_ALLOC_STATE, NULL);
if (devid)
tmp = *devid;
@@ -6961,15 +6877,7 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
{
- int index = btrfs_bg_flags_to_raid_index(type);
- int ncopies = btrfs_raid_array[index].ncopies;
- const int nparity = btrfs_raid_array[index].nparity;
- int data_stripes;
-
- if (nparity)
- data_stripes = num_stripes - nparity;
- else
- data_stripes = num_stripes / ncopies;
+ const int data_stripes = calc_data_stripes(type, num_stripes);
return div_u64(chunk_len, data_stripes);
}
@@ -8144,7 +8052,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
goto out;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_item(root, path);
+ ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
/* No dev extents at all? Not good */