summary refs log tree commit diff
path: root/fs/btrfs/zoned.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/zoned.c')
-rw-r--r-- fs/btrfs/zoned.c 452
1 files changed, 289 insertions, 163 deletions
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 09bc325d075d..3504ade30cb0 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1282,21 +1282,284 @@ out:
return ret;
}
+/*
+ * Per-stripe zone state collected by btrfs_load_zone_info() and consumed by
+ * the per-profile btrfs_load_block_group_*() helpers.
+ */
+struct zone_info {
+ /* Copied from map->stripes[idx].physical for the stripe. */
+ u64 physical;
+ /*
+  * zone.capacity << SECTOR_SHIFT as reported for the zone. Only filled in
+  * once a sequential zone has been reported successfully.
+  */
+ u64 capacity;
+ /*
+  * (zone.wp - zone.start) << SECTOR_SHIFT for a partially used zone, 0 for
+  * an empty one, the capacity for a full one, or one of the sentinels
+  * WP_MISSING_DEV / WP_CONVENTIONAL.
+  */
+ u64 alloc_offset;
+};
+
+/*
+ * Load the state of the zone backing one stripe of a chunk mapping.
+ *
+ * @fs_info:  filesystem the mapping belongs to
+ * @zone_idx: index of the stripe in @map; also the bit index used in @active
+ * @info:     output. ->physical is always set and ->alloc_offset is set on
+ *            every successful return; ->capacity is only written once the
+ *            zone has been reported as a sequential zone.
+ * @active:   bitmap indexed by stripe. The bit is set when the device has no
+ *            max_active_zones limit or when the zone is partially written.
+ * @map:      chunk mapping being loaded
+ *
+ * A device without a bdev, a zone report failing with -EIO or -EOPNOTSUPP,
+ * and an offline or read-only zone are not treated as errors: they are
+ * recorded as WP_MISSING_DEV in @info->alloc_offset.
+ *
+ * Return: 0 on success, -EIO if the zone unexpectedly reports as
+ * conventional, or any other error returned by btrfs_get_dev_zone().
+ */
+static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
+				struct zone_info *info, unsigned long *active,
+				struct map_lookup *map)
+{
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	struct btrfs_device *device = map->stripes[zone_idx].dev;
+	unsigned int nofs_flag;
+	struct blk_zone zone;
+	int ret;
+
+	info->physical = map->stripes[zone_idx].physical;
+
+	if (!device->bdev) {
+		info->alloc_offset = WP_MISSING_DEV;
+		return 0;
+	}
+
+	/* Consider a zone as active if we can allow any number of active zones. */
+	if (!device->zone_info->max_active_zones)
+		__set_bit(zone_idx, active);
+
+	if (!btrfs_dev_is_sequential(device, info->physical)) {
+		info->alloc_offset = WP_CONVENTIONAL;
+		return 0;
+	}
+
+	/* This zone will be used for allocation, so mark this zone non-empty. */
+	btrfs_dev_clear_zone_empty(device, info->physical);
+
+	/* Mirror the non-empty marking onto a running replace target, if any. */
+	down_read(&dev_replace->rwsem);
+	if (btrfs_dev_replace_is_ongoing(dev_replace) &&
+	    dev_replace->tgtdev != NULL)
+		btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);
+	up_read(&dev_replace->rwsem);
+
+	/*
+	 * The group is mapped to a sequential zone. Get the zone write pointer
+	 * to determine the allocation offset within the zone.
+	 */
+	WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
+	nofs_flag = memalloc_nofs_save();
+	ret = btrfs_get_dev_zone(device, info->physical, &zone);
+	memalloc_nofs_restore(nofs_flag);
+	if (ret) {
+		/* Only -EIO and -EOPNOTSUPP are downgraded to "missing". */
+		if (ret != -EIO && ret != -EOPNOTSUPP)
+			return ret;
+		info->alloc_offset = WP_MISSING_DEV;
+		return 0;
+	}
+
+	if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+		btrfs_err_in_rcu(fs_info,
+		"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
+			zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
+			device->devid);
+		return -EIO;
+	}
+
+	info->capacity = (zone.capacity << SECTOR_SHIFT);
+
+	switch (zone.cond) {
+	case BLK_ZONE_COND_OFFLINE:
+	case BLK_ZONE_COND_READONLY:
+		/*
+		 * rcu_str_deref() is used on device->name here, so print with
+		 * the _in_rcu variant like the message above does.
+		 */
+		btrfs_err_in_rcu(fs_info,
+		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
+			(info->physical >> device->zone_info->zone_size_shift),
+			rcu_str_deref(device->name), device->devid);
+		info->alloc_offset = WP_MISSING_DEV;
+		break;
+	case BLK_ZONE_COND_EMPTY:
+		info->alloc_offset = 0;
+		break;
+	case BLK_ZONE_COND_FULL:
+		info->alloc_offset = info->capacity;
+		break;
+	default:
+		/* Partially used zone. */
+		info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT);
+		__set_bit(zone_idx, active);
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Initialise a block group using the "single" profile from the state of its
+ * only zone.
+ *
+ * Copies the allocation offset and capacity from @info into @bg and flags the
+ * block group as zone-active when bit 0 of @active is set.
+ *
+ * Return: 0 on success, -EIO if @info carries WP_MISSING_DEV.
+ */
+static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
+					 struct zone_info *info,
+					 unsigned long *active)
+{
+	const u64 wp_offset = info->alloc_offset;
+
+	if (wp_offset == WP_MISSING_DEV) {
+		btrfs_err(bg->fs_info,
+			  "zoned: cannot recover write pointer for zone %llu",
+			  info->physical);
+		return -EIO;
+	}
+
+	bg->alloc_offset = wp_offset;
+	bg->zone_capacity = info->capacity;
+
+	if (!test_bit(0, active))
+		return 0;
+
+	set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+	return 0;
+}
+
+/*
+ * Initialise a DUP block group from the state of its two zones.
+ *
+ * Both zones must have a recoverable write pointer and the two write pointer
+ * offsets must agree. If only one of the two zones is marked in @active the
+ * block group is activated through btrfs_zone_activate(); if both are marked
+ * the active flag is set directly.
+ *
+ * Return: 0 on success, -EINVAL for a data block group without a
+ * raid-stripe-tree, -EIO for a missing or mismatching write pointer or when
+ * activation fails.
+ */
+static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
+				      struct map_lookup *map,
+				      struct zone_info *zone_info,
+				      unsigned long *active)
+{
+	struct btrfs_fs_info *fs_info = bg->fs_info;
+	struct zone_info *first = &zone_info[0];
+	struct zone_info *second = &zone_info[1];
+
+	if (!fs_info->stripe_root && (map->type & BTRFS_BLOCK_GROUP_DATA)) {
+		btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree");
+		return -EINVAL;
+	}
+
+	/* Either copy without a usable write pointer is fatal for DUP. */
+	for (int copy = 0; copy < 2; copy++) {
+		if (zone_info[copy].alloc_offset != WP_MISSING_DEV)
+			continue;
+
+		btrfs_err(fs_info,
+			  "zoned: cannot recover write pointer for zone %llu",
+			  zone_info[copy].physical);
+		return -EIO;
+	}
+
+	if (first->alloc_offset != second->alloc_offset) {
+		btrfs_err(fs_info,
+			  "zoned: write pointer offset mismatch of zones in DUP profile");
+		return -EIO;
+	}
+
+	if (test_bit(0, active) == test_bit(1, active)) {
+		if (test_bit(0, active))
+			set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+	} else if (!btrfs_zone_activate(bg)) {
+		return -EIO;
+	}
+
+	bg->alloc_offset = first->alloc_offset;
+	bg->zone_capacity = min(first->capacity, second->capacity);
+	return 0;
+}
+
+/*
+ * Initialise a mirrored (RAID1, RAID1C3, RAID1C4) block group from the state
+ * of its zones.
+ *
+ * Stripes whose allocation offset is WP_MISSING_DEV or WP_CONVENTIONAL are
+ * skipped. For the remaining stripes the write pointer offset must match the
+ * one of stripe 0 and the active state must either match stripe 0 or be
+ * fixed up with btrfs_zone_activate(); both requirements are waived when the
+ * DEGRADED option is set.
+ *
+ * Return: 0 on success, -EINVAL for a data block group without a
+ * raid-stripe-tree, -EIO on a write pointer mismatch or failed activation.
+ */
+static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
+					struct map_lookup *map,
+					struct zone_info *zone_info,
+					unsigned long *active)
+{
+	struct btrfs_fs_info *fs_info = bg->fs_info;
+	u64 capacity = 0;
+	int i;
+
+	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+			  btrfs_bg_type_to_raid_name(map->type));
+		return -EINVAL;
+	}
+
+	/*
+	 * The usable capacity is bounded by the smallest zone among all
+	 * mirrors, and RAID1C3/RAID1C4 have more than two of them.
+	 * In case a device is missing we have a cap of 0, so don't use it.
+	 */
+	for (i = 0; i < map->num_stripes; i++)
+		capacity = min_not_zero(capacity, zone_info[i].capacity);
+
+	for (i = 0; i < map->num_stripes; i++) {
+		if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+		    zone_info[i].alloc_offset == WP_CONVENTIONAL)
+			continue;
+
+		if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
+		    !btrfs_test_opt(fs_info, DEGRADED)) {
+			btrfs_err(fs_info,
+			"zoned: write pointer offset mismatch of zones in %s profile",
+				  btrfs_bg_type_to_raid_name(map->type));
+			return -EIO;
+		}
+		if (test_bit(0, active) != test_bit(i, active)) {
+			if (!btrfs_test_opt(fs_info, DEGRADED) &&
+			    !btrfs_zone_activate(bg)) {
+				return -EIO;
+			}
+		} else {
+			if (test_bit(0, active))
+				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+		}
+		/* Only published once at least one stripe has been checked. */
+		bg->zone_capacity = capacity;
+	}
+
+	/*
+	 * The loop above never breaks out early, so i == map->num_stripes here
+	 * and the fallback below reads the last stripe.
+	 * NOTE(review): that stripe is not re-checked for WP_MISSING_DEV or
+	 * WP_CONVENTIONAL - confirm the caller copes with a sentinel value.
+	 */
+	if (zone_info[0].alloc_offset != WP_MISSING_DEV)
+		bg->alloc_offset = zone_info[0].alloc_offset;
+	else
+		bg->alloc_offset = zone_info[i - 1].alloc_offset;
+
+	return 0;
+}
+
+/*
+ * Initialise a RAID0 block group from the state of its zones.
+ *
+ * Stripes marked WP_MISSING_DEV or WP_CONVENTIONAL are skipped. Every other
+ * stripe adds its capacity and allocation offset to the totals kept in @bg.
+ * A stripe whose active bit differs from the one of stripe 0 triggers
+ * btrfs_zone_activate(); otherwise the active flag is set when stripe 0 is
+ * active.
+ *
+ * Return: 0 on success, -EINVAL for a data block group without a
+ * raid-stripe-tree, -EIO when activation fails.
+ */
+static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
+					struct map_lookup *map,
+					struct zone_info *zone_info,
+					unsigned long *active)
+{
+	struct btrfs_fs_info *fs_info = bg->fs_info;
+
+	if (!fs_info->stripe_root && (map->type & BTRFS_BLOCK_GROUP_DATA)) {
+		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+			  btrfs_bg_type_to_raid_name(map->type));
+		return -EINVAL;
+	}
+
+	for (int idx = 0; idx < map->num_stripes; idx++) {
+		const struct zone_info *stripe = &zone_info[idx];
+
+		/* No usable write pointer for this stripe. */
+		if (stripe->alloc_offset == WP_MISSING_DEV ||
+		    stripe->alloc_offset == WP_CONVENTIONAL)
+			continue;
+
+		if (test_bit(0, active) == test_bit(idx, active)) {
+			if (test_bit(0, active))
+				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+		} else if (!btrfs_zone_activate(bg)) {
+			return -EIO;
+		}
+
+		bg->zone_capacity += stripe->capacity;
+		bg->alloc_offset += stripe->alloc_offset;
+	}
+
+	return 0;
+}
+
+/*
+ * Initialise a RAID10 block group from the state of its zones.
+ *
+ * Stripes marked WP_MISSING_DEV or WP_CONVENTIONAL are skipped. The active
+ * state of every other stripe is reconciled against stripe 0, calling
+ * btrfs_zone_activate() on a mismatch. Capacity and allocation offset are
+ * accumulated into @bg only for stripes whose index is a multiple of
+ * map->sub_stripes.
+ *
+ * Return: 0 on success, -EINVAL for a data block group without a
+ * raid-stripe-tree, -EIO when activation fails.
+ */
+static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
+					 struct map_lookup *map,
+					 struct zone_info *zone_info,
+					 unsigned long *active)
+{
+	struct btrfs_fs_info *fs_info = bg->fs_info;
+
+	if (!fs_info->stripe_root && (map->type & BTRFS_BLOCK_GROUP_DATA)) {
+		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+			  btrfs_bg_type_to_raid_name(map->type));
+		return -EINVAL;
+	}
+
+	for (int idx = 0; idx < map->num_stripes; idx++) {
+		const struct zone_info *stripe = &zone_info[idx];
+
+		/* No usable write pointer for this stripe. */
+		if (stripe->alloc_offset == WP_MISSING_DEV ||
+		    stripe->alloc_offset == WP_CONVENTIONAL)
+			continue;
+
+		if (test_bit(0, active) == test_bit(idx, active)) {
+			if (test_bit(0, active))
+				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+		} else if (!btrfs_zone_activate(bg)) {
+			return -EIO;
+		}
+
+		/* Only indices that are a multiple of sub_stripes are counted. */
+		if ((idx % map->sub_stripes) != 0)
+			continue;
+
+		bg->zone_capacity += stripe->capacity;
+		bg->alloc_offset += stripe->alloc_offset;
+	}
+
+	return 0;
+}
+
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_device *device;
u64 logical = cache->start;
u64 length = cache->length;
+ struct zone_info *zone_info = NULL;
int ret;
int i;
- unsigned int nofs_flag;
- u64 *alloc_offsets = NULL;
- u64 *caps = NULL;
- u64 *physical = NULL;
unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
@@ -1328,20 +1591,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
- alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
- if (!alloc_offsets) {
- ret = -ENOMEM;
- goto out;
- }
-
- caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
- if (!caps) {
- ret = -ENOMEM;
- goto out;
- }
-
- physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
- if (!physical) {
+ zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
+ if (!zone_info) {
ret = -ENOMEM;
goto out;
}
@@ -1353,98 +1604,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
for (i = 0; i < map->num_stripes; i++) {
- bool is_sequential;
- struct blk_zone zone;
- struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- int dev_replace_is_ongoing = 0;
-
- device = map->stripes[i].dev;
- physical[i] = map->stripes[i].physical;
-
- if (device->bdev == NULL) {
- alloc_offsets[i] = WP_MISSING_DEV;
- continue;
- }
-
- is_sequential = btrfs_dev_is_sequential(device, physical[i]);
- if (is_sequential)
- num_sequential++;
- else
- num_conventional++;
-
- /*
- * Consider a zone as active if we can allow any number of
- * active zones.
- */
- if (!device->zone_info->max_active_zones)
- __set_bit(i, active);
-
- if (!is_sequential) {
- alloc_offsets[i] = WP_CONVENTIONAL;
- continue;
- }
-
- /*
- * This zone will be used for allocation, so mark this zone
- * non-empty.
- */
- btrfs_dev_clear_zone_empty(device, physical[i]);
-
- down_read(&dev_replace->rwsem);
- dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
- btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
- up_read(&dev_replace->rwsem);
-
- /*
- * The group is mapped to a sequential zone. Get the zone write
- * pointer to determine the allocation offset within the zone.
- */
- WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
- nofs_flag = memalloc_nofs_save();
- ret = btrfs_get_dev_zone(device, physical[i], &zone);
- memalloc_nofs_restore(nofs_flag);
- if (ret == -EIO || ret == -EOPNOTSUPP) {
- ret = 0;
- alloc_offsets[i] = WP_MISSING_DEV;
- continue;
- } else if (ret) {
- goto out;
- }
-
- if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
- btrfs_err_in_rcu(fs_info,
- "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
- zone.start << SECTOR_SHIFT,
- rcu_str_deref(device->name), device->devid);
- ret = -EIO;
+ ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
+ if (ret)
goto out;
- }
- caps[i] = (zone.capacity << SECTOR_SHIFT);
-
- switch (zone.cond) {
- case BLK_ZONE_COND_OFFLINE:
- case BLK_ZONE_COND_READONLY:
- btrfs_err(fs_info,
- "zoned: offline/readonly zone %llu on device %s (devid %llu)",
- physical[i] >> device->zone_info->zone_size_shift,
- rcu_str_deref(device->name), device->devid);
- alloc_offsets[i] = WP_MISSING_DEV;
- break;
- case BLK_ZONE_COND_EMPTY:
- alloc_offsets[i] = 0;
- break;
- case BLK_ZONE_COND_FULL:
- alloc_offsets[i] = caps[i];
- break;
- default:
- /* Partially used zone */
- alloc_offsets[i] =
- ((zone.wp - zone.start) << SECTOR_SHIFT);
- __set_bit(i, active);
- break;
- }
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ num_conventional++;
+ else
+ num_sequential++;
}
if (num_sequential > 0)
@@ -1468,63 +1635,24 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
case 0: /* single */
- if (alloc_offsets[0] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[0]);
- ret = -EIO;
- goto out;
- }
- cache->alloc_offset = alloc_offsets[0];
- cache->zone_capacity = caps[0];
- if (test_bit(0, active))
- set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
+ ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
break;
case BTRFS_BLOCK_GROUP_DUP:
- if (map->type & BTRFS_BLOCK_GROUP_DATA) {
- btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
- ret = -EINVAL;
- goto out;
- }
- if (alloc_offsets[0] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[0]);
- ret = -EIO;
- goto out;
- }
- if (alloc_offsets[1] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[1]);
- ret = -EIO;
- goto out;
- }
- if (alloc_offsets[0] != alloc_offsets[1]) {
- btrfs_err(fs_info,
- "zoned: write pointer offset mismatch of zones in DUP profile");
- ret = -EIO;
- goto out;
- }
- if (test_bit(0, active) != test_bit(1, active)) {
- if (!btrfs_zone_activate(cache)) {
- ret = -EIO;
- goto out;
- }
- } else {
- if (test_bit(0, active))
- set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
- &cache->runtime_flags);
- }
- cache->alloc_offset = alloc_offsets[0];
- cache->zone_capacity = min(caps[0], caps[1]);
+ ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
break;
case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID0:
+ ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID10:
+ ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID5:
case BTRFS_BLOCK_GROUP_RAID6:
- /* non-single profiles are not supported yet */
default:
btrfs_err(fs_info, "zoned: profile %s not yet supported",
btrfs_bg_type_to_raid_name(map->type));
@@ -1570,9 +1698,7 @@ out:
cache->physical_map = NULL;
}
bitmap_free(active);
- kfree(physical);
- kfree(caps);
- kfree(alloc_offsets);
+ kfree(zone_info);
free_extent_map(em);
return ret;
@@ -1609,7 +1735,7 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans,
set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
set_extent_buffer_dirty(eb);
set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
- EXTENT_DIRTY | EXTENT_NOWAIT, NULL);
+ EXTENT_DIRTY, NULL);
}
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
@@ -1887,7 +2013,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
int i, ret;
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &mapped_length, &bioc, NULL, NULL, 1);
+ &mapped_length, &bioc, NULL, NULL);
if (ret || !bioc || mapped_length < PAGE_SIZE) {
ret = -EIO;
goto out_put_bioc;