summaryrefslogtreecommitdiff
path: root/fs/btrfs/zoned.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/zoned.c')
-rw-r--r--fs/btrfs/zoned.c292
1 files changed, 231 insertions, 61 deletions
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 72b90bc19a19..09bc325d075d 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -65,6 +65,9 @@
#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
+static void wait_eb_writebacks(struct btrfs_block_group *block_group);
+static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written);
+
static inline bool sb_zone_is_full(const struct blk_zone *zone)
{
return (zone->cond == BLK_ZONE_COND_FULL) ||
@@ -465,8 +468,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
* use the cache.
*/
if (populate_cache && bdev_is_zoned(device->bdev)) {
- zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
- zone_info->nr_zones);
+ zone_info->zone_cache = vcalloc(zone_info->nr_zones,
+ sizeof(struct blk_zone));
if (!zone_info->zone_cache) {
btrfs_err_in_rcu(device->fs_info,
"zoned: failed to allocate zone cache for %s",
@@ -1583,19 +1586,9 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
return;
WARN_ON(cache->bytes_super != 0);
-
- /* Check for block groups never get activated */
- if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) &&
- cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) &&
- !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) &&
- cache->alloc_offset == 0) {
- unusable = cache->length;
- free = 0;
- } else {
- unusable = (cache->alloc_offset - cache->used) +
- (cache->length - cache->zone_capacity);
- free = cache->zone_capacity - cache->alloc_offset;
- }
+ unusable = (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
+ free = cache->zone_capacity - cache->alloc_offset;
/* We only need ->free_space in ALLOC_SEQ block groups */
cache->cached = BTRFS_CACHE_FINISHED;
@@ -1707,10 +1700,21 @@ void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
{
struct btrfs_inode *inode = BTRFS_I(ordered->inode);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_sum *sum =
- list_first_entry(&ordered->list, typeof(*sum), list);
- u64 logical = sum->logical;
- u64 len = sum->len;
+ struct btrfs_ordered_sum *sum;
+ u64 logical, len;
+
+ /*
+ * Write to pre-allocated region is for the data relocation, and so
+ * it should use WRITE operation. No split/rewrite are necessary.
+ */
+ if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
+ return;
+
+ ASSERT(!list_empty(&ordered->list));
+ /* The ordered->list can be empty in the above pre-alloc case. */
+ sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list);
+ logical = sum->logical;
+ len = sum->len;
while (len < ordered->disk_num_bytes) {
sum = list_next_entry(sum, list);
@@ -1747,41 +1751,121 @@ out:
}
}
-bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb,
- struct btrfs_block_group **cache_ret)
+static bool check_bg_is_active(struct btrfs_eb_write_context *ctx,
+ struct btrfs_block_group **active_bg)
{
- struct btrfs_block_group *cache;
- bool ret = true;
+ const struct writeback_control *wbc = ctx->wbc;
+ struct btrfs_block_group *block_group = ctx->zoned_bg;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
- if (!btrfs_is_zoned(fs_info))
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
return true;
- cache = btrfs_lookup_block_group(fs_info, eb->start);
- if (!cache)
- return true;
+ if (fs_info->treelog_bg == block_group->start) {
+ if (!btrfs_zone_activate(block_group)) {
+ int ret_fin = btrfs_zone_finish_one_bg(fs_info);
- if (cache->meta_write_pointer != eb->start) {
- btrfs_put_block_group(cache);
- cache = NULL;
- ret = false;
- } else {
- cache->meta_write_pointer = eb->start + eb->len;
- }
+ if (ret_fin != 1 || !btrfs_zone_activate(block_group))
+ return false;
+ }
+ } else if (*active_bg != block_group) {
+ struct btrfs_block_group *tgt = *active_bg;
- *cache_ret = cache;
+ /* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */
+ lockdep_assert_held(&fs_info->zoned_meta_io_lock);
- return ret;
+ if (tgt) {
+ /*
+ * If there is an unsent IO left in the allocated area,
+ * we cannot wait for them as it may cause a deadlock.
+ */
+ if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) {
+ if (wbc->sync_mode == WB_SYNC_NONE ||
+ (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync))
+ return false;
+ }
+
+ /* Pivot active metadata/system block group. */
+ btrfs_zoned_meta_io_unlock(fs_info);
+ wait_eb_writebacks(tgt);
+ do_zone_finish(tgt, true);
+ btrfs_zoned_meta_io_lock(fs_info);
+ if (*active_bg == tgt) {
+ btrfs_put_block_group(tgt);
+ *active_bg = NULL;
+ }
+ }
+ if (!btrfs_zone_activate(block_group))
+ return false;
+ if (*active_bg != block_group) {
+ ASSERT(*active_bg == NULL);
+ *active_bg = block_group;
+ btrfs_get_block_group(block_group);
+ }
+ }
+
+ return true;
}
-void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
- struct extent_buffer *eb)
+/*
+ * Check if @ctx->eb is aligned to the write pointer.
+ *
+ * Return:
+ * 0: @ctx->eb is at the write pointer. You can write it.
+ * -EAGAIN: There is a hole. The caller should handle the case.
+ * -EBUSY: There is a hole, but the caller can just bail out.
+ */
+int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+ struct btrfs_eb_write_context *ctx)
{
- if (!btrfs_is_zoned(eb->fs_info) || !cache)
- return;
+ const struct writeback_control *wbc = ctx->wbc;
+ const struct extent_buffer *eb = ctx->eb;
+ struct btrfs_block_group *block_group = ctx->zoned_bg;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ if (block_group) {
+ if (block_group->start > eb->start ||
+ block_group->start + block_group->length <= eb->start) {
+ btrfs_put_block_group(block_group);
+ block_group = NULL;
+ ctx->zoned_bg = NULL;
+ }
+ }
+
+ if (!block_group) {
+ block_group = btrfs_lookup_block_group(fs_info, eb->start);
+ if (!block_group)
+ return 0;
+ ctx->zoned_bg = block_group;
+ }
+
+ if (block_group->meta_write_pointer == eb->start) {
+ struct btrfs_block_group **tgt;
+
+ if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
+ return 0;
- ASSERT(cache->meta_write_pointer == eb->start + eb->len);
- cache->meta_write_pointer = eb->start;
+ if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ tgt = &fs_info->active_system_bg;
+ else
+ tgt = &fs_info->active_meta_bg;
+ if (check_bg_is_active(ctx, tgt))
+ return 0;
+ }
+
+ /*
+ * Since we may release fs_info->zoned_meta_io_lock, someone can already
+ * start writing this eb. In that case, we can just bail out.
+ */
+ if (block_group->meta_write_pointer > eb->start)
+ return -EBUSY;
+
+ /* If for_sync, this hole will be filled with trasnsaction commit. */
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
+ return -EAGAIN;
+ return -EBUSY;
}
int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
@@ -1879,10 +1963,10 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_space_info *space_info = block_group->space_info;
struct map_lookup *map;
struct btrfs_device *device;
u64 physical;
+ const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
bool ret;
int i;
@@ -1891,7 +1975,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
map = block_group->physical_map;
- spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
ret = true;
@@ -1904,30 +1987,44 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
+ spin_lock(&fs_info->zone_active_bgs_lock);
for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_zoned_device_info *zinfo;
+ int reserved = 0;
+
device = map->stripes[i].dev;
physical = map->stripes[i].physical;
+ zinfo = device->zone_info;
- if (device->zone_info->max_active_zones == 0)
+ if (zinfo->max_active_zones == 0)
continue;
+ if (is_data)
+ reserved = zinfo->reserved_active_zones;
+ /*
+ * For the data block group, leave active zones for one
+ * metadata block group and one system block group.
+ */
+ if (atomic_read(&zinfo->active_zones_left) <= reserved) {
+ ret = false;
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ goto out_unlock;
+ }
+
if (!btrfs_dev_set_active_zone(device, physical)) {
/* Cannot activate the zone */
ret = false;
+ spin_unlock(&fs_info->zone_active_bgs_lock);
goto out_unlock;
}
+ if (!is_data)
+ zinfo->reserved_active_zones--;
}
+ spin_unlock(&fs_info->zone_active_bgs_lock);
/* Successfully activated all the zones */
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
- WARN_ON(block_group->alloc_offset != 0);
- if (block_group->zone_unusable == block_group->length) {
- block_group->zone_unusable = block_group->length - block_group->zone_capacity;
- space_info->bytes_zone_unusable -= block_group->zone_capacity;
- }
spin_unlock(&block_group->lock);
- btrfs_try_granting_tickets(fs_info, space_info);
- spin_unlock(&space_info->lock);
/* For the active block group list */
btrfs_get_block_group(block_group);
@@ -1940,7 +2037,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
out_unlock:
spin_unlock(&block_group->lock);
- spin_unlock(&space_info->lock);
return ret;
}
@@ -2006,6 +2102,10 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
* and block_group->meta_write_pointer for metadata.
*/
if (!fully_written) {
+ if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
+ spin_unlock(&block_group->lock);
+ return -EAGAIN;
+ }
spin_unlock(&block_group->lock);
ret = btrfs_inc_block_group_ro(block_group, false);
@@ -2034,7 +2134,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
return 0;
}
- if (block_group->reserved) {
+ if (block_group->reserved ||
+ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+ &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
btrfs_dec_block_group_ro(block_group);
return -EAGAIN;
@@ -2043,6 +2145,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
block_group->alloc_offset = block_group->zone_capacity;
+ if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
+ block_group->meta_write_pointer = block_group->start +
+ block_group->zone_capacity;
block_group->free_space_ctl->free_space = 0;
btrfs_clear_treelog_bg(block_group);
btrfs_clear_data_reloc_bg(block_group);
@@ -2052,18 +2157,21 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
const u64 physical = map->stripes[i].physical;
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
- if (device->zone_info->max_active_zones == 0)
+ if (zinfo->max_active_zones == 0)
continue;
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
- device->zone_info->zone_size >> SECTOR_SHIFT,
+ zinfo->zone_size >> SECTOR_SHIFT,
GFP_NOFS);
if (ret)
return ret;
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ zinfo->reserved_active_zones++;
btrfs_dev_clear_active_zone(device, physical);
}
@@ -2102,8 +2210,10 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
/* Check if there is a device with active zones left */
mutex_lock(&fs_info->chunk_mutex);
+ spin_lock(&fs_info->zone_active_bgs_lock);
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ int reserved = 0;
if (!device->bdev)
continue;
@@ -2113,17 +2223,21 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
break;
}
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ reserved = zinfo->reserved_active_zones;
+
switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
case 0: /* single */
- ret = (atomic_read(&zinfo->active_zones_left) >= 1);
+ ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved));
break;
case BTRFS_BLOCK_GROUP_DUP:
- ret = (atomic_read(&zinfo->active_zones_left) >= 2);
+ ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved));
break;
}
if (ret)
break;
}
+ spin_unlock(&fs_info->zone_active_bgs_lock);
mutex_unlock(&fs_info->chunk_mutex);
if (!ret)
@@ -2265,7 +2379,10 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica
/* All relocation extents are written. */
if (block_group->start + block_group->alloc_offset == logical + length) {
- /* Now, release this block group for further allocations. */
+ /*
+ * Now, release this block group for further allocations and
+ * zone finish.
+ */
clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
&block_group->runtime_flags);
}
@@ -2289,7 +2406,8 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->alloc_offset == 0 ||
- (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
+ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
continue;
}
@@ -2365,3 +2483,55 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
return 0;
}
+
+/*
+ * Reserve zones for one metadata block group, one tree-log block group, and one
+ * system block group.
+ */
+void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_block_group *block_group;
+ struct btrfs_device *device;
+ /* Reserve zones for normal SINGLE metadata and tree-log block group. */
+ unsigned int metadata_reserve = 2;
+ /* Reserve a zone for SINGLE system block group. */
+ unsigned int system_reserve = 1;
+
+ if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
+ return;
+
+ /*
+ * This function is called from the mount context. So, there is no
+ * parallel process touching the bits. No need for read_seqretry().
+ */
+ if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
+ metadata_reserve = 4;
+ if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
+ system_reserve = 2;
+
+ /* Apply the reservation on all the devices. */
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->bdev)
+ continue;
+
+ device->zone_info->reserved_active_zones =
+ metadata_reserve + system_reserve;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ /* Release reservation for currently active block groups. */
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
+ struct map_lookup *map = block_group->physical_map;
+
+ if (!(block_group->flags &
+ (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
+ continue;
+
+ for (int i = 0; i < map->num_stripes; i++)
+ map->stripes[i].dev->zone_info->reserved_active_zones--;
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+}