summaryrefslogtreecommitdiff
path: root/fs/btrfs/scrub.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r--fs/btrfs/scrub.c243
1 files changed, 142 insertions, 101 deletions
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4cae41bd6de0..b877203f1dc5 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -43,9 +43,20 @@ struct scrub_ctx;
/*
* The following value only influences the performance.
*
- * This determines the batch size for stripe submitted in one go.
+ * This detemines how many stripes would be submitted in one go,
+ * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
*/
-#define SCRUB_STRIPES_PER_SCTX 8 /* That would be 8 64K stripe per-device. */
+#define SCRUB_STRIPES_PER_GROUP 8
+
+/*
+ * How many groups we have for each sctx.
+ *
+ * This would be 8M per device, the same value as the old scrub in-flight bios
+ * size limit.
+ */
+#define SCRUB_GROUPS_PER_SCTX 16
+
+#define SCRUB_TOTAL_STRIPES (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP)
/*
* The following value times PAGE_SIZE needs to be large enough to match the
@@ -172,9 +183,11 @@ struct scrub_stripe {
};
struct scrub_ctx {
- struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX];
+ struct scrub_stripe stripes[SCRUB_TOTAL_STRIPES];
struct scrub_stripe *raid56_data_stripes;
struct btrfs_fs_info *fs_info;
+ struct btrfs_path extent_path;
+ struct btrfs_path csum_path;
int first_free;
int cur_stripe;
atomic_t cancel_req;
@@ -315,10 +328,10 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
if (!sctx)
return;
- for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++)
+ for (i = 0; i < SCRUB_TOTAL_STRIPES; i++)
release_scrub_stripe(&sctx->stripes[i]);
- kfree(sctx);
+ kvfree(sctx);
}
static void scrub_put_ctx(struct scrub_ctx *sctx)
@@ -333,13 +346,20 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
struct scrub_ctx *sctx;
int i;
- sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
+ /* Since sctx has inline 128 stripes, it can go beyond 64K easily. Use
+ * kvzalloc().
+ */
+ sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL);
if (!sctx)
goto nomem;
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
sctx->fs_info = fs_info;
- for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) {
+ sctx->extent_path.search_commit_root = 1;
+ sctx->extent_path.skip_locking = 1;
+ sctx->csum_path.search_commit_root = 1;
+ sctx->csum_path.skip_locking = 1;
+ for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) {
int ret;
ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
@@ -605,7 +625,8 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
btrfs_stack_header_bytenr(header), logical);
return;
}
- if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) {
+ if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) != 0) {
bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
@@ -969,6 +990,9 @@ skip:
spin_unlock(&sctx->stat_lock);
}
+static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
+ unsigned long write_bitmap, bool dev_replace);
+
/*
* The main entrance for all read related scrub work, including:
*
@@ -977,13 +1001,16 @@ skip:
* - Go through the remaining mirrors and try to read as large blocksize as
* possible
* - Go through all mirrors (including the failed mirror) sector-by-sector
+ * - Submit writeback for repaired sectors
*
- * Writeback does not happen here, it needs extra synchronization.
+ * Writeback for dev-replace does not happen here, it needs extra
+ * synchronization for zoned devices.
*/
static void scrub_stripe_read_repair_worker(struct work_struct *work)
{
struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
- struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ struct scrub_ctx *sctx = stripe->sctx;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
stripe->bg->length);
int mirror;
@@ -1048,7 +1075,23 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
goto out;
}
out:
- scrub_stripe_report_errors(stripe->sctx, stripe);
+ /*
+ * Submit the repaired sectors. For zoned case, we cannot do repair
+ * in-place, but queue the bg to be relocated.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+ btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start);
+ } else if (!sctx->readonly) {
+ unsigned long repaired;
+
+ bitmap_andnot(&repaired, &stripe->init_error_bitmap,
+ &stripe->error_bitmap, stripe->nr_sectors);
+ scrub_write_sectors(sctx, stripe, repaired, false);
+ wait_scrub_stripe_io(stripe);
+ }
+
+ scrub_stripe_report_errors(sctx, stripe);
set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
wake_up(&stripe->repair_wait);
}
@@ -1261,7 +1304,6 @@ static int get_raid56_logic_offset(u64 physical, int num,
/* Work out the disk rotation on this stripe-set */
rot = stripe_nr % map->num_stripes;
- stripe_nr /= map->num_stripes;
/* calculate which stripe this data locates */
rot += i;
stripe_index = rot % map->num_stripes;
@@ -1467,6 +1509,8 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
* Return <0 for error.
*/
static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
+ struct btrfs_path *extent_path,
+ struct btrfs_path *csum_path,
struct btrfs_device *dev, u64 physical,
int mirror_num, u64 logical_start,
u32 logical_len,
@@ -1476,7 +1520,6 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
const u64 logical_end = logical_start + logical_len;
- struct btrfs_path path = { 0 };
u64 cur_logical = logical_start;
u64 stripe_end;
u64 extent_start;
@@ -1492,14 +1535,13 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
/* The range must be inside the bg. */
ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
- path.search_commit_root = 1;
- path.skip_locking = 1;
-
- ret = find_first_extent_item(extent_root, &path, logical_start, logical_len);
+ ret = find_first_extent_item(extent_root, extent_path, logical_start,
+ logical_len);
/* Either error or not found. */
if (ret)
goto out;
- get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen);
+ get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags,
+ &extent_gen);
if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
stripe->nr_meta_extents++;
if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
@@ -1527,7 +1569,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
/* Fill the extent info for the remaining sectors. */
while (cur_logical <= stripe_end) {
- ret = find_first_extent_item(extent_root, &path, cur_logical,
+ ret = find_first_extent_item(extent_root, extent_path, cur_logical,
stripe_end - cur_logical + 1);
if (ret < 0)
goto out;
@@ -1535,7 +1577,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
ret = 0;
break;
}
- get_extent_info(&path, &extent_start, &extent_len,
+ get_extent_info(extent_path, &extent_start, &extent_len,
&extent_flags, &extent_gen);
if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
stripe->nr_meta_extents++;
@@ -1560,9 +1602,9 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
*/
ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
- ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical,
- stripe_end, stripe->csums,
- &csum_bitmap, true);
+ ret = btrfs_lookup_csums_bitmap(csum_root, csum_path,
+ stripe->logical, stripe_end,
+ stripe->csums, &csum_bitmap);
if (ret < 0)
goto out;
if (ret > 0)
@@ -1575,7 +1617,6 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
}
set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
out:
- btrfs_release_path(&path);
return ret;
}
@@ -1653,6 +1694,28 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
return false;
}
+static void submit_initial_group_read(struct scrub_ctx *sctx,
+ unsigned int first_slot,
+ unsigned int nr_stripes)
+{
+ struct blk_plug plug;
+
+ ASSERT(first_slot < SCRUB_TOTAL_STRIPES);
+ ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES);
+
+ scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
+ btrfs_stripe_nr_to_offset(nr_stripes));
+ blk_start_plug(&plug);
+ for (int i = 0; i < nr_stripes; i++) {
+ struct scrub_stripe *stripe = &sctx->stripes[first_slot + i];
+
+ /* Those stripes should be initialized. */
+ ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
+ scrub_submit_initial_read(sctx, stripe);
+ }
+ blk_finish_plug(&plug);
+}
+
static int flush_scrub_stripes(struct scrub_ctx *sctx)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
@@ -1665,11 +1728,11 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));
- scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
- btrfs_stripe_nr_to_offset(nr_stripes));
- for (int i = 0; i < nr_stripes; i++) {
- stripe = &sctx->stripes[i];
- scrub_submit_initial_read(sctx, stripe);
+ /* Submit the stripes which are populated but not submitted. */
+ if (nr_stripes % SCRUB_STRIPES_PER_GROUP) {
+ const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP);
+
+ submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot);
}
for (int i = 0; i < nr_stripes; i++) {
@@ -1679,32 +1742,6 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
}
- /*
- * Submit the repaired sectors. For zoned case, we cannot do repair
- * in-place, but queue the bg to be relocated.
- */
- if (btrfs_is_zoned(fs_info)) {
- for (int i = 0; i < nr_stripes; i++) {
- stripe = &sctx->stripes[i];
-
- if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) {
- btrfs_repair_one_zone(fs_info,
- sctx->stripes[0].bg->start);
- break;
- }
- }
- } else if (!sctx->readonly) {
- for (int i = 0; i < nr_stripes; i++) {
- unsigned long repaired;
-
- stripe = &sctx->stripes[i];
-
- bitmap_andnot(&repaired, &stripe->init_error_bitmap,
- &stripe->error_bitmap, stripe->nr_sectors);
- scrub_write_sectors(sctx, stripe, repaired, false);
- }
- }
-
/* Submit for dev-replace. */
if (sctx->is_dev_replace) {
/*
@@ -1749,28 +1786,40 @@ static void raid56_scrub_wait_endio(struct bio *bio)
static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
struct btrfs_device *dev, int mirror_num,
- u64 logical, u32 length, u64 physical)
+ u64 logical, u32 length, u64 physical,
+ u64 *found_logical_ret)
{
struct scrub_stripe *stripe;
int ret;
- /* No available slot, submit all stripes and wait for them. */
- if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) {
- ret = flush_scrub_stripes(sctx);
- if (ret < 0)
- return ret;
- }
+ /*
+ * There should always be one slot left, as caller filling the last
+ * slot should flush them all.
+ */
+ ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);
stripe = &sctx->stripes[sctx->cur_stripe];
-
- /* We can queue one stripe using the remaining slot. */
scrub_reset_stripe(stripe);
- ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num,
- logical, length, stripe);
+ ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
+ &sctx->csum_path, dev, physical,
+ mirror_num, logical, length, stripe);
/* Either >0 as no more extents or <0 for error. */
if (ret)
return ret;
+ if (found_logical_ret)
+ *found_logical_ret = stripe->logical;
sctx->cur_stripe++;
+
+ /* We filled one group, submit it. */
+ if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) {
+ const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP;
+
+ submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP);
+ }
+
+ /* Last slot used, flush them all. */
+ if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES)
+ return flush_scrub_stripes(sctx);
return 0;
}
@@ -1784,6 +1833,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_raid_bio *rbio;
struct btrfs_io_context *bioc = NULL;
+ struct btrfs_path extent_path = { 0 };
+ struct btrfs_path csum_path = { 0 };
struct bio *bio;
struct scrub_stripe *stripe;
bool all_empty = true;
@@ -1794,6 +1845,16 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
ASSERT(sctx->raid56_data_stripes);
+ /*
+ * For data stripe search, we cannot re-use the same extent/csum paths,
+ * as the data stripe bytenr may be smaller than previous extent. Thus
+ * we have to use our own extent/csum paths.
+ */
+ extent_path.search_commit_root = 1;
+ extent_path.skip_locking = 1;
+ csum_path.search_commit_root = 1;
+ csum_path.skip_locking = 1;
+
for (int i = 0; i < data_stripes; i++) {
int stripe_index;
int rot;
@@ -1808,7 +1869,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
scrub_reset_stripe(stripe);
set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
- ret = scrub_find_fill_first_stripe(bg,
+ ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path,
map->stripes[stripe_index].dev, physical, 1,
full_stripe_start + btrfs_stripe_nr_to_offset(i),
BTRFS_STRIPE_LEN, stripe);
@@ -1853,24 +1914,6 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
/* For now, no zoned support for RAID56. */
ASSERT(!btrfs_is_zoned(sctx->fs_info));
- /* Writeback for the repaired sectors. */
- for (int i = 0; i < data_stripes; i++) {
- unsigned long repaired;
-
- stripe = &sctx->raid56_data_stripes[i];
-
- bitmap_andnot(&repaired, &stripe->init_error_bitmap,
- &stripe->error_bitmap, stripe->nr_sectors);
- scrub_write_sectors(sctx, stripe, repaired, false);
- }
-
- /* Wait for the above writebacks to finish. */
- for (int i = 0; i < data_stripes; i++) {
- stripe = &sctx->raid56_data_stripes[i];
-
- wait_scrub_stripe_io(stripe);
- }
-
/*
* Now all data stripes are properly verified. Check if we have any
* unrepaired, if so abort immediately or we could further corrupt the
@@ -1936,6 +1979,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
bio_put(bio);
btrfs_bio_counter_dec(fs_info);
+ btrfs_release_path(&extent_path);
+ btrfs_release_path(&csum_path);
out:
return ret;
}
@@ -1957,18 +2002,15 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
const u64 logical_end = logical_start + logical_length;
- /* An artificial limit, inherit from old scrub behavior */
- struct btrfs_path path = { 0 };
u64 cur_logical = logical_start;
int ret;
/* The range must be inside the bg */
ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
- path.search_commit_root = 1;
- path.skip_locking = 1;
/* Go through each extent items inside the logical range */
while (cur_logical < logical_end) {
+ u64 found_logical;
u64 cur_physical = physical + cur_logical - logical_start;
/* Canceled? */
@@ -1993,7 +2035,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
cur_logical, logical_end - cur_logical,
- cur_physical);
+ cur_physical, &found_logical);
if (ret > 0) {
/* No more extent, just update the accounting */
sctx->stat.last_physical = physical + logical_length;
@@ -2003,14 +2045,11 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
if (ret < 0)
break;
- ASSERT(sctx->cur_stripe > 0);
- cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical
- + BTRFS_STRIPE_LEN;
+ cur_logical = found_logical + BTRFS_STRIPE_LEN;
/* Don't hold CPU for too long time */
cond_resched();
}
- btrfs_release_path(&path);
return ret;
}
@@ -2108,6 +2147,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 stripe_logical;
int stop_loop = 0;
+ /* Extent_path should be released by now. */
+ ASSERT(sctx->extent_path.nodes[0] == NULL);
+
scrub_blocked_if_needed(fs_info);
if (sctx->is_dev_replace &&
@@ -2226,6 +2268,9 @@ out:
ret2 = flush_scrub_stripes(sctx);
if (!ret)
ret = ret2;
+ btrfs_release_path(&sctx->extent_path);
+ btrfs_release_path(&sctx->csum_path);
+
if (sctx->raid56_data_stripes) {
for (int i = 0; i < nr_data_stripes(map); i++)
release_scrub_stripe(&sctx->raid56_data_stripes[i]);
@@ -2710,8 +2755,7 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info)
/*
* get a reference count on fs_info->scrub_workers. start worker if necessary
*/
-static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
- int is_dev_replace)
+static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
{
struct workqueue_struct *scrub_workers = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
@@ -2721,10 +2765,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
return 0;
- if (is_dev_replace)
- scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags);
- else
- scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
+ scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
if (!scrub_workers)
return -ENOMEM;
@@ -2776,7 +2817,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (IS_ERR(sctx))
return PTR_ERR(sctx);
- ret = scrub_workers_get(fs_info, is_dev_replace);
+ ret = scrub_workers_get(fs_info);
if (ret)
goto out_free_ctx;