diff options
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r-- | fs/btrfs/volumes.c | 593 |
1 files changed, 262 insertions, 331 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c6d592870400..03f52e4a20aa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -395,7 +395,6 @@ void btrfs_free_device(struct btrfs_device *device) { WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); - extent_io_tree_release(&device->alloc_state); btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -1150,10 +1149,10 @@ static void btrfs_close_one_device(struct btrfs_device *device) device->last_flush_error = 0; /* Verify the device is back in a pristine state */ - ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); - ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); - ASSERT(list_empty(&device->dev_alloc_list)); - ASSERT(list_empty(&device->post_commit_list)); + WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); + WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); + WARN_ON(!list_empty(&device->dev_alloc_list)); + WARN_ON(!list_empty(&device->post_commit_list)); } static void close_fs_devices(struct btrfs_fs_devices *fs_devices) @@ -2618,7 +2617,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct block_device *bdev; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct btrfs_fs_devices *seed_devices; + struct btrfs_fs_devices *seed_devices = NULL; u64 orig_super_total_bytes; u64 orig_super_num_devices; int ret = 0; @@ -5125,7 +5124,7 @@ static void init_alloc_chunk_ctl_policy_regular( /* We don't want a chunk larger than 10% of writable space */ ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), ctl->max_chunk_size); - ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; + ctl->dev_extent_min = ctl->dev_stripes << BTRFS_STRIPE_LEN_SHIFT; } static void init_alloc_chunk_ctl_policy_zoned( @@ -5407,7 +5406,6 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, j * ctl->stripe_size; } } - map->stripe_len = BTRFS_STRIPE_LEN; map->io_align = BTRFS_STRIPE_LEN; map->io_width = BTRFS_STRIPE_LEN; map->type = type; @@ -5438,7 +5436,7 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, } write_unlock(&em_tree->lock); - block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); + block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); if (IS_ERR(block_group)) goto error_del_extent; @@ -5615,11 +5613,11 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, btrfs_set_stack_chunk_length(chunk, bg->length); btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); - btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); + btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN); btrfs_set_stack_chunk_type(chunk, map->type); btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); - btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); - btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); + btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN); + btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN); btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); @@ -5784,13 +5782,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) */ ret = map->num_stripes; free_extent_map(em); - - down_read(&fs_info->dev_replace.rwsem); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && - fs_info->dev_replace.tgtdev) - ret++; - up_read(&fs_info->dev_replace.rwsem); - return ret; } @@ -5809,7 +5800,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, if (!WARN_ON(IS_ERR(em))) { map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - len = map->stripe_len * nr_data_stripes(map); + len = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; free_extent_map(em); } return len; @@ -5895,41 +5886,16 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, return preferred_mirror; } -/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ -static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) -{ - int i; - int again = 1; - - while (again) { - again = 0; - for (i = 0; i < num_stripes - 1; i++) { - /* Swap if parity is on a smaller index */ - if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { - swap(bioc->stripes[i], bioc->stripes[i + 1]); - swap(bioc->raid_map[i], bioc->raid_map[i + 1]); - again = 1; - } - } - } -} - static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, - int total_stripes, - int real_stripes) + u16 total_stripes) { - struct btrfs_io_context *bioc = kzalloc( + struct btrfs_io_context *bioc; + + bioc = kzalloc( /* The size of btrfs_io_context */ sizeof(struct btrfs_io_context) + /* Plus the variable array for the stripes */ - sizeof(struct btrfs_io_stripe) * (total_stripes) + - /* Plus the variable array for the tgt dev */ - sizeof(int) * (real_stripes) + - /* - * Plus the raid_map, which includes both the tgt dev - * and the stripes. - */ - sizeof(u64) * (total_stripes), + sizeof(struct btrfs_io_stripe) * (total_stripes), GFP_NOFS); if (!bioc) @@ -5938,8 +5904,8 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_ refcount_set(&bioc->refs, 1); bioc->fs_info = fs_info; - bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); - bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); + bioc->replace_stripe_src = -1; + bioc->full_stripe_logical = (u64)-1; return bioc; } @@ -5971,16 +5937,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, struct btrfs_discard_stripe *stripes; u64 length = *length_ret; u64 offset; - u64 stripe_nr; - u64 stripe_nr_end; + u32 stripe_nr; + u32 stripe_nr_end; + u32 stripe_cnt; u64 stripe_end_offset; - u64 stripe_cnt; - u64 stripe_len; u64 stripe_offset; u32 stripe_index; u32 factor = 0; u32 sub_stripes = 0; - u64 stripes_per_dev = 0; + u32 stripes_per_dev = 0; u32 remaining_stripes = 0; u32 last_stripe = 0; int ret; @@ -5996,26 +5961,25 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; goto out_free_map; -} + } offset = logical - em->start; length = min_t(u64, em->start + em->len - logical, length); *length_ret = length; - stripe_len = map->stripe_len; /* * stripe_nr counts the total number of stripes we have to stride * to get to this block */ - stripe_nr = div64_u64(offset, stripe_len); + stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; /* stripe_offset is the offset of this block in its stripe */ - stripe_offset = offset - stripe_nr * stripe_len; + stripe_offset = offset - (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); - stripe_nr_end = round_up(offset + length, map->stripe_len); - stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); + stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >> + BTRFS_STRIPE_LEN_SHIFT; stripe_cnt = stripe_nr_end - stripe_nr; - stripe_end_offset = stripe_nr_end * map->stripe_len - + stripe_end_offset = (stripe_nr_end << BTRFS_STRIPE_LEN_SHIFT) - (offset + length); /* * after this, stripe_nr is the number of stripes on this @@ -6034,18 +5998,19 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, factor = map->num_stripes / sub_stripes; *num_stripes = min_t(u64, map->num_stripes, sub_stripes * stripe_cnt); - stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + stripe_index = stripe_nr % factor; + stripe_nr /= factor; stripe_index *= sub_stripes; - stripes_per_dev = div_u64_rem(stripe_cnt, factor, - &remaining_stripes); - div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); - last_stripe *= sub_stripes; + + remaining_stripes = stripe_cnt % factor; + stripes_per_dev = stripe_cnt / factor; + last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes; } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { *num_stripes = map->num_stripes; } else { - stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, - &stripe_index); + stripe_index = stripe_nr % map->num_stripes; + stripe_nr /= map->num_stripes; } stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); @@ -6057,15 +6022,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, for (i = 0; i < *num_stripes; i++) { stripes[i].physical = map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; + stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - stripes[i].length = stripes_per_dev * map->stripe_len; + stripes[i].length = stripes_per_dev << BTRFS_STRIPE_LEN_SHIFT; if (i / sub_stripes < remaining_stripes) - stripes[i].length += map->stripe_len; + stripes[i].length += BTRFS_STRIPE_LEN; /* * Special for the first stripe and @@ -6103,83 +6068,6 @@ out_free_map: return ERR_PTR(ret); } -/* - * In dev-replace case, for repair case (that's the only case where the mirror - * is selected explicitly when calling btrfs_map_block), blocks left of the - * left cursor can also be read from the target drive. - * - * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the - * array of stripes. - * For READ, it also needs to be supported using the same mirror number. - * - * If the requested block is not left of the left cursor, EIO is returned. This - * can happen because btrfs_num_copies() returns one more in the dev-replace - * case. - */ -static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, - u64 logical, u64 length, - u64 srcdev_devid, int *mirror_num, - u64 *physical) -{ - struct btrfs_io_context *bioc = NULL; - int num_stripes; - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; - int i; - int ret = 0; - - ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &length, &bioc, NULL, NULL, 0); - if (ret) { - ASSERT(bioc == NULL); - return ret; - } - - num_stripes = bioc->num_stripes; - if (*mirror_num > num_stripes) { - /* - * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, - * that means that the requested area is not left of the left - * cursor - */ - btrfs_put_bioc(bioc); - return -EIO; - } - - /* - * process the rest of the function using the mirror_num of the source - * drive. Therefore look it up first. At the end, patch the device - * pointer to the one of the target drive. - */ - for (i = 0; i < num_stripes; i++) { - if (bioc->stripes[i].dev->devid != srcdev_devid) - continue; - - /* - * In case of DUP, in order to keep it simple, only add the - * mirror with the lowest physical address - */ - if (found && - physical_of_found <= bioc->stripes[i].physical) - continue; - - index_srcdev = i; - found = 1; - physical_of_found = bioc->stripes[i].physical; - } - - btrfs_put_bioc(bioc); - - ASSERT(found); - if (!found) - return -EIO; - - *mirror_num = index_srcdev + 1; - *physical = physical_of_found; - return ret; -} - static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) { struct btrfs_block_group *cache; @@ -6198,101 +6086,80 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) } static void handle_ops_on_dev_replace(enum btrfs_map_op op, - struct btrfs_io_context **bioc_ret, + struct btrfs_io_context *bioc, struct btrfs_dev_replace *dev_replace, u64 logical, int *num_stripes_ret, int *max_errors_ret) { - struct btrfs_io_context *bioc = *bioc_ret; u64 srcdev_devid = dev_replace->srcdev->devid; - int tgtdev_indexes = 0; + /* + * At this stage, num_stripes is still the real number of stripes, + * excluding the duplicated stripes. + */ int num_stripes = *num_stripes_ret; + int nr_extra_stripes = 0; int max_errors = *max_errors_ret; int i; - if (op == BTRFS_MAP_WRITE) { - int index_where_to_add; + /* + * A block group which has "to_copy" set will eventually be copied by + * the dev-replace process. We can avoid cloning IO here. + */ + if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) + return; - /* - * A block group which have "to_copy" set will eventually - * copied by dev-replace process. We can avoid cloning IO here. - */ - if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) - return; + /* + * Duplicate the write operations while the dev-replace procedure is + * running. Since the copying of the old disk to the new disk takes + * place at run time while the filesystem is mounted writable, the + * regular write operations to the old disk have to be duplicated to go + * to the new disk as well. + * + * Note that device->missing is handled by the caller, and that the + * write to the old disk is already set up in the stripes array. + */ + for (i = 0; i < num_stripes; i++) { + struct btrfs_io_stripe *old = &bioc->stripes[i]; + struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes]; - /* - * duplicate the write operations while the dev replace - * procedure is running. Since the copying of the old disk to - * the new disk takes place at run time while the filesystem is - * mounted writable, the regular write operations to the old - * disk have to be duplicated to go to the new disk as well. - * - * Note that device->missing is handled by the caller, and that - * the write to the old disk is already set up in the stripes - * array. - */ - index_where_to_add = num_stripes; - for (i = 0; i < num_stripes; i++) { - if (bioc->stripes[i].dev->devid == srcdev_devid) { - /* write to new disk, too */ - struct btrfs_io_stripe *new = - bioc->stripes + index_where_to_add; - struct btrfs_io_stripe *old = - bioc->stripes + i; - - new->physical = old->physical; - new->dev = dev_replace->tgtdev; - bioc->tgtdev_map[i] = index_where_to_add; - index_where_to_add++; - max_errors++; - tgtdev_indexes++; - } - } - num_stripes = index_where_to_add; - } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; + if (old->dev->devid != srcdev_devid) + continue; - /* - * During the dev-replace procedure, the target drive can also - * be used to read data in case it is needed to repair a corrupt - * block elsewhere. This is possible if the requested area is - * left of the left cursor. In this area, the target drive is a - * full copy of the source drive. - */ - for (i = 0; i < num_stripes; i++) { - if (bioc->stripes[i].dev->devid == srcdev_devid) { - /* - * In case of DUP, in order to keep it simple, - * only add the mirror with the lowest physical - * address - */ - if (found && - physical_of_found <= bioc->stripes[i].physical) - continue; - index_srcdev = i; - found = 1; - physical_of_found = bioc->stripes[i].physical; - } - } - if (found) { - struct btrfs_io_stripe *tgtdev_stripe = - bioc->stripes + num_stripes; + new->physical = old->physical; + new->dev = dev_replace->tgtdev; + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) + bioc->replace_stripe_src = i; + nr_extra_stripes++; + } + + /* We can only have at most 2 extra nr_stripes (for DUP). */ + ASSERT(nr_extra_stripes <= 2); + /* + * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for + * replace. + * If we have 2 extra stripes, only choose the one with smaller physical. + */ + if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) { + struct btrfs_io_stripe *first = &bioc->stripes[num_stripes]; + struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->dev = dev_replace->tgtdev; - bioc->tgtdev_map[index_srcdev] = num_stripes; + /* Only DUP can have two extra stripes. */ + ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP); - tgtdev_indexes++; - num_stripes++; + /* + * Swap the last stripe stripes and reduce @nr_extra_stripes. + * The extra stripe would still be there, but won't be accessed. + */ + if (first->physical > second->physical) { + swap(second->physical, first->physical); + swap(second->dev, first->dev); + nr_extra_stripes--; } } - *num_stripes_ret = num_stripes; - *max_errors_ret = max_errors; - bioc->num_tgtdevs = tgtdev_indexes; - *bioc_ret = bioc; + *num_stripes_ret = num_stripes + nr_extra_stripes; + *max_errors_ret = max_errors + nr_extra_stripes; + bioc->replace_nr_stripes = nr_extra_stripes; } static bool need_full_stripe(enum btrfs_map_op op) @@ -6301,25 +6168,35 @@ static bool need_full_stripe(enum btrfs_map_op op) } static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, - u64 offset, u64 *stripe_nr, u64 *stripe_offset, + u64 offset, u32 *stripe_nr, u64 *stripe_offset, u64 *full_stripe_start) { - u32 stripe_len = map->stripe_len; - ASSERT(op != BTRFS_MAP_DISCARD); /* * Stripe_nr is the stripe where this block falls. stripe_offset is * the offset of this block in its stripe. */ - *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); + *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; + *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; ASSERT(*stripe_offset < U32_MAX); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); + unsigned long full_stripe_len = nr_data_stripes(map) << + BTRFS_STRIPE_LEN_SHIFT; + /* + * For full stripe start, we use previously calculated + * @stripe_nr. Align it to nr_data_stripes, then multiply with + * STRIPE_LEN. + * + * By this we can avoid u64 division completely. And we have + * to go rounddown(), not round_down(), as nr_data_stripes is + * not ensured to be power of 2. + */ *full_stripe_start = - div64_u64(offset, full_stripe_len) * full_stripe_len; + rounddown(*stripe_nr, nr_data_stripes(map)) << + BTRFS_STRIPE_LEN_SHIFT; /* * For writes to RAID56, allow to write a full stripe set, but @@ -6334,16 +6211,16 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, * a single disk). */ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) - return stripe_len - *stripe_offset; + return BTRFS_STRIPE_LEN - *stripe_offset; return U64_MAX; } static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, - u32 stripe_index, u64 stripe_offset, u64 stripe_nr) + u32 stripe_index, u64 stripe_offset, u32 stripe_nr) { dst->dev = map->stripes[stripe_index].dev; dst->physical = map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; + stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); } int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, @@ -6356,35 +6233,35 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct map_lookup *map; u64 map_offset; u64 stripe_offset; - u64 stripe_nr; - u64 stripe_len; + u32 stripe_nr; u32 stripe_index; int data_stripes; int i; int ret = 0; int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0); int num_stripes; + int num_copies; int max_errors = 0; - int tgtdev_indexes = 0; struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; - int num_alloc_stripes; - int patch_the_first_stripe_for_dev_replace = 0; - u64 physical_to_patch_in_first_stripe = 0; + u16 num_alloc_stripes; u64 raid56_full_stripe_start = (u64)-1; u64 max_len; ASSERT(bioc_ret); ASSERT(op != BTRFS_MAP_DISCARD); + num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); + if (mirror_num > num_copies) + return -EINVAL; + em = btrfs_get_chunk_map(fs_info, logical, *length); if (IS_ERR(em)) return PTR_ERR(em); map = em->map_lookup; data_stripes = nr_data_stripes(map); - stripe_len = map->stripe_len; map_offset = logical - em->start; max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, @@ -6400,25 +6277,11 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (!dev_replace_is_ongoing) up_read(&dev_replace->rwsem); - if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && - !need_full_stripe(op) && dev_replace->tgtdev != NULL) { - ret = get_extra_mirror_from_replace(fs_info, logical, *length, - dev_replace->srcdev->devid, - &mirror_num, - &physical_to_patch_in_first_stripe); - if (ret) - goto out; - else - patch_the_first_stripe_for_dev_replace = 1; - } else if (mirror_num > map->num_stripes) { - mirror_num = 0; - } - num_stripes = 1; stripe_index = 0; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, - &stripe_index); + stripe_index = stripe_nr % map->num_stripes; + stripe_nr /= map->num_stripes; if (!need_full_stripe(op)) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { @@ -6444,8 +6307,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { u32 factor = map->num_stripes / map->sub_stripes; - stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); - stripe_index *= map->sub_stripes; + stripe_index = (stripe_nr % factor) * map->sub_stripes; + stripe_nr /= factor; if (need_full_stripe(op)) num_stripes = map->sub_stripes; @@ -6460,11 +6323,17 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ASSERT(map->stripe_len == BTRFS_STRIPE_LEN); if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { - /* push stripe_nr back to the start of the full stripe */ - stripe_nr = div64_u64(raid56_full_stripe_start, - stripe_len * data_stripes); + /* + * Push stripe_nr back to the start of the full stripe + * For those cases needing a full stripe, @stripe_nr + * is the full stripe number. + * + * Originally we go raid56_full_stripe_start / full_stripe_len, + * but that can be expensive. Here we just divide + * @stripe_nr with @data_stripes. + */ + stripe_nr /= data_stripes; /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; @@ -6473,7 +6342,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* Return the length to the full stripe end */ *length = min(logical + *length, raid56_full_stripe_start + em->start + - data_stripes * stripe_len) - logical; + (data_stripes << BTRFS_STRIPE_LEN_SHIFT)) - logical; stripe_index = 0; stripe_offset = 0; } else { @@ -6482,25 +6351,24 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * Mirror #2 is RAID5 parity block. * Mirror #3 is RAID6 Q block. */ - stripe_nr = div_u64_rem(stripe_nr, - data_stripes, &stripe_index); + stripe_index = stripe_nr % data_stripes; + stripe_nr /= data_stripes; if (mirror_num > 1) stripe_index = data_stripes + mirror_num - 2; /* We distribute the parity blocks across stripes */ - div_u64_rem(stripe_nr + stripe_index, map->num_stripes, - &stripe_index); + stripe_index = (stripe_nr + stripe_index) % map->num_stripes; if (!need_full_stripe(op) && mirror_num <= 1) mirror_num = 1; } } else { /* - * after this, stripe_nr is the number of stripes on this + * After this, stripe_nr is the number of stripes on this * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ - stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, - &stripe_index); + stripe_index = stripe_nr % map->num_stripes; + stripe_nr /= map->num_stripes; mirror_num = stripe_index + 1; } if (stripe_index >= map->num_stripes) { @@ -6512,13 +6380,16 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } num_alloc_stripes = num_stripes; - if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { - if (op == BTRFS_MAP_WRITE) - num_alloc_stripes <<= 1; - if (op == BTRFS_MAP_GET_READ_MIRRORS) - num_alloc_stripes++; - tgtdev_indexes = num_stripes; - } + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && + op != BTRFS_MAP_READ) + /* + * For replace case, we need to add extra stripes for extra + * duplicated stripes. + * + * For both WRITE and GET_READ_MIRRORS, we may have at most + * 2 more stripes (DUP types, otherwise 1). + */ + num_alloc_stripes += 2; /* * If this I/O maps to a single device, try to return the device and @@ -6529,53 +6400,53 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && (!need_full_stripe(op) || !dev_replace_is_ongoing || !dev_replace->tgtdev)) { - if (patch_the_first_stripe_for_dev_replace) { - smap->dev = dev_replace->tgtdev; - smap->physical = physical_to_patch_in_first_stripe; - *mirror_num_ret = map->num_stripes + 1; - } else { - set_io_stripe(smap, map, stripe_index, stripe_offset, - stripe_nr); - *mirror_num_ret = mirror_num; - } + set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); + *mirror_num_ret = mirror_num; *bioc_ret = NULL; ret = 0; goto out; } - bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); + bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes); if (!bioc) { ret = -ENOMEM; goto out; } + bioc->map_type = map->type; - for (i = 0; i < num_stripes; i++) { - set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset, - stripe_nr); - stripe_index++; - } - - /* Build raid_map */ + /* + * For RAID56 full map, we need to make sure the stripes[] follows the + * rule that data stripes are all ordered, then followed with P and Q + * (if we have). + * + * It's still mostly the same as other profiles, just with extra rotation. + */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { - u64 tmp; - unsigned rot; - - /* Work out the disk rotation on this stripe-set */ - div_u64_rem(stripe_nr, num_stripes, &rot); - - /* Fill in the logical address of each stripe */ - tmp = stripe_nr * data_stripes; - for (i = 0; i < data_stripes; i++) - bioc->raid_map[(i + rot) % num_stripes] = - em->start + (tmp + i) * map->stripe_len; - - bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; - if (map->type & BTRFS_BLOCK_GROUP_RAID6) - bioc->raid_map[(i + rot + 1) % num_stripes] = - RAID6_Q_STRIPE; - - sort_parity_stripes(bioc, num_stripes); + /* + * For RAID56 @stripe_nr is already the number of full stripes + * before us, which is also the rotation value (needs to modulo + * with num_stripes). + * + * In this case, we just add @stripe_nr with @i, then do the + * modulo, to reduce one modulo call. + */ + bioc->full_stripe_logical = em->start + + ((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT); + for (i = 0; i < num_stripes; i++) + set_io_stripe(&bioc->stripes[i], map, + (i + stripe_nr) % num_stripes, + stripe_offset, stripe_nr); + } else { + /* + * For all other non-RAID56 profiles, just copy the target + * stripe into the bioc. + */ + for (i = 0; i < num_stripes; i++) { + set_io_stripe(&bioc->stripes[i], map, stripe_index, + stripe_offset, stripe_nr); + stripe_index++; + } } if (need_full_stripe(op)) @@ -6583,27 +6454,15 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { - handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, + handle_ops_on_dev_replace(op, bioc, dev_replace, logical, &num_stripes, &max_errors); } *bioc_ret = bioc; - bioc->map_type = map->type; bioc->num_stripes = num_stripes; bioc->max_errors = max_errors; bioc->mirror_num = mirror_num; - /* - * this is the case that REQ_READ && dev_replace_is_ongoing && - * mirror_num == num_stripes + 1 && dev_replace target drive is - * available as a mirror - */ - if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { - WARN_ON(num_stripes > 1); - bioc->stripes[0].dev = dev_replace->tgtdev; - bioc->stripes[0].physical = physical_to_patch_in_first_stripe; - bioc->mirror_num = map->num_stripes + 1; - } out: if (dev_replace_is_ongoing) { lockdep_assert_held(&dev_replace->rwsem); @@ -6941,7 +6800,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->num_stripes = num_stripes; map->io_width = btrfs_chunk_io_width(leaf, chunk); map->io_align = btrfs_chunk_io_align(leaf, chunk); - map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); map->type = type; /* * We can't use the sub_stripes value, as for profiles other than @@ -8161,3 +8019,76 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) return true; } + +static void map_raid56_repair_block(struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, + u64 logical) +{ + int data_stripes = nr_bioc_data_stripes(bioc); + int i; + + for (i = 0; i < data_stripes; i++) { + u64 stripe_start = bioc->full_stripe_logical + + (i << BTRFS_STRIPE_LEN_SHIFT); + + if (logical >= stripe_start && + logical < stripe_start + BTRFS_STRIPE_LEN) + break; + } + ASSERT(i < data_stripes); + smap->dev = bioc->stripes[i].dev; + smap->physical = bioc->stripes[i].physical + + ((logical - bioc->full_stripe_logical) & + BTRFS_STRIPE_LEN_MASK); +} + +/* + * Map a repair write into a single device. + * + * A repair write is triggered by read time repair or scrub, which would only + * update the contents of a single device. + * Not update any other mirrors nor go through RMW path. + * + * Callers should ensure: + * + * - Call btrfs_bio_counter_inc_blocked() first + * - The range does not cross stripe boundary + * - Has a valid @mirror_num passed in. + */ +int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, + struct btrfs_io_stripe *smap, u64 logical, + u32 length, int mirror_num) +{ + struct btrfs_io_context *bioc = NULL; + u64 map_length = length; + int mirror_ret = mirror_num; + int ret; + + ASSERT(mirror_num > 0); + + ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, + &bioc, smap, &mirror_ret, true); + if (ret < 0) + return ret; + + /* The map range should not cross stripe boundary. */ + ASSERT(map_length >= length); + + /* Already mapped to single stripe. */ + if (!bioc) + goto out; + + /* Map the RAID56 multi-stripe writes to a single one. */ + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + map_raid56_repair_block(bioc, smap, logical); + goto out; + } + + ASSERT(mirror_num <= bioc->num_stripes); + smap->dev = bioc->stripes[mirror_num - 1].dev; + smap->physical = bioc->stripes[mirror_num - 1].physical; +out: + btrfs_put_bioc(bioc); + ASSERT(smap->dev); + return 0; +} |