summaryrefslogtreecommitdiff
path: root/drivers/md/raid10.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--drivers/md/raid10.c121
1 files changed, 98 insertions, 23 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64d48249c03b..018741ba9310 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
* near_copies (stored in low byte of layout)
* far_copies (stored in second byte of layout)
* far_offset (stored in bit 16 of layout )
+ * use_far_sets (stored in bit 17 of layout )
*
- * The data to be stored is divided into chunks using chunksize.
- * Each device is divided into far_copies sections.
- * In each section, chunks are laid out in a style similar to raid0, but
- * near_copies copies of each chunk is stored (each on a different drive).
- * The starting device for each section is offset near_copies from the starting
- * device of the previous section.
- * Thus they are (near_copies*far_copies) of each chunk, and each is on a different
- * drive.
- * near_copies and far_copies must be at least one, and their product is at most
- * raid_disks.
+ * The data to be stored is divided into chunks using chunksize. Each device
+ * is divided into far_copies sections. In each section, chunks are laid out
+ * in a style similar to raid0, but near_copies copies of each chunk are stored
+ * (each on a different drive). The starting device for each section is offset
+ * near_copies from the starting device of the previous section. Thus there
+ * are (near_copies * far_copies) copies of each chunk, and each is on a
+ * different drive. near_copies and far_copies must be at least one, and their
+ * product is at most raid_disks.
*
* If far_offset is true, then the far_copies are handled a bit differently.
- * The copies are still in different stripes, but instead of be very far apart
- * on disk, there are adjacent stripes.
+ * The copies are still in different stripes, but instead of being very far
+ * apart on disk, they are in adjacent stripes.
+ *
+ * The far and offset algorithms are handled slightly differently if
+ * 'use_far_sets' is true. In this case, the array's devices are grouped into
+ * sets that are (near_copies * far_copies) in size. The far-copied stripes
+ * are still shifted by 'near_copies' devices, but this shifting stays confined
+ * to the set rather than the entire array. This is done to increase the number
+ * of device combinations that can fail without causing the array to fail.
+ * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
+ * on a device; a 4-device array is shown on the left, a 5-device array on
+ * the right):
+ *    A B C D    A B C D E
+ *      ...         ...
+ *    D A B C    E A B C D
+ * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
+ *    [A B] [C D]    [A B] [C D E]
+ *    |...| |...|    |...| | ... |
+ *    [B A] [D C]    [B A] [E C D]
*/
/*
@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
sector_t stripe;
int dev;
int slot = 0;
+ int last_far_set_start, last_far_set_size;
+
+ last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+ last_far_set_start *= geo->far_set_size;
+
+ last_far_set_size = geo->far_set_size;
+ last_far_set_size += (geo->raid_disks % geo->far_set_size);
/* now calculate first sector/dev */
chunk = r10bio->sector >> geo->chunk_shift;
@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
/* and calculate all the others */
for (n = 0; n < geo->near_copies; n++) {
int d = dev;
+ int set;
sector_t s = sector;
- r10bio->devs[slot].addr = sector;
r10bio->devs[slot].devnum = d;
+ r10bio->devs[slot].addr = s;
slot++;
for (f = 1; f < geo->far_copies; f++) {
+ set = d / geo->far_set_size;
d += geo->near_copies;
- if (d >= geo->raid_disks)
- d -= geo->raid_disks;
+
+ if ((geo->raid_disks % geo->far_set_size) &&
+ (d > last_far_set_start)) {
+ d -= last_far_set_start;
+ d %= last_far_set_size;
+ d += last_far_set_start;
+ } else {
+ d %= geo->far_set_size;
+ d += geo->far_set_size * set;
+ }
s += geo->stride;
r10bio->devs[slot].devnum = d;
r10bio->devs[slot].addr = s;
@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
* or recovery, so reshape isn't happening
*/
struct geom *geo = &conf->geo;
+ int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
+ int far_set_size = geo->far_set_size;
+ int last_far_set_start;
+
+ if (geo->raid_disks % geo->far_set_size) {
+ last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+ last_far_set_start *= geo->far_set_size;
+
+ if (dev >= last_far_set_start) {
+ far_set_size = geo->far_set_size;
+ far_set_size += (geo->raid_disks % geo->far_set_size);
+ far_set_start = last_far_set_start;
+ }
+ }
offset = sector & geo->chunk_mask;
if (geo->far_offset) {
@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
chunk = sector >> geo->chunk_shift;
fc = sector_div(chunk, geo->far_copies);
dev -= fc * geo->near_copies;
- if (dev < 0)
- dev += geo->raid_disks;
+ if (dev < far_set_start)
+ dev += far_set_size;
} else {
while (sector >= geo->stride) {
sector -= geo->stride;
- if (dev < geo->near_copies)
- dev += geo->raid_disks - geo->near_copies;
+ if (dev < (geo->near_copies + far_set_start))
+ dev += far_set_size - geo->near_copies;
else
dev -= geo->near_copies;
}
@@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
bio_list_merge(&conf->pending_bio_list, &plug->pending);
conf->pending_count += plug->pending_cnt;
spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_barrier);
md_wakeup_thread(mddev->thread);
kfree(plug);
return;
@@ -1086,7 +1133,12 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
bio->bi_next = NULL;
- generic_make_request(bio);
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ else
+ generic_make_request(bio);
bio = next;
}
kfree(plug);
@@ -1105,6 +1157,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
const unsigned long do_discard = (bio->bi_rw
& (REQ_DISCARD | REQ_SECURE));
+ const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
unsigned long flags;
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
@@ -1460,7 +1513,8 @@ retry_write:
rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+ mbio->bi_rw =
+ WRITE | do_sync | do_fua | do_discard | do_same;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@@ -1502,7 +1556,8 @@ retry_write:
r10_bio, rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+ mbio->bi_rw =
+ WRITE | do_sync | do_fua | do_discard | do_same;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@@ -2863,6 +2918,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
if (init_resync(conf))
return 0;
+ /*
+ * Allow skipping a full rebuild for incremental assembly
+ * of a clean array, like RAID1 does.
+ */
+ if (mddev->bitmap == NULL &&
+ mddev->recovery_cp == MaxSector &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ conf->fullsync == 0) {
+ *skipped = 1;
+ max_sector = mddev->dev_sectors;
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ max_sector = mddev->resync_max_sectors;
+ return max_sector - sector_nr;
+ }
+
skipped:
max_sector = mddev->dev_sectors;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -3436,7 +3507,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
disks = mddev->raid_disks + mddev->delta_disks;
break;
}
- if (layout >> 17)
+ if (layout >> 18)
return -1;
if (chunk < (PAGE_SIZE >> 9) ||
!is_power_of_2(chunk))
@@ -3448,6 +3519,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
geo->near_copies = nc;
geo->far_copies = fc;
geo->far_offset = fo;
+ geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
geo->chunk_mask = chunk - 1;
geo->chunk_shift = ffz(~chunk);
return nc*fc;
@@ -3569,6 +3641,8 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
blk_queue_max_discard_sectors(mddev->queue,
mddev->chunk_sectors);
+ blk_queue_max_write_same_sectors(mddev->queue,
+ mddev->chunk_sectors);
blk_queue_io_min(mddev->queue, chunk_size);
if (conf->geo.raid_disks % conf->geo.near_copies)
blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
@@ -3757,6 +3831,7 @@ static int stop(struct mddev *mddev)
if (conf->r10bio_pool)
mempool_destroy(conf->r10bio_pool);
+ safe_put_page(conf->tmppage);
kfree(conf->mirrors);
kfree(conf);
mddev->private = NULL;