From 6b6c8110e173ce10f2b169d82a6670001f7184d1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:13 +1100 Subject: md/raid1, raid10: move rXbio accounting closer to allocation. When raid1 or raid10 find they will need to allocate a new r1bio/r10bio, in order to work around a known bad block, they account for the allocation well before the allocation is made. This separation makes the correctness less obvious and requires comments. The accounting needs to be a little before: before the first rXbio is submitted, but that is all. So move the accounting down to where it makes more sense. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e89a8d78a9ed..c7c5b2693fc9 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1384,18 +1384,8 @@ retry_write: goto retry_write; } - if (max_sectors < r10_bio->sectors) { - /* We are splitting this into multiple parts, so - * we need to prepare for allocating another r10_bio. - */ + if (max_sectors < r10_bio->sectors) r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - } sectors_handled = r10_bio->sector + max_sectors - bio->bi_iter.bi_sector; @@ -1505,10 +1495,16 @@ retry_write: */ if (sectors_handled < bio_sectors(bio)) { - one_write_done(r10_bio); - /* We need another r10_bio. It has already been counted + /* We need another r10_bio and it needs to be counted * in bio->bi_phys_segments. 
*/ + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + one_write_done(r10_bio); r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; -- cgit v1.2.3 From fd16f2e8489100eb8005483ff630856bce51f803 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:13 +1100 Subject: md/raid10: stop using bi_phys_segments raid10 currently repurposes bi_phys_segments on each incoming bio to count how many r10bio was used to encode the request. We need to know when the number of attached r10bio reaches zero to: 1/ call bio_endio() when all IO on the bio is finished 2/ decrement ->nr_pending so that resync IO can proceed. Now that the bio has its own __bi_remaining counter, that can be used instead. We can call bio_inc_remaining to increment the counter and call bio_endio() every time an r10bio completes, rather than only when bi_phys_segments reaches zero. This addresses point 1, but not point 2. bio_endio() doesn't (and cannot) report when the last r10bio has finished, so a different approach is needed. So: instead of counting bios in ->nr_pending, count r10bios. i.e. every time we attach a bio, increment nr_pending. Every time an r10bio completes, decrement nr_pending. Normally we only increment nr_pending after first checking that ->barrier is zero, or some other non-trivial tests and possible waiting. When attaching multiple r10bios to a bio, we only need the tests and the waiting once. After the first increment, subsequent increments can happen unconditionally as they are really all part of the one request. So introduce inc_pending() which can be used when we know that nr_pending is already elevated. Note that this fixes a bug. freeze_array() contains the line atomic_read(&conf->nr_pending) == conf->nr_queued+extra, which implies that the units for ->nr_pending, ->nr_queued and extra are the same. 
->nr_queued and extra count r10_bios, but prior to this patch, ->nr_pending counted bios. If a bio ever resulted in multiple r10_bios (due to bad blocks), freeze_array() would not work correctly. Now it does. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 76 ++++++++++++++++++----------------------------------- 1 file changed, 25 insertions(+), 51 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c7c5b2693fc9..0f1b78b38649 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -301,27 +301,18 @@ static void reschedule_retry(struct r10bio *r10_bio) static void raid_end_bio_io(struct r10bio *r10_bio) { struct bio *bio = r10_bio->master_bio; - int done; struct r10conf *conf = r10_bio->mddev->private; - if (bio->bi_phys_segments) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - bio->bi_phys_segments--; - done = (bio->bi_phys_segments == 0); - spin_unlock_irqrestore(&conf->device_lock, flags); - } else - done = 1; if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) bio->bi_error = -EIO; - if (done) { - bio_endio(bio); - /* - * Wake up any possible resync thread that waits for the device - * to go idle. - */ - allow_barrier(conf); - } + + bio_endio(bio); + /* + * Wake up any possible resync thread that waits for the device + * to go idle. + */ + allow_barrier(conf); + free_r10bio(r10_bio); } @@ -985,6 +976,15 @@ static void wait_barrier(struct r10conf *conf) spin_unlock_irq(&conf->resync_lock); } +static void inc_pending(struct r10conf *conf) +{ + /* The current request requires multiple r10_bio, so + * we need to increment the pending count.
+ */ + WARN_ON(!atomic_read(&conf->nr_pending)); + atomic_inc(&conf->nr_pending); +} + static void allow_barrier(struct r10conf *conf) { if ((atomic_dec_and_test(&conf->nr_pending)) || @@ -1162,12 +1162,8 @@ read_again: sectors_handled = (r10_bio->sector + max_sectors - bio->bi_iter.bi_sector); r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); + inc_pending(conf); + bio_inc_remaining(bio); /* * Cannot call generic_make_request directly as that will be * queued in __generic_make_request and subsequent @@ -1262,9 +1258,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, * on which we have seen a write error, we want to avoid * writing to those blocks. This potentially requires several * writes to write around the bad blocks. Each set of writes - * gets its own r10_bio with a set of bios attached. The number - * of r10_bios is recored in bio->bi_phys_segments just as with - * the read case. + * gets its own r10_bio with a set of bios attached. */ r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ @@ -1495,15 +1489,9 @@ retry_write: */ if (sectors_handled < bio_sectors(bio)) { - /* We need another r10_bio and it needs to be counted - * in bio->bi_phys_segments. 
- */ - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); + /* We need another r10_bio and it needs to be counted */ + inc_pending(conf); + bio_inc_remaining(bio); one_write_done(r10_bio); r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); @@ -1532,16 +1520,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio) r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; - /* - * We might need to issue multiple reads to different devices if there - * are bad blocks around, so we keep track of the number of reads in - * bio->bi_phys_segments. If this is 0, there is only one r10_bio and - * no locking will be needed when the request completes. If it is - * non-zero, then it is the number of not-completed requests. - */ - bio->bi_phys_segments = 0; - bio_clear_flag(bio, BIO_SEG_VALID); - if (bio_data_dir(bio) == READ) raid10_read_request(mddev, bio, r10_bio); else @@ -2693,12 +2671,8 @@ read_more: r10_bio->sector + max_sectors - mbio->bi_iter.bi_sector; r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (mbio->bi_phys_segments == 0) - mbio->bi_phys_segments = 2; - else - mbio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); + bio_inc_remaining(mbio); + inc_pending(conf); generic_make_request(bio); r10_bio = mempool_alloc(conf->r10bio_pool, -- cgit v1.2.3 From 27f26a0f3767b6688b9a88b9becb6f8e760421f3 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 20 Mar 2017 17:46:04 +0800 Subject: md/raid10: refactor some codes from raid10_write_request Previously, we clone both bio and repl_bio in raid10_write_request, then add the cloned bio to plug->pending or conf->pending_bio_list based on plug or not, and most of the logics are same for the two conditions. So introduce raid10_write_one_disk for it, and use replacement parameter to distinguish the difference. No functional changes in the patch. 
Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 175 ++++++++++++++++++++++------------------------------ 1 file changed, 75 insertions(+), 100 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0f1b78b38649..28c62e0c42aa 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1185,18 +1185,82 @@ read_again: return; } -static void raid10_write_request(struct mddev *mddev, struct bio *bio, - struct r10bio *r10_bio) +static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, + struct bio *bio, bool replacement, + int n_copy, int max_sectors) { - struct r10conf *conf = mddev->private; - int i; const int op = bio_op(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); const unsigned long do_fua = (bio->bi_opf & REQ_FUA); unsigned long flags; - struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid10_plug_cb *plug = NULL; + struct r10conf *conf = mddev->private; + struct md_rdev *rdev; + int devnum = r10_bio->devs[n_copy].devnum; + struct bio *mbio; + + if (replacement) { + rdev = conf->mirrors[devnum].replacement; + if (rdev == NULL) { + /* Replacement just got moved to main 'rdev' */ + smp_mb(); + rdev = conf->mirrors[devnum].rdev; + } + } else + rdev = conf->mirrors[devnum].rdev; + + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); + if (replacement) + r10_bio->devs[n_copy].repl_bio = mbio; + else + r10_bio->devs[n_copy].bio = mbio; + + mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + + choose_data_offset(r10_bio, rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + bio_set_op_attrs(mbio, op, do_sync | do_fua); + if (!replacement && test_bit(FailFast, + &conf->mirrors[devnum].rdev->flags) + && enough(conf, devnum)) + mbio->bi_opf |= MD_FAILFAST; + mbio->bi_private = r10_bio; + + if (conf->mddev->gendisk) + 
trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), + mbio, disk_devt(conf->mddev->gendisk), + r10_bio->sector); + /* flush_pending_writes() needs access to the rdev so...*/ + mbio->bi_bdev = (void *)rdev; + + atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, cb); + else + plug = NULL; + spin_lock_irqsave(&conf->device_lock, flags); + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!plug) + md_wakeup_thread(mddev->thread); +} + +static void raid10_write_request(struct mddev *mddev, struct bio *bio, + struct r10bio *r10_bio) +{ + struct r10conf *conf = mddev->private; + int i; + struct md_rdev *blocked_rdev; sector_t sectors; int sectors_handled; int max_sectors; @@ -1387,101 +1451,12 @@ retry_write: bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); for (i = 0; i < conf->copies; i++) { - struct bio *mbio; - int d = r10_bio->devs[i].devnum; - if (r10_bio->devs[i].bio) { - struct md_rdev *rdev = conf->mirrors[d].rdev; - mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, - max_sectors); - r10_bio->devs[i].bio = mbio; - - mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ - choose_data_offset(r10_bio, rdev)); - mbio->bi_bdev = rdev->bdev; - mbio->bi_end_io = raid10_end_write_request; - bio_set_op_attrs(mbio, op, do_sync | do_fua); - if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) && - enough(conf, d)) - mbio->bi_opf |= MD_FAILFAST; - mbio->bi_private = r10_bio; - - if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), - mbio, disk_devt(conf->mddev->gendisk), - r10_bio->sector); - /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_bdev = (void*)rdev; - - 
atomic_inc(&r10_bio->remaining); - - cb = blk_check_plugged(raid10_unplug, mddev, - sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid10_plug_cb, - cb); - else - plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); - if (plug) { - bio_list_add(&plug->pending, mbio); - plug->pending_cnt++; - } else { - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) - md_wakeup_thread(mddev->thread); - } - - if (r10_bio->devs[i].repl_bio) { - struct md_rdev *rdev = conf->mirrors[d].replacement; - if (rdev == NULL) { - /* Replacement just got moved to main 'rdev' */ - smp_mb(); - rdev = conf->mirrors[d].rdev; - } - mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, - max_sectors); - r10_bio->devs[i].repl_bio = mbio; - - mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr + - choose_data_offset(r10_bio, rdev)); - mbio->bi_bdev = rdev->bdev; - mbio->bi_end_io = raid10_end_write_request; - bio_set_op_attrs(mbio, op, do_sync | do_fua); - mbio->bi_private = r10_bio; - - if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), - mbio, disk_devt(conf->mddev->gendisk), - r10_bio->sector); - /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_bdev = (void*)rdev; - - atomic_inc(&r10_bio->remaining); - - cb = blk_check_plugged(raid10_unplug, mddev, - sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid10_plug_cb, - cb); - else - plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); - if (plug) { - bio_list_add(&plug->pending, mbio); - plug->pending_cnt++; - } else { - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) - md_wakeup_thread(mddev->thread); - } + if (r10_bio->devs[i].bio) + raid10_write_one_disk(mddev, r10_bio, bio, false, + i, max_sectors); + if 
(r10_bio->devs[i].repl_bio) + raid10_write_one_disk(mddev, r10_bio, bio, true, + i, max_sectors); } /* Don't remove the bias on 'remaining' (one_write_done) until -- cgit v1.2.3 From c85ba149de4bd14aa028ac824f9f12aeded28b86 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:22 +0800 Subject: md: raid1/raid10: don't handle failure of bio_add_page() All bio_add_page() is for adding one page into resync bio, which is big enough to hold RESYNC_PAGES pages, and the current bio_add_page() doesn't check queue limit any more, so it won't fail at all. remove unused label (shaohua) Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 22 ++++++---------------- drivers/md/raid10.c | 41 ++++++++++------------------------------- 2 files changed, 16 insertions(+), 47 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 941f81063891..569f501fb710 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2894,28 +2894,18 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r1_bio->bios[i]; if (bio->bi_end_io) { page = bio->bi_io_vec[bio->bi_vcnt].bv_page; - if (bio_add_page(bio, page, len, 0) == 0) { - /* stop here */ - bio->bi_io_vec[bio->bi_vcnt].bv_page = page; - while (i > 0) { - i--; - bio = r1_bio->bios[i]; - if (bio->bi_end_io==NULL) - continue; - /* remove last page from this bio */ - bio->bi_vcnt--; - bio->bi_iter.bi_size -= len; - bio_clear_flag(bio, BIO_SEG_VALID); - } - goto bio_full; - } + + /* + * won't fail because the vec table is big + * enough to hold all these pages + */ + bio_add_page(bio, page, len, 0); } } nr_sectors += len>>9; sector_nr += len>>9; sync_blocks -= (len>>9); } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); - bio_full: r1_bio->sectors = nr_sectors; if (mddev_is_clustered(mddev) && diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 28c62e0c42aa..6c9783ba85db 100644 --- a/drivers/md/raid10.c +++ 
b/drivers/md/raid10.c @@ -3413,27 +3413,16 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; for (bio= biolist ; bio ; bio=bio->bi_next) { - struct bio *bio2; page = bio->bi_io_vec[bio->bi_vcnt].bv_page; - if (bio_add_page(bio, page, len, 0)) - continue; - - /* stop here */ - bio->bi_io_vec[bio->bi_vcnt].bv_page = page; - for (bio2 = biolist; - bio2 && bio2 != bio; - bio2 = bio2->bi_next) { - /* remove last page from this bio */ - bio2->bi_vcnt--; - bio2->bi_iter.bi_size -= len; - bio_clear_flag(bio2, BIO_SEG_VALID); - } - goto bio_full; + /* + * won't fail because the vec table is big enough + * to hold all these pages + */ + bio_add_page(bio, page, len, 0); } nr_sectors += len>>9; sector_nr += len>>9; } while (biolist->bi_vcnt < RESYNC_PAGES); - bio_full: r10_bio->sectors = nr_sectors; while (biolist) { @@ -4502,25 +4491,15 @@ read_more: if (len > PAGE_SIZE) len = PAGE_SIZE; for (bio = blist; bio ; bio = bio->bi_next) { - struct bio *bio2; - if (bio_add_page(bio, page, len, 0)) - continue; - - /* Didn't fit, must stop */ - for (bio2 = blist; - bio2 && bio2 != bio; - bio2 = bio2->bi_next) { - /* Remove last page from this bio */ - bio2->bi_vcnt--; - bio2->bi_iter.bi_size -= len; - bio_clear_flag(bio2, BIO_SEG_VALID); - } - goto bio_full; + /* + * won't fail because the vec table is big enough + * to hold all these pages + */ + bio_add_page(bio, page, len, 0); } sector_nr += len >> 9; nr_sectors += len >> 9; } -bio_full: rcu_read_unlock(); r10_bio->sectors = nr_sectors; -- cgit v1.2.3 From d8e29fbc3bed181f2653fb89ac8c34e40db39c30 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:23 +0800 Subject: md: move two macros into md.h Both raid1 and raid10 share common resync block size and page count, so move them into md.h. 
Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/md.h | 5 +++++ drivers/md/raid1.c | 2 -- drivers/md/raid10.c | 3 --- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/md.h b/drivers/md/md.h index 7a7847d1cc39..31d2d70849b6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -718,4 +718,9 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) mddev->queue->limits.max_write_same_sectors = 0; } + +/* Maximum size of each resync request */ +#define RESYNC_BLOCK_SIZE (64*1024) +#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) + #endif /* _MD_MD_H */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 569f501fb710..c31f9e206148 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -95,10 +95,8 @@ static void r1bio_pool_free(void *r1_bio, void *data) kfree(r1_bio); } -#define RESYNC_BLOCK_SIZE (64*1024) #define RESYNC_DEPTH 32 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) -#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6c9783ba85db..301e73fe2d2f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -125,9 +125,6 @@ static void r10bio_pool_free(void *r10_bio, void *data) kfree(r10_bio); } -/* Maximum size of each resync request */ -#define RESYNC_BLOCK_SIZE (64*1024) -#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) /* amount of memory to reserve for resync requests */ #define RESYNC_WINDOW (1024*1024) /* maximum number of concurrent requests, memory permitting */ -- cgit v1.2.3 From 81fa152008ac903877b59bcc7d16777c3292c206 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:32 +0800 Subject: 
md: raid10: refactor code of read reshape's .bi_end_io reshape read request is a bit special and requires one extra bio which isn't allocated from r10buf_pool. Refactor the .bi_end_io for read reshape, so that we can use raid10's resync page management approach easily in the following patches. Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 301e73fe2d2f..94f5c368c683 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1889,17 +1889,9 @@ abort: return err; } -static void end_sync_read(struct bio *bio) +static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d) { - struct r10bio *r10_bio = bio->bi_private; struct r10conf *conf = r10_bio->mddev->private; - int d; - - if (bio == r10_bio->master_bio) { - /* this is a reshape read */ - d = r10_bio->read_slot; /* really the read dev */ - } else - d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); if (!bio->bi_error) set_bit(R10BIO_Uptodate, &r10_bio->state); @@ -1923,6 +1915,22 @@ static void end_sync_read(struct bio *bio) } } +static void end_sync_read(struct bio *bio) +{ + struct r10bio *r10_bio = bio->bi_private; + struct r10conf *conf = r10_bio->mddev->private; + int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); + + __end_sync_read(r10_bio, bio, d); +} + +static void end_reshape_read(struct bio *bio) +{ + struct r10bio *r10_bio = bio->bi_private; + + __end_sync_read(r10_bio, bio, r10_bio->read_slot); +} + static void end_sync_request(struct r10bio *r10_bio) { struct mddev *mddev = r10_bio->mddev; @@ -4438,7 +4446,7 @@ read_more: read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset); read_bio->bi_private = r10_bio; - read_bio->bi_end_io = end_reshape_read; bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
read_bio->bi_flags &= (~0UL << BIO_RESET_BITS); read_bio->bi_error = 0; -- cgit v1.2.3 From f0250618361db1447d66c494c6dd2df815f42c87 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:33 +0800 Subject: md: raid10: don't use bio's vec table to manage resync pages Now we allocate one page array for managing resync pages, instead of using bio's vec table to do that, and the old way is very hacky and won't work any more if multipage bvec is enabled. The introduced cost is that we need to allocate (128 + 16) * copies bytes per r10_bio, and it is fine because the inflight r10_bio for resync shouldn't be much, as pointed by Shaohua. Also bio_reset() in raid10_sync_request() and reshape_request() are removed because all bios are freshly new now in these functions and not necessary to reset any more. This patch can be thought as cleanup too. Suggested-by: Shaohua Li Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 134 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 82 insertions(+), 52 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 94f5c368c683..629ae45bb18e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -110,6 +110,24 @@ static void end_reshape(struct r10conf *conf); #define raid10_log(md, fmt, args...) \ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) +/* + * 'strct resync_pages' stores actual pages used for doing the resync + * IO, and it is per-bio, so make .bi_private points to it. + */ +static inline struct resync_pages *get_resync_pages(struct bio *bio) +{ + return bio->bi_private; +} + +/* + * for resync bio, r10bio pointer can be retrieved from the per-bio + * 'struct resync_pages'. 
+ */ +static inline struct r10bio *get_resync_r10bio(struct bio *bio) +{ + return get_resync_pages(bio)->raid_bio; +} + static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { struct r10conf *conf = data; @@ -140,11 +158,11 @@ static void r10bio_pool_free(void *r10_bio, void *data) static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) { struct r10conf *conf = data; - struct page *page; struct r10bio *r10_bio; struct bio *bio; - int i, j; - int nalloc; + int j; + int nalloc, nalloc_rp; + struct resync_pages *rps; r10_bio = r10bio_pool_alloc(gfp_flags, conf); if (!r10_bio) @@ -156,6 +174,15 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) else nalloc = 2; /* recovery */ + /* allocate once for all bios */ + if (!conf->have_replacement) + nalloc_rp = nalloc; + else + nalloc_rp = nalloc * 2; + rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags); + if (!rps) + goto out_free_r10bio; + /* * Allocate bios. */ @@ -175,36 +202,40 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) * Allocate RESYNC_PAGES data pages and attach them * where needed. 
*/ - for (j = 0 ; j < nalloc; j++) { + for (j = 0; j < nalloc; j++) { struct bio *rbio = r10_bio->devs[j].repl_bio; + struct resync_pages *rp, *rp_repl; + + rp = &rps[j]; + if (rbio) + rp_repl = &rps[nalloc + j]; + bio = r10_bio->devs[j].bio; - for (i = 0; i < RESYNC_PAGES; i++) { - if (j > 0 && !test_bit(MD_RECOVERY_SYNC, - &conf->mddev->recovery)) { - /* we can share bv_page's during recovery - * and reshape */ - struct bio *rbio = r10_bio->devs[0].bio; - page = rbio->bi_io_vec[i].bv_page; - get_page(page); - } else - page = alloc_page(gfp_flags); - if (unlikely(!page)) + + if (!j || test_bit(MD_RECOVERY_SYNC, + &conf->mddev->recovery)) { + if (resync_alloc_pages(rp, gfp_flags)) goto out_free_pages; + } else { + memcpy(rp, &rps[0], sizeof(*rp)); + resync_get_all_pages(rp); + } - bio->bi_io_vec[i].bv_page = page; - if (rbio) - rbio->bi_io_vec[i].bv_page = page; + rp->idx = 0; + rp->raid_bio = r10_bio; + bio->bi_private = rp; + if (rbio) { + memcpy(rp_repl, rp, sizeof(*rp)); + rbio->bi_private = rp_repl; } } return r10_bio; out_free_pages: - for ( ; i > 0 ; i--) - safe_put_page(bio->bi_io_vec[i-1].bv_page); - while (j--) - for (i = 0; i < RESYNC_PAGES ; i++) - safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); + while (--j >= 0) + resync_free_pages(&rps[j * 2]); + j = 0; out_free_bio: for ( ; j < nalloc; j++) { @@ -213,30 +244,34 @@ out_free_bio: if (r10_bio->devs[j].repl_bio) bio_put(r10_bio->devs[j].repl_bio); } + kfree(rps); +out_free_r10bio: r10bio_pool_free(r10_bio, conf); return NULL; } static void r10buf_pool_free(void *__r10_bio, void *data) { - int i; struct r10conf *conf = data; struct r10bio *r10bio = __r10_bio; int j; + struct resync_pages *rp = NULL; - for (j=0; j < conf->copies; j++) { + for (j = conf->copies; j--; ) { struct bio *bio = r10bio->devs[j].bio; - if (bio) { - for (i = 0; i < RESYNC_PAGES; i++) { - safe_put_page(bio->bi_io_vec[i].bv_page); - bio->bi_io_vec[i].bv_page = NULL; - } - bio_put(bio); - } + + rp = get_resync_pages(bio); + 
resync_free_pages(rp); + bio_put(bio); + bio = r10bio->devs[j].repl_bio; if (bio) bio_put(bio); } + + /* resync pages array stored in the 1st bio's .bi_private */ + kfree(rp); + r10bio_pool_free(r10bio, conf); } @@ -1917,7 +1952,7 @@ static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d) static void end_sync_read(struct bio *bio) { - struct r10bio *r10_bio = bio->bi_private; + struct r10bio *r10_bio = get_resync_r10bio(bio); struct r10conf *conf = r10_bio->mddev->private; int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); @@ -1926,6 +1961,7 @@ static void end_sync_read(struct bio *bio) static void end_reshape_read(struct bio *bio) { + /* reshape read bio isn't allocated from r10buf_pool */ struct r10bio *r10_bio = bio->bi_private; __end_sync_read(r10_bio, bio, r10_bio->read_slot); @@ -1960,7 +1996,7 @@ static void end_sync_request(struct r10bio *r10_bio) static void end_sync_write(struct bio *bio) { - struct r10bio *r10_bio = bio->bi_private; + struct r10bio *r10_bio = get_resync_r10bio(bio); struct mddev *mddev = r10_bio->mddev; struct r10conf *conf = mddev->private; int d; @@ -2040,6 +2076,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) for (i=0 ; i < conf->copies ; i++) { int j, d; struct md_rdev *rdev; + struct resync_pages *rp; tbio = r10_bio->devs[i].bio; @@ -2081,11 +2118,13 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) * First we need to fixup bv_offset, bv_len and * bi_vecs, as the read request might have corrupted these */ + rp = get_resync_pages(tbio); bio_reset(tbio); tbio->bi_vcnt = vcnt; tbio->bi_iter.bi_size = fbio->bi_iter.bi_size; - tbio->bi_private = r10_bio; + rp->raid_bio = r10_bio; + tbio->bi_private = rp; tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; tbio->bi_end_io = end_sync_write; bio_set_op_attrs(tbio, REQ_OP_WRITE, 0); @@ -3149,10 +3188,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } } bio = 
r10_bio->devs[0].bio; - bio_reset(bio); bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio_set_op_attrs(bio, REQ_OP_READ, 0); if (test_bit(FailFast, &rdev->flags)) @@ -3176,10 +3213,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (!test_bit(In_sync, &mrdev->flags)) { bio = r10_bio->devs[1].bio; - bio_reset(bio); bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr @@ -3204,10 +3239,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mreplace == NULL || bio == NULL || test_bit(Faulty, &mreplace->flags)) break; - bio_reset(bio); bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr + @@ -3329,7 +3362,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[i].repl_bio->bi_end_io = NULL; bio = r10_bio->devs[i].bio; - bio_reset(bio); bio->bi_error = -EIO; rcu_read_lock(); rdev = rcu_dereference(conf->mirrors[d].rdev); @@ -3354,7 +3386,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, atomic_inc(&r10_bio->remaining); bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio_set_op_attrs(bio, REQ_OP_READ, 0); if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) @@ -3373,13 +3404,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Need to set up for writing to the replacement */ bio = r10_bio->devs[i].repl_bio; - bio_reset(bio); bio->bi_error = -EIO; sector = r10_bio->devs[i].addr; bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); if (test_bit(FailFast, 
&conf->mirrors[d].rdev->flags)) @@ -3418,7 +3447,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; for (bio= biolist ; bio ; bio=bio->bi_next) { - page = bio->bi_io_vec[bio->bi_vcnt].bv_page; + struct resync_pages *rp = get_resync_pages(bio); + page = resync_fetch_page(rp, rp->idx++); /* * won't fail because the vec table is big enough * to hold all these pages @@ -3427,7 +3457,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } nr_sectors += len>>9; sector_nr += len>>9; - } while (biolist->bi_vcnt < RESYNC_PAGES); + } while (get_resync_pages(biolist)->idx < RESYNC_PAGES); r10_bio->sectors = nr_sectors; while (biolist) { @@ -3435,7 +3465,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, biolist = biolist->bi_next; bio->bi_next = NULL; - r10_bio = bio->bi_private; + r10_bio = get_resync_r10bio(bio); r10_bio->sectors = nr_sectors; if (bio->bi_end_io == end_sync_read) { @@ -4326,6 +4356,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, struct bio *blist; struct bio *bio, *read_bio; int sectors_done = 0; + struct page **pages; if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ @@ -4476,11 +4507,9 @@ read_more: if (!rdev2 || test_bit(Faulty, &rdev2->flags)) continue; - bio_reset(b); b->bi_bdev = rdev2->bdev; b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; - b->bi_private = r10_bio; b->bi_end_io = end_reshape_write; bio_set_op_attrs(b, REQ_OP_WRITE, 0); b->bi_next = blist; @@ -4490,8 +4519,9 @@ read_more: /* Now add as many pages as possible to all of these bios. 
*/ nr_sectors = 0; + pages = get_resync_pages(r10_bio->devs[0].bio)->pages; for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { - struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page; + struct page *page = pages[s / (PAGE_SIZE >> 9)]; int len = (max_sectors - s) << 9; if (len > PAGE_SIZE) len = PAGE_SIZE; @@ -4675,7 +4705,7 @@ static int handle_reshape_read_error(struct mddev *mddev, static void end_reshape_write(struct bio *bio) { - struct r10bio *r10_bio = bio->bi_private; + struct r10bio *r10_bio = get_resync_r10bio(bio); struct mddev *mddev = r10_bio->mddev; struct r10conf *conf = mddev->private; int d; -- cgit v1.2.3 From cdb76be31568604f389f951bd0efadd3f530f2dd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:34 +0800 Subject: md: raid10: retrieve page from preallocated resync page array Now one page array is allocated for each resync bio, and we can retrieve page from this table directly. Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 629ae45bb18e..827bb5bef53c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2055,6 +2055,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) int i, first; struct bio *tbio, *fbio; int vcnt; + struct page **tpages, **fpages; atomic_set(&r10_bio->remaining, 1); @@ -2070,6 +2071,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) fbio = r10_bio->devs[i].bio; fbio->bi_iter.bi_size = r10_bio->sectors << 9; fbio->bi_iter.bi_idx = 0; + fpages = get_resync_pages(fbio)->pages; vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); /* now find blocks with errors */ @@ -2084,6 +2086,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) continue; if (i == first) continue; + + tpages = 
get_resync_pages(tbio)->pages; d = r10_bio->devs[i].devnum; rdev = conf->mirrors[d].rdev; if (!r10_bio->devs[i].bio->bi_error) { @@ -2096,8 +2100,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) int len = PAGE_SIZE; if (sectors < (len / 512)) len = sectors * 512; - if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), - page_address(tbio->bi_io_vec[j].bv_page), + if (memcmp(page_address(fpages[j]), + page_address(tpages[j]), len)) break; sectors -= len/512; @@ -2195,6 +2199,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) int idx = 0; int dr = r10_bio->devs[0].devnum; int dw = r10_bio->devs[1].devnum; + struct page **pages = get_resync_pages(bio)->pages; while (sectors) { int s = sectors; @@ -2210,7 +2215,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) ok = sync_page_io(rdev, addr, s << 9, - bio->bi_io_vec[idx].bv_page, + pages[idx], REQ_OP_READ, 0, false); if (ok) { rdev = conf->mirrors[dw].rdev; @@ -2218,7 +2223,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) ok = sync_page_io(rdev, addr, s << 9, - bio->bi_io_vec[idx].bv_page, + pages[idx], REQ_OP_WRITE, 0, false); if (!ok) { set_bit(WriteErrorSeen, &rdev->flags); -- cgit v1.2.3 From 2d06e3b7145bb08705615e6e7400024d8e36a5c0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:35 +0800 Subject: md: raid10: avoid direct access to bvec table in handle_reshape_read_error All reshape I/O share pages from 1st copy device, so just use that pages for avoiding direct access to bvec table in handle_reshape_read_error. 
Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 827bb5bef53c..0f13d57ef646 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4653,7 +4653,10 @@ static int handle_reshape_read_error(struct mddev *mddev, struct r10bio *r10b = &on_stack.r10_bio; int slot = 0; int idx = 0; - struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec; + struct page **pages; + + /* reshape IOs share pages from .devs[0].bio */ + pages = get_resync_pages(r10_bio->devs[0].bio)->pages; r10b->sector = r10_bio->sector; __raid10_find_phys(&conf->prev, r10b); @@ -4682,7 +4685,7 @@ static int handle_reshape_read_error(struct mddev *mddev, success = sync_page_io(rdev, addr, s << 9, - bvec[idx].bv_page, + pages[idx], REQ_OP_READ, 0, false); rdev_dec_pending(rdev, mddev); rcu_read_lock(); -- cgit v1.2.3 From 6f287ca6046edd34ed83aafb7f9033c9c2e809e2 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 6 Apr 2017 09:12:18 +0800 Subject: md/raid10: reset the 'first' at the end of loop We need to set "first = 0" at the end of the rdev_for_each loop, so that we get the array's min_offset_diff correctly; otherwise min_offset_diff just reflects the last rdev's offset diff.
Suggested-by: NeilBrown Signed-off-by: Guoqing Jiang Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0f13d57ef646..e055ec94b9a8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3769,6 +3769,7 @@ static int raid10_run(struct mddev *mddev) if (blk_queue_discard(bdev_get_queue(rdev->bdev))) discard_supported = true; + first = 0; } if (mddev->queue) { @@ -4172,6 +4173,7 @@ static int raid10_start_reshape(struct mddev *mddev) if (first || diff < min_offset_diff) min_offset_diff = diff; } + first = 0; } if (max(before_length, after_length) > min_offset_diff) -- cgit v1.2.3 From fc9977dd069e4f82fcacb262652117c488647319 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Apr 2017 14:05:51 +1000 Subject: md/raid10: simplify the splitting of requests. raid10 splits requests in two different ways for two different reasons. First, bio_split() is used to ensure the bio fits with a chunk. Second, multiple r10bio structures are allocated to represent the different sections that need to go to different devices, to avoid known bad blocks. This can be simplified to just use bio_split() once, and not to use multiple r10bios. We delay the split until we know a maximum bio size that can be handled with a single r10bio, and then split the bio and queue the remainder for later handling. As with raid1, we allocate a new bio_set to help with the splitting. It is not correct to use fs_bio_set in a device driver. 
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 164 ++++++++++++++++------------------------------------ drivers/md/raid10.h | 1 + 2 files changed, 51 insertions(+), 114 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e055ec94b9a8..41845bae67be 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1127,7 +1127,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, struct bio *read_bio; const int op = bio_op(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); - int sectors_handled; int max_sectors; sector_t sectors; struct md_rdev *rdev; @@ -1140,7 +1139,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, */ wait_barrier(conf); - sectors = bio_sectors(bio); + sectors = r10_bio->sectors; while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio->bi_iter.bi_sector < conf->reshape_progress && bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { @@ -1157,17 +1156,23 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, wait_barrier(conf); } -read_again: rdev = read_balance(conf, r10_bio, &max_sectors); if (!rdev) { raid_end_bio_io(r10_bio); return; } + if (max_sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, max_sectors, + GFP_NOIO, conf->bio_split); + bio_chain(split, bio); + generic_make_request(bio); + bio = split; + r10_bio->master_bio = bio; + r10_bio->sectors = max_sectors; + } slot = r10_bio->read_slot; read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, - max_sectors); r10_bio->devs[slot].bio = read_bio; r10_bio->devs[slot].rdev = rdev; @@ -1186,40 +1191,13 @@ read_again: trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), read_bio, disk_devt(mddev->gendisk), r10_bio->sector); - if (max_sectors < r10_bio->sectors) { - /* - * Could not read all from this device, so we will need another - * 
r10_bio. - */ - sectors_handled = (r10_bio->sector + max_sectors - - bio->bi_iter.bi_sector); - r10_bio->sectors = max_sectors; - inc_pending(conf); - bio_inc_remaining(bio); - /* - * Cannot call generic_make_request directly as that will be - * queued in __generic_make_request and subsequent - * mempool_alloc might block waiting for it. so hand bio over - * to raid10d. - */ - reschedule_retry(r10_bio); - - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = bio_sectors(bio) - sectors_handled; - r10_bio->state = 0; - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled; - goto read_again; - } else - generic_make_request(read_bio); + generic_make_request(read_bio); return; } static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, struct bio *bio, bool replacement, - int n_copy, int max_sectors) + int n_copy) { const int op = bio_op(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); @@ -1243,7 +1221,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, rdev = conf->mirrors[devnum].rdev; mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); if (replacement) r10_bio->devs[n_copy].repl_bio = mbio; else @@ -1294,7 +1271,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int i; struct md_rdev *blocked_rdev; sector_t sectors; - int sectors_handled; int max_sectors; md_write_start(mddev, bio); @@ -1306,7 +1282,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, */ wait_barrier(conf); - sectors = bio_sectors(bio); + sectors = r10_bio->sectors; while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio->bi_iter.bi_sector < conf->reshape_progress && bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { @@ -1476,44 +1452,29 @@ retry_write: if (max_sectors < r10_bio->sectors) r10_bio->sectors = 
max_sectors; - sectors_handled = r10_bio->sector + max_sectors - - bio->bi_iter.bi_sector; + + if (r10_bio->sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, r10_bio->sectors, + GFP_NOIO, conf->bio_split); + bio_chain(split, bio); + generic_make_request(bio); + bio = split; + r10_bio->master_bio = bio; + } atomic_set(&r10_bio->remaining, 1); bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); for (i = 0; i < conf->copies; i++) { if (r10_bio->devs[i].bio) - raid10_write_one_disk(mddev, r10_bio, bio, false, - i, max_sectors); + raid10_write_one_disk(mddev, r10_bio, bio, false, i); if (r10_bio->devs[i].repl_bio) - raid10_write_one_disk(mddev, r10_bio, bio, true, - i, max_sectors); - } - - /* Don't remove the bias on 'remaining' (one_write_done) until - * after checking if we need to go around again. - */ - - if (sectors_handled < bio_sectors(bio)) { - /* We need another r10_bio and it needs to be counted */ - inc_pending(conf); - bio_inc_remaining(bio); - one_write_done(r10_bio); - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = bio_sectors(bio) - sectors_handled; - - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled; - r10_bio->state = 0; - goto retry_write; + raid10_write_one_disk(mddev, r10_bio, bio, true, i); } one_write_done(r10_bio); } -static void __make_request(struct mddev *mddev, struct bio *bio) +static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) { struct r10conf *conf = mddev->private; struct r10bio *r10_bio; @@ -1521,7 +1482,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio) r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; - r10_bio->sectors = bio_sectors(bio); + r10_bio->sectors = sectors; r10_bio->mddev = mddev; r10_bio->sector = bio->bi_iter.bi_sector; @@ -1538,54 +1499,26 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) 
struct r10conf *conf = mddev->private; sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); int chunk_sects = chunk_mask + 1; - - struct bio *split; + int sectors = bio_sectors(bio); if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } - do { - - /* - * If this request crosses a chunk boundary, we need to split - * it. - */ - if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + - bio_sectors(bio) > chunk_sects - && (conf->geo.near_copies < conf->geo.raid_disks - || conf->prev.near_copies < - conf->prev.raid_disks))) { - split = bio_split(bio, chunk_sects - - (bio->bi_iter.bi_sector & - (chunk_sects - 1)), - GFP_NOIO, fs_bio_set); - bio_chain(split, bio); - } else { - split = bio; - } - - /* - * If a bio is splitted, the first part of bio will pass - * barrier but the bio is queued in current->bio_list (see - * generic_make_request). If there is a raise_barrier() called - * here, the second part of bio can't pass barrier. But since - * the first part bio isn't dispatched to underlaying disks - * yet, the barrier is never released, hence raise_barrier will - * alays wait. We have a deadlock. - * Note, this only happens in read path. For write path, the - * first part of bio is dispatched in a schedule() call - * (because of blk plug) or offloaded to raid10d. - * Quitting from the function immediately can change the bio - * order queued in bio_list and avoid the deadlock. - */ - __make_request(mddev, split); - if (split != bio && bio_data_dir(bio) == READ) { - generic_make_request(bio); - break; - } - } while (split != bio); + /* + * If this request crosses a chunk boundary, we need to split + * it. 
+ */ + if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + + sectors > chunk_sects + && (conf->geo.near_copies < conf->geo.raid_disks + || conf->prev.near_copies < + conf->prev.raid_disks))) + sectors = chunk_sects - + (bio->bi_iter.bi_sector & + (chunk_sects - 1)); + __make_request(mddev, bio, sectors); /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); @@ -2873,13 +2806,8 @@ static void raid10d(struct md_thread *thread) recovery_request_write(mddev, r10_bio); else if (test_bit(R10BIO_ReadError, &r10_bio->state)) handle_read_error(mddev, r10_bio); - else { - /* just a partial read to be scheduled from a - * separate context - */ - int slot = r10_bio->read_slot; - generic_make_request(r10_bio->devs[slot].bio); - } + else + WARN_ON_ONCE(1); cond_resched(); if (mddev->sb_flags & ~(1<r10bio_pool) goto out; + conf->bio_split = bioset_create(BIO_POOL_SIZE, 0); + if (!conf->bio_split) + goto out; + calc_sectors(conf, mddev->dev_sectors); if (mddev->reshape_position == MaxSector) { conf->prev = conf->geo; @@ -3689,6 +3621,8 @@ static struct r10conf *setup_conf(struct mddev *mddev) mempool_destroy(conf->r10bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); + if (conf->bio_split) + bioset_free(conf->bio_split); kfree(conf); } return ERR_PTR(err); @@ -3899,6 +3833,8 @@ static void raid10_free(struct mddev *mddev, void *priv) kfree(conf->mirrors); kfree(conf->mirrors_old); kfree(conf->mirrors_new); + if (conf->bio_split) + bioset_free(conf->bio_split); kfree(conf); } diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 3162615e57bd..735ce1a3d260 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -82,6 +82,7 @@ struct r10conf { mempool_t *r10bio_pool; mempool_t *r10buf_pool; struct page *tmppage; + struct bio_set *bio_split; /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. 
-- cgit v1.2.3 From 545250f2480911f053b092d4229d9f83a9dff222 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Apr 2017 14:05:51 +1000 Subject: md/raid10: simplify handle_read_error() handle_read_error() duplicates a lot of the work that raid10_read_request() does, so it makes sense to just use that function. handle_read_error() relies on the same r10bio being re-used so that, in the case of a read-only array, setting IO_BLOCKED in r10bio->devs[].bio ensures read_balance() won't re-use that device. So when called from raid10_make_request() we clear that array, but not when called from handle_read_error(). Two parts of handle_read_error() that need to be preserved are the warning messages it prints, so they are conditionally added to raid10_read_request(). If the failing rdev can be found, messages are printed. Otherwise they aren't. Note that as rdev_dec_pending() has already been called on the failing rdev, we need to use rcu_read_lock() to get a new reference from the conf. We only use this to get the name of the failing block device. With this change, we no longer need inc_pending(). Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 122 ++++++++++++++++++++-------------------------------- 1 file changed, 47 insertions(+), 75 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 41845bae67be..4167091eff9a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1008,15 +1008,6 @@ static void wait_barrier(struct r10conf *conf) spin_unlock_irq(&conf->resync_lock); } -static void inc_pending(struct r10conf *conf) -{ - /* The current request requires multiple r10_bio, so - * we need to increment the pending count.
- */ - WARN_ON(!atomic_read(&conf->nr_pending)); - atomic_inc(&conf->nr_pending); -} - static void allow_barrier(struct r10conf *conf) { if ((atomic_dec_and_test(&conf->nr_pending)) || @@ -1130,8 +1121,38 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, int max_sectors; sector_t sectors; struct md_rdev *rdev; - int slot; + char b[BDEVNAME_SIZE]; + int slot = r10_bio->read_slot; + struct md_rdev *err_rdev = NULL; + gfp_t gfp = GFP_NOIO; + + if (r10_bio->devs[slot].rdev) { + /* + * This is an error retry, but we cannot + * safely dereference the rdev in the r10_bio, + * we must use the one in conf. + * If it has already been disconnected (unlikely) + * we lose the device name in error messages. + */ + int disk; + /* + * As we are blocking raid10, it is a little safer to + * use __GFP_HIGH. + */ + gfp = GFP_NOIO | __GFP_HIGH; + rcu_read_lock(); + disk = r10_bio->devs[slot].devnum; + err_rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (err_rdev) + bdevname(err_rdev->bdev, b); + else { + strcpy(b, "???"); + /* This never gets dereferenced */ + err_rdev = r10_bio->devs[slot].rdev; + } + rcu_read_unlock(); + } /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. 
@@ -1158,12 +1179,22 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, rdev = read_balance(conf, r10_bio, &max_sectors); if (!rdev) { + if (err_rdev) { + pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", + mdname(mddev), b, + (unsigned long long)r10_bio->sector); + } raid_end_bio_io(r10_bio); return; } + if (err_rdev) + pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n", + mdname(mddev), + bdevname(rdev->bdev, b), + (unsigned long long)r10_bio->sector); if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, - GFP_NOIO, conf->bio_split); + gfp, conf->bio_split); bio_chain(split, bio); generic_make_request(bio); bio = split; @@ -1172,7 +1203,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } slot = r10_bio->read_slot; - read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + read_bio = bio_clone_fast(bio, gfp, mddev->bio_set); r10_bio->devs[slot].bio = read_bio; r10_bio->devs[slot].rdev = rdev; @@ -1487,6 +1518,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->mddev = mddev; r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies); if (bio_data_dir(bio) == READ) raid10_read_request(mddev, bio, r10_bio); @@ -2556,9 +2588,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) struct bio *bio; struct r10conf *conf = mddev->private; struct md_rdev *rdev = r10_bio->devs[slot].rdev; - char b[BDEVNAME_SIZE]; - unsigned long do_sync; - int max_sectors; dev_t bio_dev; sector_t bio_last_sector; @@ -2571,7 +2600,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) * frozen. 
*/ bio = r10_bio->devs[slot].bio; - bdevname(bio->bi_bdev, b); bio_dev = bio->bi_bdev->bd_dev; bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors; bio_put(bio); @@ -2587,65 +2615,9 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) md_error(mddev, rdev); rdev_dec_pending(rdev, mddev); - -read_more: - rdev = read_balance(conf, r10_bio, &max_sectors); - if (rdev == NULL) { - pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", - mdname(mddev), b, - (unsigned long long)r10_bio->sector); - raid_end_bio_io(r10_bio); - return; - } - - do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC); - slot = r10_bio->read_slot; - pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n", - mdname(mddev), - bdevname(rdev->bdev, b), - (unsigned long long)r10_bio->sector); - bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set); - bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); - r10_bio->devs[slot].bio = bio; - r10_bio->devs[slot].rdev = rdev; - bio->bi_iter.bi_sector = r10_bio->devs[slot].addr - + choose_data_offset(r10_bio, rdev); - bio->bi_bdev = rdev->bdev; - bio_set_op_attrs(bio, REQ_OP_READ, do_sync); - if (test_bit(FailFast, &rdev->flags) && - test_bit(R10BIO_FailFast, &r10_bio->state)) - bio->bi_opf |= MD_FAILFAST; - bio->bi_private = r10_bio; - bio->bi_end_io = raid10_end_read_request; - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), - bio, bio_dev, - bio_last_sector - r10_bio->sectors); - - if (max_sectors < r10_bio->sectors) { - /* Drat - have to split this up more */ - struct bio *mbio = r10_bio->master_bio; - int sectors_handled = - r10_bio->sector + max_sectors - - mbio->bi_iter.bi_sector; - r10_bio->sectors = max_sectors; - bio_inc_remaining(mbio); - inc_pending(conf); - generic_make_request(bio); - - r10_bio = mempool_alloc(conf->r10bio_pool, - GFP_NOIO); - r10_bio->master_bio = mbio; - r10_bio->sectors = 
bio_sectors(mbio) - sectors_handled; - r10_bio->state = 0; - set_bit(R10BIO_ReadError, - &r10_bio->state); - r10_bio->mddev = mddev; - r10_bio->sector = mbio->bi_iter.bi_sector - + sectors_handled; - - goto read_more; - } else - generic_make_request(bio); + allow_barrier(conf); + r10_bio->state = 0; + raid10_read_request(mddev, r10_bio->master_bio, r10_bio); } static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) -- cgit v1.2.3 From cf25ae78fc50010f66b9be945017796da34c434d Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 17 Apr 2017 17:11:05 +0800 Subject: md/raid10: wake up frozen array in handle_write_completed Since nr_queued is changed, we need to call wake_up here if the array is already frozen and waiting for condition "nr_pending == nr_queued + extra" to be true. Commit 824e47daddbf ("RAID1: avoid unnecessary spin locks in I/O barrier code") has already added the wake_up for raid1. Signed-off-by: Guoqing Jiang Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 4167091eff9a..acb3f46f522f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2704,6 +2704,11 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) list_add(&r10_bio->retry_list, &conf->bio_end_io_list); conf->nr_queued++; spin_unlock_irq(&conf->device_lock); + /* + * In case freeze_array() is waiting for condition + * nr_pending == nr_queued + extra to be true. + */ + wake_up(&conf->wait_barrier); md_wakeup_thread(conf->mddev->thread); } else { if (test_bit(R10BIO_WriteError, -- cgit v1.2.3 From 296617581eac713b3fda588216ae6d16d1e76dd5 Mon Sep 17 00:00:00 2001 From: Lidong Zhong Date: Fri, 21 Apr 2017 15:21:38 +0800 Subject: md/raid1/10: remove unused queue A queue is declared and obtained from the disk of the array, but it's not used anywhere.
So remove it from the source. Signed-off-by: Lidong Zhong Acked-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 3 --- drivers/md/raid10.c | 2 -- 2 files changed, 5 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 57611f43ed6c..14a9d36b25b8 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2961,7 +2961,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) err = -EINVAL; spin_lock_init(&conf->device_lock); rdev_for_each(rdev, mddev) { - struct request_queue *q; int disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) @@ -2974,8 +2973,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (disk->rdev) goto abort; disk->rdev = rdev; - q = bdev_get_queue(rdev->bdev); - disk->head_position = 0; disk->seq_start = MaxSector; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index acb3f46f522f..5de951bcd24c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3644,7 +3644,6 @@ static int raid10_run(struct mddev *mddev) rdev_for_each(rdev, mddev) { long long diff; - struct request_queue *q; disk_idx = rdev->raid_disk; if (disk_idx < 0) @@ -3663,7 +3662,6 @@ static int raid10_run(struct mddev *mddev) goto out_free_conf; disk->rdev = rdev; } - q = bdev_get_queue(rdev->bdev); diff = (rdev->new_data_offset - rdev->data_offset); if (!mddev->reshape_backwards) diff = -diff; -- cgit v1.2.3 From e5bc9c3c5432f5531a58e6fdd9f6c6587f2137b3 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 24 Apr 2017 15:58:04 +0800 Subject: md: clear WantReplacement once disk is removed We can clear the 'WantReplacement' flag directly whether or not its replacement exists, since the semantics are the same as before. Also, since the disk is removed from the array, it is straightforward to remove the 'WantReplacement' flag, and the comments in raid10/5 can be removed as well.
Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 6 +++--- drivers/md/raid10.c | 8 ++------ drivers/md/raid5.c | 9 +++------ 3 files changed, 8 insertions(+), 15 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 14a9d36b25b8..70a596c10306 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1831,9 +1831,9 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) p->rdev = repl; conf->mirrors[conf->raid_disks + number].rdev = NULL; unfreeze_array(conf); - clear_bit(WantReplacement, &rdev->flags); - } else - clear_bit(WantReplacement, &rdev->flags); + } + + clear_bit(WantReplacement, &rdev->flags); err = md_integrity_register(mddev); } abort: diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5de951bcd24c..2883b720a265 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1874,13 +1874,9 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) * but will never see neither -- if they are careful. 
*/ p->replacement = NULL; - clear_bit(WantReplacement, &rdev->flags); - } else - /* We might have just remove the Replacement as faulty - * Clear the flag just in case - */ - clear_bit(WantReplacement, &rdev->flags); + } + clear_bit(WantReplacement, &rdev->flags); err = md_integrity_register(mddev); abort: diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 356cd9c7c753..3d971e5a1b0e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7603,15 +7603,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) * but will never see neither - if they are careful */ p->replacement = NULL; - clear_bit(WantReplacement, &rdev->flags); if (!err) err = log_modify(conf, p->rdev, true); - } else - /* We might have just removed the Replacement as faulty- - * clear the bit just in case - */ - clear_bit(WantReplacement, &rdev->flags); + } + + clear_bit(WantReplacement, &rdev->flags); abort: print_raid5_conf(conf); -- cgit v1.2.3 From b506335e5d2b4ec687dde392a3bdbf7601778f1d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 1 May 2017 12:15:07 -0700 Subject: md/raid10: skip spare disk as 'first' disk Commit 6f287ca(md/raid10: reset the 'first' at the end of loop) ignores a case in reshape, the first rdev could be a spare disk, which shouldn't be accounted as the first disk since it doesn't include the offset info. Fix: 6f287ca(md/raid10: reset the 'first' at the end of loop) Cc: Guoqing Jiang Cc: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2883b720a265..cce23be9cc93 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4079,8 +4079,8 @@ static int raid10_start_reshape(struct mddev *mddev) diff = 0; if (first || diff < min_offset_diff) min_offset_diff = diff; + first = 0; } - first = 0; } if (max(before_length, after_length) > min_offset_diff) -- cgit v1.2.3