Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--  drivers/md/raid1.c  164
1 file changed, 69 insertions, 95 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f80f1af61ce7..4517f06c41ba 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -255,9 +255,10 @@ static void call_bio_endio(struct r1bio *r1_bio)
done = 1;
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio->bi_error = -EIO;
+
if (done) {
- bio_endio(bio, 0);
+ bio_endio(bio);
/*
* Wake up any possible resync thread that waits for the device
* to go idle.
@@ -312,9 +313,9 @@ static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
return mirror;
}
-static void raid1_end_read_request(struct bio *bio, int error)
+static void raid1_end_read_request(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ int uptodate = !bio->bi_error;
struct r1bio *r1_bio = bio->bi_private;
int mirror;
struct r1conf *conf = r1_bio->mddev->private;
@@ -336,7 +337,7 @@ static void raid1_end_read_request(struct bio *bio, int error)
spin_lock_irqsave(&conf->device_lock, flags);
if (r1_bio->mddev->degraded == conf->raid_disks ||
(r1_bio->mddev->degraded == conf->raid_disks-1 &&
- !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
+ test_bit(In_sync, &conf->mirrors[mirror].rdev->flags)))
uptodate = 1;
spin_unlock_irqrestore(&conf->device_lock, flags);
}
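
For context: with this series, bio completion callbacks take only the bio, and success or failure is carried in bio->bi_error (0 or a negative errno) rather than the old BIO_UPTODATE flag plus error argument. A minimal sketch of a callback written against that convention; my_io_ctx and my_end_io are illustrative names, not part of this patch:

	#include <linux/bio.h>
	#include <linux/atomic.h>
	#include <linux/kernel.h>

	struct my_io_ctx {
		atomic_t pending;
		int status;		/* first error seen; 0 if every bio succeeded */
	};

	static void my_end_io(struct bio *bio)
	{
		struct my_io_ctx *ctx = bio->bi_private;

		if (bio->bi_error && !ctx->status)
			ctx->status = bio->bi_error;	/* e.g. -EIO */
		if (atomic_dec_and_test(&ctx->pending))
			pr_debug("all bios done, status %d\n", ctx->status);
		bio_put(bio);
	}
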
@@ -397,9 +398,8 @@ static void r1_bio_write_done(struct r1bio *r1_bio)
}
}
-static void raid1_end_write_request(struct bio *bio, int error)
+static void raid1_end_write_request(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r1bio *r1_bio = bio->bi_private;
int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
struct r1conf *conf = r1_bio->mddev->private;
@@ -410,7 +410,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
/*
* 'one mirror IO has finished' event handler:
*/
- if (!uptodate) {
+ if (bio->bi_error) {
set_bit(WriteErrorSeen,
&conf->mirrors[mirror].rdev->flags);
if (!test_and_set_bit(WantReplacement,
@@ -541,7 +541,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
if ((conf->mddev->recovery_cp < this_sector + sectors) ||
(mddev_is_clustered(conf->mddev) &&
- md_cluster_ops->area_resyncing(conf->mddev, this_sector,
+ md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
this_sector + sectors)))
choose_first = 1;
else
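
The cluster hook now takes a READ/WRITE direction as its second argument, so the resync window can be policed differently for reads and writes. The prototype below is what these call sites imply (the authoritative definition lives in drivers/md/md-cluster.h of the same tree), and the helper is an illustrative sketch, not code from the patch:

	/*
	 * In struct md_cluster_operations, as implied by the callers above:
	 *
	 *	int (*area_resyncing)(struct mddev *mddev, int direction,
	 *			      sector_t lo, sector_t hi);
	 */

	/* Usage sketch: should a read at 'sector' wait for cluster resync? */
	static bool read_overlaps_resync(struct mddev *mddev,
					 sector_t sector, int sectors)
	{
		return mddev_is_clustered(mddev) &&
		       md_cluster_ops->area_resyncing(mddev, READ, sector,
						      sector + sectors);
	}
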
@@ -557,7 +557,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL
- || test_bit(Unmerged, &rdev->flags)
|| test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
@@ -708,38 +707,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk;
}
-static int raid1_mergeable_bvec(struct mddev *mddev,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- struct r1conf *conf = mddev->private;
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- int max = biovec->bv_len;
-
- if (mddev->merge_check_needed) {
- int disk;
- rcu_read_lock();
- for (disk = 0; disk < conf->raid_disks * 2; disk++) {
- struct md_rdev *rdev = rcu_dereference(
- conf->mirrors[disk].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- struct request_queue *q =
- bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn) {
- bvm->bi_sector = sector +
- rdev->data_offset;
- bvm->bi_bdev = rdev->bdev;
- max = min(max, q->merge_bvec_fn(
- q, bvm, biovec));
- }
- }
- }
- rcu_read_unlock();
- }
- return max;
-
-}
-
static int raid1_congested(struct mddev *mddev, int bits)
{
struct r1conf *conf = mddev->private;
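
raid1_mergeable_bvec (and the Unmerged handling removed further down) can go because the block core now splits bios that turn out to be too large for a lower device, instead of asking drivers up front how much may be merged. A hedged sketch of how a make_request function of this kernel generation typically hands that job to the core; my_make_request is an illustrative name:

	static void my_make_request(struct request_queue *q, struct bio *bio)
	{
		/* Let the core split an oversized bio into pieces each
		 * lower device can accept; this replaces merge_bvec_fn. */
		blk_queue_split(q, &bio, q->bio_split);

		/* ... normal handling of the (possibly split) bio ... */
	}
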
@@ -793,7 +760,7 @@ static void flush_pending_writes(struct r1conf *conf)
if (unlikely((bio->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
/* Just ignore it */
- bio_endio(bio, 0);
+ bio_endio(bio);
else
generic_make_request(bio);
bio = next;
@@ -1068,7 +1035,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
if (unlikely((bio->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
/* Just ignore it */
- bio_endio(bio, 0);
+ bio_endio(bio);
else
generic_make_request(bio);
bio = next;
@@ -1111,7 +1078,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
((bio_end_sector(bio) > mddev->suspend_lo &&
bio->bi_iter.bi_sector < mddev->suspend_hi) ||
(mddev_is_clustered(mddev) &&
- md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
+ md_cluster_ops->area_resyncing(mddev, WRITE,
+ bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
/* As the suspend_* range is controlled by
* userspace, we want an interruptible
* wait.
@@ -1124,7 +1092,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
if (bio_end_sector(bio) <= mddev->suspend_lo ||
bio->bi_iter.bi_sector >= mddev->suspend_hi ||
(mddev_is_clustered(mddev) &&
- !md_cluster_ops->area_resyncing(mddev,
+ !md_cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio))))
break;
schedule();
@@ -1157,7 +1125,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
* non-zero, then it is the number of not-completed requests.
*/
bio->bi_phys_segments = 0;
- clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+ bio_clear_flag(bio, BIO_SEG_VALID);
if (rw == READ) {
/*
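
bio_clear_flag() is one of the small accessors that replace open-coded test_bit/clear_bit on bio->bi_flags, needed now that the upper bits of bi_flags carry other state. Roughly what include/linux/bio.h of this generation provides (exact bit layout may differ):

	static inline bool bio_flagged(struct bio *bio, unsigned int bit)
	{
		return (bio->bi_flags & (1U << bit)) != 0;
	}

	static inline void bio_set_flag(struct bio *bio, unsigned int bit)
	{
		bio->bi_flags |= (1U << bit);
	}

	static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
	{
		bio->bi_flags &= ~(1U << bit);
	}
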
@@ -1268,8 +1236,7 @@ read_again:
break;
}
r1_bio->bios[i] = NULL;
- if (!rdev || test_bit(Faulty, &rdev->flags)
- || test_bit(Unmerged, &rdev->flags)) {
+ if (!rdev || test_bit(Faulty, &rdev->flags)) {
if (i < conf->raid_disks)
set_bit(R1BIO_Degraded, &r1_bio->state);
continue;
@@ -1475,6 +1442,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
{
char b[BDEVNAME_SIZE];
struct r1conf *conf = mddev->private;
+ unsigned long flags;
/*
* If it is not operational, then we have already marked it as dead
@@ -1494,19 +1462,19 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
return;
}
set_bit(Blocked, &rdev->flags);
+ spin_lock_irqsave(&conf->device_lock, flags);
if (test_and_clear_bit(In_sync, &rdev->flags)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
set_bit(Faulty, &rdev->flags);
- spin_unlock_irqrestore(&conf->device_lock, flags);
} else
set_bit(Faulty, &rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery is running, make sure it aborts.
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
printk(KERN_ALERT
"md/raid1:%s: Disk failure on %s, disabling device.\n"
"md/raid1:%s: Operation continuing on %d devices.\n",
@@ -1567,7 +1535,10 @@ static int raid1_spare_active(struct mddev *mddev)
* Find all failed disks within the RAID1 configuration
* and mark them readable.
* Called under mddev lock, so rcu protection not needed.
+ * device_lock used to avoid races with raid1_end_read_request
+ * which expects 'In_sync' flags and ->degraded to be consistent.
*/
+ spin_lock_irqsave(&conf->device_lock, flags);
for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = conf->mirrors[i].rdev;
struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
@@ -1598,7 +1569,6 @@ static int raid1_spare_active(struct mddev *mddev)
sysfs_notify_dirent_safe(rdev->sysfs_state);
}
}
- spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded -= count;
spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -1614,7 +1584,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
struct raid1_info *p;
int first = 0;
int last = conf->raid_disks - 1;
- struct request_queue *q = bdev_get_queue(rdev->bdev);
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
@@ -1622,11 +1591,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
- if (q->merge_bvec_fn) {
- set_bit(Unmerged, &rdev->flags);
- mddev->merge_check_needed = 1;
- }
-
for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors+mirror;
if (!p->rdev) {
@@ -1658,19 +1622,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
break;
}
}
- if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
- /* Some requests might not have seen this new
- * merge_bvec_fn. We must wait for them to complete
- * before merging the device fully.
- * First we make sure any code which has tested
- * our function has submitted the request, then
- * we wait for all outstanding requests to complete.
- */
- synchronize_sched();
- freeze_array(conf, 0);
- unfreeze_array(conf);
- clear_bit(Unmerged, &rdev->flags);
- }
md_integrity_add_rdev(rdev, mddev);
if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
@@ -1734,7 +1685,7 @@ abort:
return err;
}
-static void end_sync_read(struct bio *bio, int error)
+static void end_sync_read(struct bio *bio)
{
struct r1bio *r1_bio = bio->bi_private;
@@ -1745,16 +1696,16 @@ static void end_sync_read(struct bio *bio, int error)
* or re-read if the read failed.
* We don't do much here, just schedule handling by raid1d
*/
- if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ if (!bio->bi_error)
set_bit(R1BIO_Uptodate, &r1_bio->state);
if (atomic_dec_and_test(&r1_bio->remaining))
reschedule_retry(r1_bio);
}
-static void end_sync_write(struct bio *bio, int error)
+static void end_sync_write(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ int uptodate = !bio->bi_error;
struct r1bio *r1_bio = bio->bi_private;
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
@@ -1941,7 +1892,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
idx ++;
}
set_bit(R1BIO_Uptodate, &r1_bio->state);
- set_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio->bi_error = 0;
return 1;
}
@@ -1965,15 +1916,14 @@ static void process_checks(struct r1bio *r1_bio)
for (i = 0; i < conf->raid_disks * 2; i++) {
int j;
int size;
- int uptodate;
+ int error;
struct bio *b = r1_bio->bios[i];
if (b->bi_end_io != end_sync_read)
continue;
- /* fixup the bio for reuse, but preserve BIO_UPTODATE */
- uptodate = test_bit(BIO_UPTODATE, &b->bi_flags);
+ /* fixup the bio for reuse, but preserve errno */
+ error = b->bi_error;
bio_reset(b);
- if (!uptodate)
- clear_bit(BIO_UPTODATE, &b->bi_flags);
+ b->bi_error = error;
b->bi_vcnt = vcnt;
b->bi_iter.bi_size = r1_bio->sectors << 9;
b->bi_iter.bi_sector = r1_bio->sector +
@@ -1996,7 +1946,7 @@ static void process_checks(struct r1bio *r1_bio)
}
for (primary = 0; primary < conf->raid_disks * 2; primary++)
if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
- test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+ !r1_bio->bios[primary]->bi_error) {
r1_bio->bios[primary]->bi_end_io = NULL;
rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
break;
@@ -2006,14 +1956,14 @@ static void process_checks(struct r1bio *r1_bio)
int j;
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
- int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags);
+ int error = sbio->bi_error;
if (sbio->bi_end_io != end_sync_read)
continue;
- /* Now we can 'fixup' the BIO_UPTODATE flag */
- set_bit(BIO_UPTODATE, &sbio->bi_flags);
+ /* Now we can 'fixup' the error value */
+ sbio->bi_error = 0;
- if (uptodate) {
+ if (!error) {
for (j = vcnt; j-- ; ) {
struct page *p, *s;
p = pbio->bi_io_vec[j].bv_page;
@@ -2028,7 +1978,7 @@ static void process_checks(struct r1bio *r1_bio)
if (j >= 0)
atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
- && uptodate)) {
+ && !error)) {
/* No need to write to this device. */
sbio->bi_end_io = NULL;
rdev_dec_pending(conf->mirrors[i].rdev, mddev);
@@ -2269,11 +2219,11 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
struct bio *bio = r1_bio->bios[m];
if (bio->bi_end_io == NULL)
continue;
- if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+ if (!bio->bi_error &&
test_bit(R1BIO_MadeGood, &r1_bio->state)) {
rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
}
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+ if (bio->bi_error &&
test_bit(R1BIO_WriteError, &r1_bio->state)) {
if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
md_error(conf->mddev, rdev);
@@ -2286,6 +2236,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
{
int m;
+ bool fail = false;
for (m = 0; m < conf->raid_disks * 2 ; m++)
if (r1_bio->bios[m] == IO_MADE_GOOD) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2298,6 +2249,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
* narrow down and record precise write
* errors.
*/
+ fail = true;
if (!narrow_write_error(r1_bio, m)) {
md_error(conf->mddev,
conf->mirrors[m].rdev);
@@ -2309,7 +2261,13 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
}
if (test_bit(R1BIO_WriteError, &r1_bio->state))
close_write(r1_bio);
- raid_end_bio_io(r1_bio);
+ if (fail) {
+ spin_lock_irq(&conf->device_lock);
+ list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else
+ raid_end_bio_io(r1_bio);
}
static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
@@ -2415,6 +2373,23 @@ static void raid1d(struct md_thread *thread)
md_check_recovery(mddev);
+ if (!list_empty_careful(&conf->bio_end_io_list) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ LIST_HEAD(tmp);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ list_add(&tmp, &conf->bio_end_io_list);
+ list_del_init(&conf->bio_end_io_list);
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ while (!list_empty(&tmp)) {
+ r1_bio = list_first_entry(&tmp,
+ struct r1bio, retry_list);
+ list_del(&r1_bio->retry_list);
+ raid_end_bio_io(r1_bio);
+ }
+ }
+
blk_start_plug(&plug);
for (;;) {
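
The net effect of the new bio_end_io_list handling: a write that saw an error is not completed to the filesystem until the superblock update recording the failure (MD_CHANGE_DEVS plus MD_CHANGE_PENDING, set in error()) has gone out; raid1d then drains the parked r1bios. A condensed sketch of that drain, using list_splice_init() in place of the list_add/list_del_init pair above (equivalent behaviour):

	if (!list_empty_careful(&conf->bio_end_io_list) &&
	    !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
		LIST_HEAD(tmp);

		spin_lock_irqsave(&conf->device_lock, flags);
		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags))
			/* move every parked r1bio onto the private list */
			list_splice_init(&conf->bio_end_io_list, &tmp);
		spin_unlock_irqrestore(&conf->device_lock, flags);

		while (!list_empty(&tmp)) {
			struct r1bio *r1_bio = list_first_entry(&tmp,
						struct r1bio, retry_list);

			list_del(&r1_bio->retry_list);
			raid_end_bio_io(r1_bio);	/* finally end the user bio */
		}
	}
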
@@ -2712,7 +2687,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
/* remove last page from this bio */
bio->bi_vcnt--;
bio->bi_iter.bi_size -= len;
- __clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+ bio_clear_flag(bio, BIO_SEG_VALID);
}
goto bio_full;
}
@@ -2807,8 +2782,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
goto abort;
disk->rdev = rdev;
q = bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn)
- mddev->merge_check_needed = 1;
disk->head_position = 0;
disk->seq_start = MaxSector;
@@ -2816,6 +2789,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
INIT_LIST_HEAD(&conf->retry_list);
+ INIT_LIST_HEAD(&conf->bio_end_io_list);
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
@@ -3110,6 +3084,7 @@ static int raid1_reshape(struct mddev *mddev)
unfreeze_array(conf);
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -3173,7 +3148,6 @@ static struct md_personality raid1_personality =
.quiesce = raid1_quiesce,
.takeover = raid1_takeover,
.congested = raid1_congested,
- .mergeable_bvec = raid1_mergeable_bvec,
};
static int __init raid_init(void)