From a40687ff73a5b14909d6aa522f7d778b158911c5 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Wed, 13 Aug 2014 09:48:45 +1000
Subject: md/raid5: avoid livelock caused by non-aligned writes.

If a stripe in a raid6 array receives a write to each data block while
the array is degraded, and if any of these writes to a missing device
is not page-aligned, then a livelock happens.

In this case the P and Q blocks need to be read so that the part of
the missing block which is *not* being updated by the write can be
constructed.  Due to a logic error, these blocks are not loaded, so
the update cannot proceed and the stripe is 'handled' repeatedly in an
infinite loop.

This bug is unlikely to be hit, as most writes are page-aligned.
However, as it can lead to a livelock, the fix is suitable for -stable.
The bug was introduced in 3.16.

Cc: stable@vger.kernel.org (v3.16)
Fixes: 67f455486d2ea20b2d94d6adf5b9b783d079e321
Signed-off-by: NeilBrown
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6234b2e84587..6b2d615d1094 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2922,7 +2922,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
       (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
       !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
      (sh->raid_conf->level == 6 && s->failed && s->to_write &&
-      s->to_write < sh->raid_conf->raid_disks - 2 &&
+      s->to_write - s->non_overwrite < sh->raid_conf->raid_disks - 2 &&
       (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
 	/* we would like to get this block, possibly by computing it,
 	 * otherwise read it if the backing disk is insync
-- 
cgit v1.2.3
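For illustration, a minimal userspace sketch of the corrected condition above, assuming a degraded RAID6 stripe with one non-overwrite (non-page-aligned) write; the struct and helper name are hypothetical, and only the counters mirror the stripe_head_state fields used in fetch_block().

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical, simplified stand-in for the stripe_head_state counters. */
struct state {
	int failed;         /* failed devices affecting this stripe        */
	int to_write;       /* data blocks with a pending write            */
	int non_overwrite;  /* of those, writes that only partially update */
};

/* RAID6: P and Q must be loaded unless every written block is a full overwrite. */
static bool must_fetch_for_degraded_write(const struct state *s, int raid_disks)
{
	int full_overwrites = s->to_write - s->non_overwrite;

	return s->failed && s->to_write &&
	       full_overwrites < raid_disks - 2;
}

int main(void)
{
	/* 4+2 array, one device missing, writes to all 4 data blocks,
	 * one of them not page-aligned (a partial, non-overwrite write). */
	struct state s = { .failed = 1, .to_write = 4, .non_overwrite = 1 };

	/* The old test compared to_write (4) against raid_disks - 2 (4) and
	 * concluded nothing needed to be read, hence the livelock. */
	printf("fetch needed: %s\n",
	       must_fetch_for_degraded_write(&s, 6) ? "yes" : "no");
	return 0;
}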
From 9c4bdf697c39805078392d5ddbbba5ae5680e0dd Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Wed, 13 Aug 2014 09:57:07 +1000
Subject: md/raid6: avoid data corruption during recovery of double-degraded RAID6

During recovery of a double-degraded RAID6 it is possible for some
blocks not to be recovered properly, leading to corruption.

If a write happens to one block in a stripe that would be written to a
missing device, and at the same time that stripe is recovering data to
the other missing device, then that recovered data may not be written.

This patch skips, in the double-degraded case, an optimisation that is
only safe for single-degraded arrays.

The bug was introduced in 2.6.32, and the fix is suitable for any
kernel since then.  In an older kernel with separate handle_stripe5()
and handle_stripe6() functions, the patch must change handle_stripe6().

Cc: stable@vger.kernel.org (2.6.32+)
Fixes: 6c0069c0ae9659e3a91b68eaed06a5c6c37f45c8
Cc: Yuri Tikhonov
Cc: Dan Williams
Reported-by: "Manibalan P"
Tested-by: "Manibalan P"
Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1090423
Signed-off-by: NeilBrown
Acked-by: Dan Williams
---
 drivers/md/raid5.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6b2d615d1094..183588b11fc1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3817,6 +3817,8 @@ static void handle_stripe(struct stripe_head *sh)
 				set_bit(R5_Wantwrite, &dev->flags);
 				if (prexor)
 					continue;
+				if (s.failed > 1)
+					continue;
 				if (!test_bit(R5_Insync, &dev->flags) ||
 				    ((i == sh->pd_idx || i == sh->qd_idx)  &&
 				      s.failed == 0))
-- 
cgit v1.2.3

From ce0b0a46955d1bb389684a2605dbcaa990ba0154 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 18 Aug 2014 13:56:38 +1000
Subject: md/raid10: fix memory leak when reshaping a RAID10.

raid10 reshape clears unwanted bits from a bio's bi_flags using a
method which, while clumsy, worked until 3.10 when BIO_OWNS_VEC was
added.  Since then it has also been clearing that bit, but it
shouldn't.  This results in a memory leak.

So change to use the approved method of clearing unwanted bits.

As the leak can eventually consume all of memory, the fix is suitable
for -stable.

Fixes: a38352e0ac02dbbd4fa464dc22d1352b5fbd06fd
Cc: stable@vger.kernel.org (v3.10+)
Reported-by: mdraid.pkoch@dfgh.net (Peter Koch)
Signed-off-by: NeilBrown
---
 drivers/md/raid10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b08c18871323..d9073a10f2f2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4410,7 +4410,7 @@ read_more:
 	read_bio->bi_private = r10_bio;
 	read_bio->bi_end_io = end_sync_read;
 	read_bio->bi_rw = READ;
-	read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+	read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
 	read_bio->bi_flags |= 1 << BIO_UPTODATE;
 	read_bio->bi_vcnt = 0;
 	read_bio->bi_iter.bi_size = 0;
-- 
cgit v1.2.3
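A small userspace sketch of the mask arithmetic behind the one-line change above: the constants (BIO_POOL_BITS = 4, BIO_RESET_BITS = 13, BIO_OWNS_VEC = 13) are assumptions approximating the 3.16-era <linux/blk_types.h>, not taken from this patch, and are redefined locally so the example builds outside the kernel.

#include <stdio.h>

#define BITS_PER_LONG    (8 * (int)sizeof(unsigned long))
#define BIO_POOL_BITS    4                                  /* assumed value */
#define BIO_POOL_OFFSET  (BITS_PER_LONG - BIO_POOL_BITS)
#define BIO_POOL_MASK    (1UL << BIO_POOL_OFFSET)
#define BIO_RESET_BITS   13                                 /* assumed value */
#define BIO_OWNS_VEC     13                                 /* assumed value */

int main(void)
{
	unsigned long old_mask = ~(BIO_POOL_MASK - 1);      /* keeps only the top pool-index bits */
	unsigned long new_mask = (~0UL << BIO_RESET_BITS);  /* keeps every "preserved" flag bit   */

	/* The old mask drops BIO_OWNS_VEC, so freeing the bio no longer frees
	 * the bvec it owns: the leak described above.  The new mask keeps it. */
	printf("old mask keeps BIO_OWNS_VEC: %s\n",
	       (old_mask & (1UL << BIO_OWNS_VEC)) ? "yes" : "no");
	printf("new mask keeps BIO_OWNS_VEC: %s\n",
	       (new_mask & (1UL << BIO_OWNS_VEC)) ? "yes" : "no");
	return 0;
}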
From b39685526f46976bcd13aa08c82480092befa46c Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 18 Aug 2014 13:59:50 +1000
Subject: md/raid10: Fix memory leak when raid10 reshape completes.

When a raid10 array commences a resync/recovery/reshape it allocates
some buffer space.  When a resync or recovery completes the buffer
space is freed, but it is not freed when a reshape completes.  This
results in a small memory leak.

There is a subtle side-effect of this bug.  When a RAID10 is reshaped
to a larger array (more devices), the reshape is immediately followed
by a "resync" of the new space.  This "resync" will use the buffer
space which was allocated for "reshape".  This can cause problems
including a "BUG" in the SCSI layer.  So this is suitable for -stable.

Cc: stable@vger.kernel.org (v3.5+)
Fixes: 3ea7daa5d7fde47cd41f4d56c2deb949114da9d6
Signed-off-by: NeilBrown
---
 drivers/md/raid10.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d9073a10f2f2..a46124ecafc7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2953,6 +2953,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 	 */
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
 		end_reshape(conf);
+		close_sync(conf);
 		return 0;
 	}
 
-- 
cgit v1.2.3

From e337aead3aa127f083e64ad678a9e89defefcec5 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 18 Aug 2014 14:48:54 +1000
Subject: md/raid10: avoid memory leak on error path during reshape.

If raid10 reshape fails to find somewhere to read a block
from, it returns without freeing memory...

Signed-off-by: NeilBrown
---
 drivers/md/raid10.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a46124ecafc7..e5037e2aadd3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4399,6 +4399,7 @@ read_more:
 		 * on all the target devices.
 		 */
 		// FIXME
+		mempool_free(r10_bio, conf->r10buf_pool);
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		return sectors_done;
 	}
-- 
cgit v1.2.3

From cb8b12b5d81cf8522076f99a90bc3b795825c3b3 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 18 Aug 2014 14:38:45 +1000
Subject: md/raid10: always initialise ->state on newly allocated r10_bio

Most places which allocate an r10_bio zero the ->state; some don't.

As the r10_bio comes from a mempool, and the allocation function uses
kzalloc, it is often zero anyway.  But sometimes it isn't, and it is
best to be safe.

I only noticed this because of the bug fixed by an earlier patch where
the r10_bios allocated for a reshape were left around to be used by a
subsequent resync.  In that case the R10BIO_IsReshape flag caused
problems.

Signed-off-by: NeilBrown
---
 drivers/md/raid10.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e5037e2aadd3..6703751d87d7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3082,6 +3082,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 		}
 
 		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+		r10_bio->state = 0;
 		raise_barrier(conf, rb2 != NULL);
 		atomic_set(&r10_bio->remaining, 0);
 
@@ -3270,6 +3271,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 		if (sync_blocks < max_sync)
 			max_sync = sync_blocks;
 		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+		r10_bio->state = 0;
 
 		r10_bio->mddev = mddev;
 		atomic_set(&r10_bio->remaining, 0);
@@ -4385,6 +4387,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 read_more:
 	/* Now schedule reads for blocks from sector_nr to last */
 	r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+	r10_bio->state = 0;
 	raise_barrier(conf, sectors_done != 0);
 	atomic_set(&r10_bio->remaining, 0);
 	r10_bio->mddev = mddev;
-- 
cgit v1.2.3
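A small userspace analogy of why the explicit r10_bio->state = 0 matters: the pool, the struct and the IS_RESHAPE flag below are invented for the example (the kernel uses mempool_alloc() and the R10BIO_IsReshape bit), but the recycling behaviour is the same: a pooled object keeps whatever state its previous user left behind unless the new user clears it.

#include <stdio.h>

#define IS_RESHAPE (1UL << 0)             /* hypothetical state flag        */

struct r10bio {
	unsigned long state;
};

static struct r10bio pool_slot;           /* a one-element toy "mempool"    */
static int pool_used;

static struct r10bio *pool_alloc(void)    /* hands back recycled memory as-is */
{
	pool_used = 1;
	return &pool_slot;
}

static void pool_free(struct r10bio *b)
{
	(void)b;
	pool_used = 0;                    /* note: state is NOT cleared here */
}

int main(void)
{
	struct r10bio *b = pool_alloc();
	b->state |= IS_RESHAPE;           /* used by a reshape ...           */
	pool_free(b);

	b = pool_alloc();                 /* ... then reused by a resync     */
	/* Without an explicit "b->state = 0;" the stale flag is still set: */
	printf("stale IS_RESHAPE after recycle: %s\n",
	       (b->state & IS_RESHAPE) ? "yes" : "no");

	b->state = 0;                     /* the initialisation this patch adds */
	printf("after clearing state:          %s\n",
	       (b->state & IS_RESHAPE) ? "yes" : "no");
	return 0;
}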