author    Linus Torvalds <torvalds@linux-foundation.org>  2021-04-29 00:39:37 +0300
committer Linus Torvalds <torvalds@linux-foundation.org>  2021-04-29 00:39:37 +0300
commit    fc0586062816559defb14c947319ef8c4c326fb3 (patch)
tree      5ca73bd1fc9de596a11e6d3549fd8fbf6f87dafc /drivers/md
parent    6c0029211382011af508273c4fc98a732f841d95 (diff)
parent    8324fbae75ce65fc2eb960a8434799dca48248ac (diff)
download  linux-fc0586062816559defb14c947319ef8c4c326fb3.tar.xz
Merge tag 'for-5.13/drivers-2021-04-27' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:

 - MD changes via Song:
     - raid5 POWER fix
     - raid1 failure fix
     - UAF fix for md cluster
     - mddev_find_or_alloc() clean up
     - Fix NULL pointer deref with external bitmap
     - Performance improvement for raid10 discard requests
     - Fix missing information of /proc/mdstat

 - rsxx const qualifier removal (Arnd)

 - Expose allocated brd pages (Calvin)

 - rnbd via Gioh Kim:
     - Change maintainer
     - Change domain address of maintainers' email
     - Add polling IO mode and document update
     - Fix memory leak and some bugs detected by static code analysis tools
     - Code refactoring

 - Series of floppy cleanups/fixes (Denis)

 - s390 dasd fixes (Julian)

 - kerneldoc fixes (Lee)

 - null_blk double free (Lv)

 - null_blk virtual boundary addition (Max)

 - Remove xsysace driver (Michal)

 - umem driver removal (Davidlohr)

 - ataflop fixes (Dan)

 - Revalidate disk removal (Christoph)

 - Bounce buffer cleanups (Christoph)

 - Mark lightnvm as deprecated (Christoph)

 - mtip32xx init cleanups (Shixin)

 - Various fixes (Tian, Gustavo, Coly, Yang, Zhang, Zhiqiang)

* tag 'for-5.13/drivers-2021-04-27' of git://git.kernel.dk/linux-block: (143 commits)
  async_xor: increase src_offs when dropping destination page
  drivers/block/null_blk/main: Fix a double free in null_init.
  md/raid1: properly indicate failure when ending a failed write request
  md-cluster: fix use-after-free issue when removing rdev
  nvme: introduce generic per-namespace chardev
  nvme: cleanup nvme_configure_apst
  nvme: do not try to reconfigure APST when the controller is not live
  nvme: add 'kato' sysfs attribute
  nvme: sanitize KATO setting
  nvmet: avoid queuing keep-alive timer if it is disabled
  brd: expose number of allocated pages in debugfs
  ataflop: fix off by one in ataflop_probe()
  ataflop: potential out of bounds in do_format()
  drbd: Fix fall-through warnings for Clang
  block/rnbd: Use strscpy instead of strlcpy
  block/rnbd-clt-sysfs: Remove copy buffer overlap in rnbd_clt_get_path_name
  block/rnbd-clt: Remove max_segment_size
  block/rnbd-clt: Generate kobject_uevent when the rnbd device state changes
  block/rnbd-srv: Remove unused arguments of rnbd_srv_rdma_ev
  Documentation/ABI/rnbd-clt: Add description for nr_poll_queues
  ...
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/alloc.c        5
-rw-r--r--  drivers/md/bcache/bcache.h      11
-rw-r--r--  drivers/md/bcache/btree.c        4
-rw-r--r--  drivers/md/bcache/debug.c        2
-rw-r--r--  drivers/md/bcache/extents.c      4
-rw-r--r--  drivers/md/bcache/features.c     2
-rw-r--r--  drivers/md/bcache/io.c           4
-rw-r--r--  drivers/md/bcache/journal.c      6
-rw-r--r--  drivers/md/bcache/super.c       25
-rw-r--r--  drivers/md/bcache/util.h         2
-rw-r--r--  drivers/md/bcache/writeback.c   11
-rw-r--r--  drivers/md/md-bitmap.c           2
-rw-r--r--  drivers/md/md.c                206
-rw-r--r--  drivers/md/md.h                  2
-rw-r--r--  drivers/md/raid0.c              14
-rw-r--r--  drivers/md/raid1.c               2
-rw-r--r--  drivers/md/raid10.c            434
-rw-r--r--  drivers/md/raid10.h              1
18 files changed, 541 insertions, 196 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 8c371d5eef8e..097577ae3c47 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -482,8 +482,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
unsigned int i;
for (i = 0; i < KEY_PTRS(k); i++)
- __bch_bucket_free(PTR_CACHE(c, k, i),
- PTR_BUCKET(c, k, i));
+ __bch_bucket_free(c->cache, PTR_BUCKET(c, k, i));
}
int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
@@ -674,7 +673,7 @@ bool bch_alloc_sectors(struct cache_set *c,
SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
atomic_long_add(sectors,
- &PTR_CACHE(c, &b->key, i)->sectors_written);
+ &c->cache->sectors_written);
}
if (b->sectors_free < c->cache->sb.block_size)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 848dd4db1659..0a4551e165ab 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -804,13 +804,6 @@ static inline sector_t bucket_remainder(struct cache_set *c, sector_t s)
return s & (c->cache->sb.bucket_size - 1);
}
-static inline struct cache *PTR_CACHE(struct cache_set *c,
- const struct bkey *k,
- unsigned int ptr)
-{
- return c->cache;
-}
-
static inline size_t PTR_BUCKET_NR(struct cache_set *c,
const struct bkey *k,
unsigned int ptr)
@@ -822,7 +815,7 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
const struct bkey *k,
unsigned int ptr)
{
- return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
+ return c->cache->buckets + PTR_BUCKET_NR(c, k, ptr);
}
static inline uint8_t gen_after(uint8_t a, uint8_t b)
@@ -841,7 +834,7 @@ static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
unsigned int i)
{
- return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
+ return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && c->cache;
}
/* Btree key macros */
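
The bcache hunks above and below all follow from one fact: a cache_set now holds exactly one struct cache, so PTR_CACHE(c, k, ptr) always returned c->cache and its (k, ptr) arguments were dead weight. A condensed view of the pattern being removed (code taken from the hunk above, nothing new):

    /* The removed helper: the (k, ptr) arguments were never used. */
    static inline struct cache *PTR_CACHE(struct cache_set *c,
                                          const struct bkey *k,
                                          unsigned int ptr)
    {
            return c->cache;
    }

    /* Every caller therefore collapses to a direct dereference, e.g.: */
    bio_set_dev(bio, c->cache->bdev);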
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index fe6dce125aba..183a58c89377 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -426,7 +426,7 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent)
do_btree_node_write(b);
atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size,
- &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
+ &b->c->cache->btree_sectors_written);
b->written += set_blocks(i, block_bytes(b->c->cache));
}
@@ -1161,7 +1161,7 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
for (i = 0; i < KEY_PTRS(k); i++)
SET_PTR_GEN(k, i,
- bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
+ bch_inc_gen(b->c->cache,
PTR_BUCKET(b->c, &b->key, i)));
mutex_unlock(&b->c->bucket_lock);
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 63e809f38e3f..116edda845c3 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -50,7 +50,7 @@ void bch_btree_verify(struct btree *b)
v->keys.ops = b->keys.ops;
bio = bch_bbio_alloc(b->c);
- bio_set_dev(bio, PTR_CACHE(b->c, &b->key, 0)->bdev);
+ bio_set_dev(bio, b->c->cache->bdev);
bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0);
bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9;
bio->bi_opf = REQ_OP_READ | REQ_META;
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index f4658a1f37b8..d626ffcbecb9 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -50,7 +50,7 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
for (i = 0; i < KEY_PTRS(k); i++)
if (ptr_available(c, k, i)) {
- struct cache *ca = PTR_CACHE(c, k, i);
+ struct cache *ca = c->cache;
size_t bucket = PTR_BUCKET_NR(c, k, i);
size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
@@ -71,7 +71,7 @@ static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
for (i = 0; i < KEY_PTRS(k); i++)
if (ptr_available(c, k, i)) {
- struct cache *ca = PTR_CACHE(c, k, i);
+ struct cache *ca = c->cache;
size_t bucket = PTR_BUCKET_NR(c, k, i);
size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c
index d636b7b2d070..6d2b7b84a7b7 100644
--- a/drivers/md/bcache/features.c
+++ b/drivers/md/bcache/features.c
@@ -19,7 +19,7 @@ struct feature {
static struct feature feature_list[] = {
{BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE,
"large_bucket"},
- {0, 0, 0 },
+ {0, 0, NULL },
};
#define compose_feature_string(type) \
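
The features.c change swaps a plain 0 for NULL in the terminating entry: feature_list is a sentinel-terminated table whose last member is a pointer, and sparse flags a plain integer used as a NULL pointer. A minimal userspace sketch of the pattern (field names are illustrative, the bcache struct differs slightly):

    #include <stdio.h>

    struct feature {
            int type;
            unsigned int mask;
            const char *string;     /* NULL terminates the table */
    };

    static const struct feature feature_list[] = {
            {1, 0x0001, "large_bucket"},
            {0, 0, NULL},           /* sentinel: NULL, not plain 0 */
    };

    int main(void)
    {
            for (const struct feature *f = feature_list; f->string; f++)
                    printf("%s\n", f->string);
            return 0;
    }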
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index dad71a6b7889..e4388fe3ab7e 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -36,7 +36,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
struct bbio *b = container_of(bio, struct bbio, bio);
bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0);
- bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev);
+ bio_set_dev(bio, c->cache->bdev);
b->submit_time_us = local_clock_us();
closure_bio_submit(c, bio, bio->bi_private);
@@ -137,7 +137,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
blk_status_t error, const char *m)
{
struct bbio *b = container_of(bio, struct bbio, bio);
- struct cache *ca = PTR_CACHE(c, &b->key, 0);
+ struct cache *ca = c->cache;
int is_read = (bio_data_dir(bio) == READ ? 1 : 0);
unsigned int threshold = op_is_write(bio_op(bio))
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index c6613e817333..61bd79babf7a 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -111,7 +111,7 @@ reread: left = ca->sb.bucket_size - offset;
* Check from the oldest jset for last_seq. If
* i->j.seq < j->last_seq, it means the oldest jset
* in list is expired and useless, remove it from
- * this list. Otherwise, j is a condidate jset for
+ * this list. Otherwise, j is a candidate jset for
* further following checks.
*/
while (!list_empty(list)) {
@@ -498,7 +498,7 @@ static void btree_flush_write(struct cache_set *c)
* - If there are matched nodes recorded in btree_nodes[],
* they are clean now (this is why and how the oldest
* journal entry can be reclaimed). These selected nodes
- * will be ignored and skipped in the folowing for-loop.
+ * will be ignored and skipped in the following for-loop.
*/
if (((btree_current_write(b)->journal - fifo_front_p) &
mask) != 0) {
@@ -768,7 +768,7 @@ static void journal_write_unlocked(struct closure *cl)
w->data->csum = csum_set(w->data);
for (i = 0; i < KEY_PTRS(k); i++) {
- ca = PTR_CACHE(c, k, i);
+ ca = c->cache;
bio = &ca->journal.bio;
atomic_long_add(sectors, &ca->meta_sectors_written);
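
The corrected comment in the first journal.c hunk describes the expiry rule applied while reading the journal: any jset whose seq is below the newest jset's last_seq can no longer matter for replay and is dropped from the head of the oldest-first list. A sketch of that loop, shaped like the bcache code the comment annotates:

    /* 'list' is ordered oldest-first; 'j' is the jset just read. */
    while (!list_empty(list)) {
            i = list_first_entry(list, struct journal_replay, list);
            if (i->j.seq >= j->last_seq)
                    break;                  /* still a replay candidate */
            list_del(&i->list);             /* expired: predates last_seq */
            kfree(i);
    }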
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 03e1fe4de53d..2b6d6e9cd680 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1052,6 +1052,7 @@ static int cached_dev_status_update(void *arg)
int bch_cached_dev_run(struct cached_dev *dc)
{
+ int ret = 0;
struct bcache_device *d = &dc->disk;
char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
char *env[] = {
@@ -1064,19 +1065,15 @@ int bch_cached_dev_run(struct cached_dev *dc)
if (dc->io_disable) {
pr_err("I/O disabled on cached dev %s\n",
dc->backing_dev_name);
- kfree(env[1]);
- kfree(env[2]);
- kfree(buf);
- return -EIO;
+ ret = -EIO;
+ goto out;
}
if (atomic_xchg(&dc->running, 1)) {
- kfree(env[1]);
- kfree(env[2]);
- kfree(buf);
pr_info("cached dev %s is running already\n",
dc->backing_dev_name);
- return -EBUSY;
+ ret = -EBUSY;
+ goto out;
}
if (!d->c &&
@@ -1097,15 +1094,13 @@ int bch_cached_dev_run(struct cached_dev *dc)
* only class / kset properties are persistent
*/
kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
- kfree(env[1]);
- kfree(env[2]);
- kfree(buf);
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
sysfs_create_link(&disk_to_dev(d->disk)->kobj,
&d->kobj, "bcache")) {
pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
dc->status_update_thread = kthread_run(cached_dev_status_update,
@@ -1114,7 +1109,11 @@ int bch_cached_dev_run(struct cached_dev *dc)
pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n");
}
- return 0;
+out:
+ kfree(env[1]);
+ kfree(env[2]);
+ kfree(buf);
+ return ret;
}
/*
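
The bch_cached_dev_run() rework above is the standard kernel single-exit idiom: every early return used to duplicate the same three kfree() calls, so one missed call meant a leak. Funneling all paths through one label frees everything exactly once. A self-contained userspace sketch of the shape (names and error values mirror the hunk; the helpers are stand-ins):

    #include <errno.h>
    #include <stdlib.h>
    #include <string.h>

    static int run_device(int io_disable, int already_running)
    {
            int ret = 0;
            char *buf = strdup("label");    /* stand-in for kmemdup_nul() */

            if (!buf)
                    return -ENOMEM;         /* nothing to clean up yet */

            if (io_disable) {
                    ret = -EIO;
                    goto out;
            }
            if (already_running) {
                    ret = -EBUSY;
                    goto out;
            }
            /* ... normal startup work would go here ... */
    out:
            free(buf);              /* one cleanup path for every exit */
            return ret;
    }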
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index c029f7443190..bca4a7c97da7 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -27,7 +27,7 @@ struct closure;
#else /* DEBUG */
-#define EBUG_ON(cond) do { if (cond); } while (0)
+#define EBUG_ON(cond) do { if (cond) do {} while (0); } while (0)
#define atomic_dec_bug(v) atomic_dec(v)
#define atomic_inc_bug(v, i) atomic_inc(v)
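
The util.h change looks odd but is deliberate: in the old expansion the if body is a lone ';', an empty statement that newer compilers flag with -Wempty-body, while the new nested do {} while (0) gives the condition a real no-op body. The condition must still be evaluated, since callers may pass expressions with side effects. Side by side, with hypothetical macro names:

    /* Old: empty statement as the if body; warns under -Wempty-body. */
    #define EBUG_ON_OLD(cond)       do { if (cond); } while (0)

    /* New: explicit no-op body, same semantics, no warning. */
    #define EBUG_ON_NEW(cond)       do { if (cond) do {} while (0); } while (0)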
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 82d4e0880a99..8120da278161 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -110,13 +110,13 @@ static void __update_writeback_rate(struct cached_dev *dc)
int64_t fps;
if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) {
- fp_term = dc->writeback_rate_fp_term_low *
+ fp_term = (int64_t)dc->writeback_rate_fp_term_low *
(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW);
} else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) {
- fp_term = dc->writeback_rate_fp_term_mid *
+ fp_term = (int64_t)dc->writeback_rate_fp_term_mid *
(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID);
} else {
- fp_term = dc->writeback_rate_fp_term_high *
+ fp_term = (int64_t)dc->writeback_rate_fp_term_high *
(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH);
}
fps = div_s64(dirty, dirty_buckets) * fp_term;
@@ -416,7 +416,7 @@ static void read_dirty_endio(struct bio *bio)
struct dirty_io *io = w->private;
/* is_read = 1 */
- bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
+ bch_count_io_errors(io->dc->disk.c->cache,
bio->bi_status, 1,
"reading dirty data from cache");
@@ -510,8 +510,7 @@ static void read_dirty(struct cached_dev *dc)
dirty_init(w);
bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
- bio_set_dev(&io->bio,
- PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
+ bio_set_dev(&io->bio, dc->disk.c->cache->bdev);
io->bio.bi_end_io = read_dirty_endio;
if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
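
The fp_term casts in the first writeback.c hunk fix a classic C promotion bug: both multiplicands are 32-bit, so the product is computed in 32 bits and can wrap before it is widened for the 64-bit assignment (strictly, signed overflow is undefined behaviour, which is all the more reason for the cast). Casting one operand to int64_t forces the whole multiply into 64 bits. A small demonstration with illustrative values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            int32_t term = 100000;   /* e.g. a large writeback_rate_fp_term_* */
            int32_t delta = 50000;   /* e.g. in_use minus a threshold */

            int64_t wrong = term * delta;           /* 32-bit multiply: wraps */
            int64_t right = (int64_t)term * delta;  /* 64-bit multiply */

            printf("%lld vs %lld\n", (long long)wrong, (long long)right);
            return 0;
    }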
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 200c5d0f08bf..ea3130e11680 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1722,6 +1722,8 @@ void md_bitmap_flush(struct mddev *mddev)
md_bitmap_daemon_work(mddev);
bitmap->daemon_lastrun -= sleep;
md_bitmap_daemon_work(mddev);
+ if (mddev->bitmap_info.external)
+ md_super_wait(mddev);
md_bitmap_update_sb(bitmap);
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 21da0c48f6c2..49f897fbb89b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -734,78 +734,94 @@ void mddev_init(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(mddev_init);
+static struct mddev *mddev_find_locked(dev_t unit)
+{
+ struct mddev *mddev;
+
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs)
+ if (mddev->unit == unit)
+ return mddev;
+
+ return NULL;
+}
+
+/* find an unused unit number */
+static dev_t mddev_alloc_unit(void)
+{
+ static int next_minor = 512;
+ int start = next_minor;
+ bool is_free = false;
+ dev_t dev = 0;
+
+ while (!is_free) {
+ dev = MKDEV(MD_MAJOR, next_minor);
+ next_minor++;
+ if (next_minor > MINORMASK)
+ next_minor = 0;
+ if (next_minor == start)
+ return 0; /* Oh dear, all in use. */
+ is_free = !mddev_find_locked(dev);
+ }
+
+ return dev;
+}
+
static struct mddev *mddev_find(dev_t unit)
{
- struct mddev *mddev, *new = NULL;
+ struct mddev *mddev;
- if (unit && MAJOR(unit) != MD_MAJOR)
- unit &= ~((1<<MdpMinorShift)-1);
+ if (MAJOR(unit) != MD_MAJOR)
+ unit &= ~((1 << MdpMinorShift) - 1);
- retry:
spin_lock(&all_mddevs_lock);
+ mddev = mddev_find_locked(unit);
+ if (mddev)
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
- if (unit) {
- list_for_each_entry(mddev, &all_mddevs, all_mddevs)
- if (mddev->unit == unit) {
- mddev_get(mddev);
- spin_unlock(&all_mddevs_lock);
- kfree(new);
- return mddev;
- }
+ return mddev;
+}
- if (new) {
- list_add(&new->all_mddevs, &all_mddevs);
- spin_unlock(&all_mddevs_lock);
- new->hold_active = UNTIL_IOCTL;
- return new;
- }
- } else if (new) {
- /* find an unused unit number */
- static int next_minor = 512;
- int start = next_minor;
- int is_free = 0;
- int dev = 0;
- while (!is_free) {
- dev = MKDEV(MD_MAJOR, next_minor);
- next_minor++;
- if (next_minor > MINORMASK)
- next_minor = 0;
- if (next_minor == start) {
- /* Oh dear, all in use. */
- spin_unlock(&all_mddevs_lock);
- kfree(new);
- return NULL;
- }
+static struct mddev *mddev_alloc(dev_t unit)
+{
+ struct mddev *new;
+ int error;
- is_free = 1;
- list_for_each_entry(mddev, &all_mddevs, all_mddevs)
- if (mddev->unit == dev) {
- is_free = 0;
- break;
- }
- }
- new->unit = dev;
- new->md_minor = MINOR(dev);
- new->hold_active = UNTIL_STOP;
- list_add(&new->all_mddevs, &all_mddevs);
- spin_unlock(&all_mddevs_lock);
- return new;
- }
- spin_unlock(&all_mddevs_lock);
+ if (unit && MAJOR(unit) != MD_MAJOR)
+ unit &= ~((1 << MdpMinorShift) - 1);
new = kzalloc(sizeof(*new), GFP_KERNEL);
if (!new)
- return NULL;
-
- new->unit = unit;
- if (MAJOR(unit) == MD_MAJOR)
- new->md_minor = MINOR(unit);
- else
- new->md_minor = MINOR(unit) >> MdpMinorShift;
-
+ return ERR_PTR(-ENOMEM);
mddev_init(new);
- goto retry;
+ spin_lock(&all_mddevs_lock);
+ if (unit) {
+ error = -EEXIST;
+ if (mddev_find_locked(unit))
+ goto out_free_new;
+ new->unit = unit;
+ if (MAJOR(unit) == MD_MAJOR)
+ new->md_minor = MINOR(unit);
+ else
+ new->md_minor = MINOR(unit) >> MdpMinorShift;
+ new->hold_active = UNTIL_IOCTL;
+ } else {
+ error = -ENODEV;
+ new->unit = mddev_alloc_unit();
+ if (!new->unit)
+ goto out_free_new;
+ new->md_minor = MINOR(new->unit);
+ new->hold_active = UNTIL_STOP;
+ }
+
+ list_add(&new->all_mddevs, &all_mddevs);
+ spin_unlock(&all_mddevs_lock);
+ return new;
+out_free_new:
+ spin_unlock(&all_mddevs_lock);
+ kfree(new);
+ return ERR_PTR(error);
}
static struct attribute_group md_redundancy_group;
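
mddev_alloc above removes the old retry/goto dance by doing the duplicate check and the list insertion inside one all_mddevs_lock critical section, with the allocation itself kept outside the lock; that closes the lookup/insert race window and turns a lost race into a clean -EEXIST. A self-contained model of the same shape using a pthread mutex (all names are illustrative):

    #include <errno.h>
    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    struct node { int unit; struct node *next; };
    static struct node *head;

    static struct node *find_locked(int unit)
    {
            for (struct node *n = head; n; n = n->next)
                    if (n->unit == unit)
                            return n;
            return NULL;
    }

    static int alloc_unit(int unit)
    {
            struct node *new = malloc(sizeof(*new));    /* outside the lock */

            if (!new)
                    return -ENOMEM;
            new->unit = unit;

            pthread_mutex_lock(&lock);
            if (find_locked(unit)) {        /* lost the race: unit taken */
                    pthread_mutex_unlock(&lock);
                    free(new);
                    return -EEXIST;
            }
            new->next = head;               /* publish while still locked */
            head = new;
            pthread_mutex_unlock(&lock);
            return 0;
    }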
@@ -5644,29 +5660,29 @@ static int md_alloc(dev_t dev, char *name)
* writing to /sys/module/md_mod/parameters/new_array.
*/
static DEFINE_MUTEX(disks_mutex);
- struct mddev *mddev = mddev_find(dev);
+ struct mddev *mddev;
struct gendisk *disk;
int partitioned;
int shift;
int unit;
int error;
- if (!mddev)
- return -ENODEV;
-
- partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
- shift = partitioned ? MdpMinorShift : 0;
- unit = MINOR(mddev->unit) >> shift;
-
- /* wait for any previous instance of this device to be
- * completely removed (mddev_delayed_delete).
+ /*
+ * Wait for any previous instance of this device to be completely
+ * removed (mddev_delayed_delete).
*/
flush_workqueue(md_misc_wq);
mutex_lock(&disks_mutex);
- error = -EEXIST;
- if (mddev->gendisk)
- goto abort;
+ mddev = mddev_alloc(dev);
+ if (IS_ERR(mddev)) {
+ mutex_unlock(&disks_mutex);
+ return PTR_ERR(mddev);
+ }
+
+ partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
+ shift = partitioned ? MdpMinorShift : 0;
+ unit = MINOR(mddev->unit) >> shift;
if (name && !dev) {
/* Need to ensure that 'name' is not a duplicate.
@@ -5678,6 +5694,7 @@ static int md_alloc(dev_t dev, char *name)
if (mddev2->gendisk &&
strcmp(mddev2->gendisk->disk_name, name) == 0) {
spin_unlock(&all_mddevs_lock);
+ error = -EEXIST;
goto abort;
}
spin_unlock(&all_mddevs_lock);
@@ -6524,11 +6541,9 @@ static void autorun_devices(int part)
md_probe(dev);
mddev = mddev_find(dev);
- if (!mddev || !mddev->gendisk) {
- if (mddev)
- mddev_put(mddev);
+ if (!mddev)
break;
- }
+
if (mddev_lock(mddev))
pr_warn("md: %s locked, cannot run\n", mdname(mddev));
else if (mddev->raid_disks || mddev->major_version
@@ -7821,8 +7836,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
/* Wait until bdev->bd_disk is definitely gone */
if (work_pending(&mddev->del_work))
flush_workqueue(md_misc_wq);
- /* Then retry the open from the top */
- return -ERESTARTSYS;
+ return -EBUSY;
}
BUG_ON(mddev != bdev->bd_disk->private_data);
@@ -8153,7 +8167,11 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos)
loff_t l = *pos;
struct mddev *mddev;
- if (l >= 0x10000)
+ if (l == 0x10000) {
+ ++*pos;
+ return (void *)2;
+ }
+ if (l > 0x10000)
return NULL;
if (!l--)
/* header */
@@ -8575,6 +8593,26 @@ void md_write_end(struct mddev *mddev)
EXPORT_SYMBOL(md_write_end);
+/* This is used by raid0 and raid10 */
+void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
+ struct bio *bio, sector_t start, sector_t size)
+{
+ struct bio *discard_bio = NULL;
+
+ if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 0,
+ &discard_bio) || !discard_bio)
+ return;
+
+ bio_chain(discard_bio, bio);
+ bio_clone_blkg_association(discard_bio, bio);
+ if (mddev->gendisk)
+ trace_block_bio_remap(discard_bio,
+ disk_devt(mddev->gendisk),
+ bio->bi_iter.bi_sector);
+ submit_bio_noacct(discard_bio);
+}
+EXPORT_SYMBOL_GPL(md_submit_discard_bio);
+
/* md_allow_write(mddev)
* Calling this ensures that the array is marked 'active' so that writes
* may proceed without blocking. It is important to call this before
@@ -9251,11 +9289,11 @@ void md_check_recovery(struct mddev *mddev)
}
if (mddev_is_clustered(mddev)) {
- struct md_rdev *rdev;
+ struct md_rdev *rdev, *tmp;
/* kick the device if another node issued a
* remove disk.
*/
- rdev_for_each(rdev, mddev) {
+ rdev_for_each_safe(rdev, tmp, mddev) {
if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
rdev->raid_disk < 0)
md_kick_rdev_from_array(rdev);
@@ -9569,7 +9607,7 @@ err_wq:
static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
- struct md_rdev *rdev2;
+ struct md_rdev *rdev2, *tmp;
int role, ret;
char b[BDEVNAME_SIZE];
@@ -9586,7 +9624,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
}
/* Check for change of roles in the active devices */
- rdev_for_each(rdev2, mddev) {
+ rdev_for_each_safe(rdev2, tmp, mddev) {
if (test_bit(Faulty, &rdev2->flags))
continue;
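
Both rdev_for_each_safe conversions above exist because the loop body can call md_kick_rdev_from_array(), which unlinks and may free the current rdev; the _safe iterator caches the successor before the body runs. A minimal userspace model of iterate-while-deleting on a singly linked list (the kernel macro does the same next-pointer caching on its doubly linked list):

    #include <stdlib.h>

    struct node { int flags; struct node *next; };

    static void kick_marked(struct node **pp)
    {
            struct node *n = *pp, *tmp;

            while (n) {
                    tmp = n->next;          /* cache the successor first */
                    if (n->flags & 1) {     /* e.g. ClusterRemove was set */
                            *pp = tmp;      /* unlink ... */
                            free(n);        /* ... then freeing is safe */
                    } else {
                            pp = &n->next;
                    }
                    n = tmp;                /* never touches freed memory */
            }
    }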
diff --git a/drivers/md/md.h b/drivers/md/md.h
index bcbba1b5ec4a..fb7eab58cfd5 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -713,6 +713,8 @@ extern void md_write_end(struct mddev *mddev);
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
extern void md_finish_reshape(struct mddev *mddev);
+void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
+ struct bio *bio, sector_t start, sector_t size);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 67f157f2525d..e5d7411cba9b 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -477,7 +477,6 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
for (disk = 0; disk < zone->nb_dev; disk++) {
sector_t dev_start, dev_end;
- struct bio *discard_bio = NULL;
struct md_rdev *rdev;
if (disk < start_disk_index)
@@ -500,18 +499,9 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
rdev = conf->devlist[(zone - conf->strip_zone) *
conf->strip_zone[0].nb_dev + disk];
- if (__blkdev_issue_discard(rdev->bdev,
+ md_submit_discard_bio(mddev, rdev, bio,
dev_start + zone->dev_start + rdev->data_offset,
- dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
- !discard_bio)
- continue;
- bio_chain(discard_bio, bio);
- bio_clone_blkg_association(discard_bio, bio);
- if (mddev->gendisk)
- trace_block_bio_remap(discard_bio,
- disk_devt(mddev->gendisk),
- bio->bi_iter.bi_sector);
- submit_bio_noacct(discard_bio);
+ dev_end - dev_start);
}
bio_endio(bio);
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d2378765dc15..ced076ba560e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -478,6 +478,8 @@ static void raid1_end_write_request(struct bio *bio)
if (!test_bit(Faulty, &rdev->flags))
set_bit(R1BIO_WriteError, &r1_bio->state);
else {
+ /* Fail the request */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
/* Finished with this branch */
r1_bio->bios[mirror] = NULL;
to_put = bio;
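
The raid1.c hunk addresses writes that terminate on an already-Faulty device: previously the branch was simply dropped, so a request whose only completions came from failed devices could still be reported as a success. Setting R1BIO_Degraded makes the failure visible when the request is ended. A model of the invariant being restored (not raid1's actual bookkeeping):

    #include <errno.h>

    struct mirrored_write {
            int remaining;  /* replica writes still in flight */
            int ok;         /* replicas that completed without error */
    };

    /* Call once per completed replica; returns the request's final
     * status when the last one ends, or 1 while writes remain. */
    static int end_one_replica(struct mirrored_write *w, int error)
    {
            if (!error)
                    w->ok++;
            if (--w->remaining)
                    return 1;
            return w->ok ? 0 : -EIO;   /* no replica made it: fail */
    }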
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a9ae7d113492..13f5e6b2a73d 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -91,7 +91,7 @@ static inline struct r10bio *get_resync_r10bio(struct bio *bio)
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
struct r10conf *conf = data;
- int size = offsetof(struct r10bio, devs[conf->copies]);
+ int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);
/* allocate a r10bio with room for raid_disks entries in the
* bios array */
@@ -238,7 +238,7 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
{
int i;
- for (i = 0; i < conf->copies; i++) {
+ for (i = 0; i < conf->geo.raid_disks; i++) {
struct bio **bio = & r10_bio->devs[i].bio;
if (!BIO_SPECIAL(*bio))
bio_put(*bio);
@@ -327,7 +327,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
int slot;
int repl = 0;
- for (slot = 0; slot < conf->copies; slot++) {
+ for (slot = 0; slot < conf->geo.raid_disks; slot++) {
if (r10_bio->devs[slot].bio == bio)
break;
if (r10_bio->devs[slot].repl_bio == bio) {
@@ -336,7 +336,6 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
}
}
- BUG_ON(slot == conf->copies);
update_head_pos(slot, r10_bio);
if (slotp)
@@ -1274,12 +1273,77 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
}
}
+static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
+{
+ int i;
+ struct r10conf *conf = mddev->private;
+ struct md_rdev *blocked_rdev;
+
+retry_wait:
+ blocked_rdev = NULL;
+ rcu_read_lock();
+ for (i = 0; i < conf->copies; i++) {
+ struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rrdev = rcu_dereference(
+ conf->mirrors[i].replacement);
+ if (rdev == rrdev)
+ rrdev = NULL;
+ if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+ atomic_inc(&rdev->nr_pending);
+ blocked_rdev = rdev;
+ break;
+ }
+ if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
+ atomic_inc(&rrdev->nr_pending);
+ blocked_rdev = rrdev;
+ break;
+ }
+
+ if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
+ sector_t first_bad;
+ sector_t dev_sector = r10_bio->devs[i].addr;
+ int bad_sectors;
+ int is_bad;
+
+ /*
+ * A discard request doesn't care about the write result,
+ * so it doesn't need to wait for a blocked disk here.
+ */
+ if (!r10_bio->sectors)
+ continue;
+
+ is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
+ &first_bad, &bad_sectors);
+ if (is_bad < 0) {
+ /*
+ * Mustn't write here until the bad block
+ * is acknowledged
+ */
+ atomic_inc(&rdev->nr_pending);
+ set_bit(BlockedBadBlocks, &rdev->flags);
+ blocked_rdev = rdev;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ if (unlikely(blocked_rdev)) {
+ /* Have to wait for this device to get unblocked, then retry */
+ allow_barrier(conf);
+ raid10_log(conf->mddev, "%s wait rdev %d blocked",
+ __func__, blocked_rdev->raid_disk);
+ md_wait_for_blocked_rdev(blocked_rdev, mddev);
+ wait_barrier(conf);
+ goto retry_wait;
+ }
+}
+
static void raid10_write_request(struct mddev *mddev, struct bio *bio,
struct r10bio *r10_bio)
{
struct r10conf *conf = mddev->private;
int i;
- struct md_rdev *blocked_rdev;
sector_t sectors;
int max_sectors;
@@ -1337,8 +1401,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
raid10_find_phys(conf, r10_bio);
-retry_write:
- blocked_rdev = NULL;
+
+ wait_blocked_dev(mddev, r10_bio);
+
rcu_read_lock();
max_sectors = r10_bio->sectors;
@@ -1349,16 +1414,6 @@ retry_write:
conf->mirrors[d].replacement);
if (rdev == rrdev)
rrdev = NULL;
- if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
- atomic_inc(&rdev->nr_pending);
- blocked_rdev = rdev;
- break;
- }
- if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
- atomic_inc(&rrdev->nr_pending);
- blocked_rdev = rrdev;
- break;
- }
if (rdev && (test_bit(Faulty, &rdev->flags)))
rdev = NULL;
if (rrdev && (test_bit(Faulty, &rrdev->flags)))
@@ -1379,15 +1434,6 @@ retry_write:
is_bad = is_badblock(rdev, dev_sector, max_sectors,
&first_bad, &bad_sectors);
- if (is_bad < 0) {
- /* Mustn't write here until the bad block
- * is acknowledged
- */
- atomic_inc(&rdev->nr_pending);
- set_bit(BlockedBadBlocks, &rdev->flags);
- blocked_rdev = rdev;
- break;
- }
if (is_bad && first_bad <= dev_sector) {
/* Cannot write here at all */
bad_sectors -= (dev_sector - first_bad);
@@ -1423,35 +1469,6 @@ retry_write:
}
rcu_read_unlock();
- if (unlikely(blocked_rdev)) {
- /* Have to wait for this device to get unblocked, then retry */
- int j;
- int d;
-
- for (j = 0; j < i; j++) {
- if (r10_bio->devs[j].bio) {
- d = r10_bio->devs[j].devnum;
- rdev_dec_pending(conf->mirrors[d].rdev, mddev);
- }
- if (r10_bio->devs[j].repl_bio) {
- struct md_rdev *rdev;
- d = r10_bio->devs[j].devnum;
- rdev = conf->mirrors[d].replacement;
- if (!rdev) {
- /* Race with remove_disk */
- smp_mb();
- rdev = conf->mirrors[d].rdev;
- }
- rdev_dec_pending(rdev, mddev);
- }
- }
- allow_barrier(conf);
- raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
- md_wait_for_blocked_rdev(blocked_rdev, mddev);
- wait_barrier(conf);
- goto retry_write;
- }
-
if (max_sectors < r10_bio->sectors)
r10_bio->sectors = max_sectors;
@@ -1492,7 +1509,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
r10_bio->read_slot = -1;
- memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
+ memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
+ conf->geo.raid_disks);
if (bio_data_dir(bio) == READ)
raid10_read_request(mddev, bio, r10_bio);
@@ -1500,6 +1518,304 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
raid10_write_request(mddev, bio, r10_bio);
}
+static void raid_end_discard_bio(struct r10bio *r10bio)
+{
+ struct r10conf *conf = r10bio->mddev->private;
+ struct r10bio *first_r10bio;
+
+ while (atomic_dec_and_test(&r10bio->remaining)) {
+
+ allow_barrier(conf);
+
+ if (!test_bit(R10BIO_Discard, &r10bio->state)) {
+ first_r10bio = (struct r10bio *)r10bio->master_bio;
+ free_r10bio(r10bio);
+ r10bio = first_r10bio;
+ } else {
+ md_write_end(r10bio->mddev);
+ bio_endio(r10bio->master_bio);
+ free_r10bio(r10bio);
+ break;
+ }
+ }
+}
+
+static void raid10_end_discard_request(struct bio *bio)
+{
+ struct r10bio *r10_bio = bio->bi_private;
+ struct r10conf *conf = r10_bio->mddev->private;
+ struct md_rdev *rdev = NULL;
+ int dev;
+ int slot, repl;
+
+ /*
+ * We don't care about the return value of a discard bio
+ */
+ if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+ set_bit(R10BIO_Uptodate, &r10_bio->state);
+
+ dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+ if (repl)
+ rdev = conf->mirrors[dev].replacement;
+ if (!rdev) {
+ /*
+ * raid10_remove_disk uses smp_mb to make sure rdev is set to
+ * replacement before setting replacement to NULL. It is safe to
+ * read rdev first without barrier protection, even when replacement
+ * is NULL.
+ */
+ smp_rmb();
+ rdev = conf->mirrors[dev].rdev;
+ }
+
+ raid_end_discard_bio(r10_bio);
+ rdev_dec_pending(rdev, conf->mddev);
+}
+
+/*
+ * There are some limitations on handling a discard bio:
+ * 1st, the discard size must be bigger than stripe_size*2.
+ * 2nd, if the discard bio spans reshape progress, we use the old way to
+ * handle the discard bio.
+ */
+static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+ struct r10conf *conf = mddev->private;
+ struct geom *geo = &conf->geo;
+ int far_copies = geo->far_copies;
+ bool first_copy = true;
+ struct r10bio *r10_bio, *first_r10bio;
+ struct bio *split;
+ int disk;
+ sector_t chunk;
+ unsigned int stripe_size;
+ unsigned int stripe_data_disks;
+ sector_t split_size;
+ sector_t bio_start, bio_end;
+ sector_t first_stripe_index, last_stripe_index;
+ sector_t start_disk_offset;
+ unsigned int start_disk_index;
+ sector_t end_disk_offset;
+ unsigned int end_disk_index;
+ unsigned int remainder;
+
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ return -EAGAIN;
+
+ wait_barrier(conf);
+
+ /*
+ * Check reshape again to avoid reshape happens after checking
+ * MD_RECOVERY_RESHAPE and before wait_barrier
+ */
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ goto out;
+
+ if (geo->near_copies)
+ stripe_data_disks = geo->raid_disks / geo->near_copies +
+ geo->raid_disks % geo->near_copies;
+ else
+ stripe_data_disks = geo->raid_disks;
+
+ stripe_size = stripe_data_disks << geo->chunk_shift;
+
+ bio_start = bio->bi_iter.bi_sector;
+ bio_end = bio_end_sector(bio);
+
+ /*
+ * A discard bio may be smaller than a stripe, or it may cross a
+ * stripe boundary while its discard region is larger than one stripe.
+ * For the far offset layout, if the discard region is not aligned
+ * with the stripe size, there is a hole when we submit the discard
+ * bio to a member disk. For simplicity, we only handle discard bios
+ * whose discard region is bigger than stripe_size * 2.
+ */
+ if (bio_sectors(bio) < stripe_size*2)
+ goto out;
+
+ /*
+ * Keep bio aligned with strip size.
+ */
+ div_u64_rem(bio_start, stripe_size, &remainder);
+ if (remainder) {
+ split_size = stripe_size - remainder;
+ split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
+ bio_chain(split, bio);
+ allow_barrier(conf);
+ /* Resend the first split part */
+ submit_bio_noacct(split);
+ wait_barrier(conf);
+ }
+ div_u64_rem(bio_end, stripe_size, &remainder);
+ if (remainder) {
+ split_size = bio_sectors(bio) - remainder;
+ split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
+ bio_chain(split, bio);
+ allow_barrier(conf);
+ /* Resend the second split part */
+ submit_bio_noacct(bio);
+ bio = split;
+ wait_barrier(conf);
+ }
+
+ bio_start = bio->bi_iter.bi_sector;
+ bio_end = bio_end_sector(bio);
+
+ /*
+ * Raid10 uses chunks as the unit to store data, similar to raid0. One
+ * stripe contains the chunks from all member disks (one chunk from one
+ * disk at the same HBA address). For layout details, see 'man 4 md'.
+ */
+ chunk = bio_start >> geo->chunk_shift;
+ chunk *= geo->near_copies;
+ first_stripe_index = chunk;
+ start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
+ if (geo->far_offset)
+ first_stripe_index *= geo->far_copies;
+ start_disk_offset = (bio_start & geo->chunk_mask) +
+ (first_stripe_index << geo->chunk_shift);
+
+ chunk = bio_end >> geo->chunk_shift;
+ chunk *= geo->near_copies;
+ last_stripe_index = chunk;
+ end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
+ if (geo->far_offset)
+ last_stripe_index *= geo->far_copies;
+ end_disk_offset = (bio_end & geo->chunk_mask) +
+ (last_stripe_index << geo->chunk_shift);
+
+retry_discard:
+ r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
+ r10_bio->mddev = mddev;
+ r10_bio->state = 0;
+ r10_bio->sectors = 0;
+ memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
+ wait_blocked_dev(mddev, r10_bio);
+
+ /*
+ * For the far layout, more than one r10bio is needed to cover all
+ * regions. Inspired by raid10_sync_request, we can use the first
+ * r10bio->master_bio to record the discard bio. The other r10bios'
+ * master_bio records the first r10bio. The first r10bio is released
+ * only after all other r10bios finish, and the discard bio returns
+ * only when the first r10bio finishes.
+ */
+ if (first_copy) {
+ r10_bio->master_bio = bio;
+ set_bit(R10BIO_Discard, &r10_bio->state);
+ first_copy = false;
+ first_r10bio = r10_bio;
+ } else
+ r10_bio->master_bio = (struct bio *)first_r10bio;
+
+ rcu_read_lock();
+ for (disk = 0; disk < geo->raid_disks; disk++) {
+ struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ struct md_rdev *rrdev = rcu_dereference(
+ conf->mirrors[disk].replacement);
+
+ r10_bio->devs[disk].bio = NULL;
+ r10_bio->devs[disk].repl_bio = NULL;
+
+ if (rdev && (test_bit(Faulty, &rdev->flags)))
+ rdev = NULL;
+ if (rrdev && (test_bit(Faulty, &rrdev->flags)))
+ rrdev = NULL;
+ if (!rdev && !rrdev)
+ continue;
+
+ if (rdev) {
+ r10_bio->devs[disk].bio = bio;
+ atomic_inc(&rdev->nr_pending);
+ }
+ if (rrdev) {
+ r10_bio->devs[disk].repl_bio = bio;
+ atomic_inc(&rrdev->nr_pending);
+ }
+ }
+ rcu_read_unlock();
+
+ atomic_set(&r10_bio->remaining, 1);
+ for (disk = 0; disk < geo->raid_disks; disk++) {
+ sector_t dev_start, dev_end;
+ struct bio *mbio, *rbio = NULL;
+ struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ struct md_rdev *rrdev = rcu_dereference(
+ conf->mirrors[disk].replacement);
+
+ /*
+ * Now start to calculate the start and end address for each disk.
+ * The space between dev_start and dev_end is the discard region.
+ *
+ * For dev_start, there are three cases to consider:
+ * 1st, the disk is before start_disk: you can imagine the disk is in
+ * the next stripe, so dev_start is the start address of the next
+ * stripe.
+ * 2nd, the disk is after start_disk: the disk is in the same stripe
+ * as the first disk.
+ * 3rd, the first disk itself: we can use start_disk_offset directly.
+ */
+ if (disk < start_disk_index)
+ dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
+ else if (disk > start_disk_index)
+ dev_start = first_stripe_index * mddev->chunk_sectors;
+ else
+ dev_start = start_disk_offset;
+
+ if (disk < end_disk_index)
+ dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
+ else if (disk > end_disk_index)
+ dev_end = last_stripe_index * mddev->chunk_sectors;
+ else
+ dev_end = end_disk_offset;
+
+ /*
+ * We only handle discard bios whose size is >= the stripe size, so
+ * dev_end > dev_start always holds.
+ */
+ if (r10_bio->devs[disk].bio) {
+ mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+ mbio->bi_end_io = raid10_end_discard_request;
+ mbio->bi_private = r10_bio;
+ r10_bio->devs[disk].bio = mbio;
+ r10_bio->devs[disk].devnum = disk;
+ atomic_inc(&r10_bio->remaining);
+ md_submit_discard_bio(mddev, rdev, mbio,
+ dev_start + choose_data_offset(r10_bio, rdev),
+ dev_end - dev_start);
+ bio_endio(mbio);
+ }
+ if (r10_bio->devs[disk].repl_bio) {
+ rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+ rbio->bi_end_io = raid10_end_discard_request;
+ rbio->bi_private = r10_bio;
+ r10_bio->devs[disk].repl_bio = rbio;
+ r10_bio->devs[disk].devnum = disk;
+ atomic_inc(&r10_bio->remaining);
+ md_submit_discard_bio(mddev, rrdev, rbio,
+ dev_start + choose_data_offset(r10_bio, rrdev),
+ dev_end - dev_start);
+ bio_endio(rbio);
+ }
+ }
+
+ if (!geo->far_offset && --far_copies) {
+ first_stripe_index += geo->stride >> geo->chunk_shift;
+ start_disk_offset += geo->stride;
+ last_stripe_index += geo->stride >> geo->chunk_shift;
+ end_disk_offset += geo->stride;
+ atomic_inc(&first_r10bio->remaining);
+ raid_end_discard_bio(r10_bio);
+ wait_barrier(conf);
+ goto retry_discard;
+ }
+
+ raid_end_discard_bio(r10_bio);
+
+ return 0;
+out:
+ allow_barrier(conf);
+ return -EAGAIN;
+}
+
static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
{
struct r10conf *conf = mddev->private;
@@ -1514,6 +1830,10 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
if (!md_write_start(mddev, bio))
return false;
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
+ if (!raid10_handle_discard(mddev, bio))
+ return true;
+
/*
* If this request crosses a chunk boundary, we need to split
* it.
@@ -3753,7 +4073,7 @@ static int raid10_run(struct mddev *mddev)
if (mddev->queue) {
blk_queue_max_discard_sectors(mddev->queue,
- mddev->chunk_sectors);
+ UINT_MAX);
blk_queue_max_write_same_sectors(mddev->queue, 0);
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
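
The address math in raid10_handle_discard is easiest to check with numbers. The sketch below reproduces the first_stripe_index / start_disk_index / start_disk_offset computation from the hunk above for one illustrative geometry (4 disks, near_copies = 2, 64 KiB chunks, i.e. chunk_shift = 7 in 512-byte sectors, no far offset); sector_div() is replaced by plain division since this runs in userspace:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            const unsigned raid_disks = 4, near_copies = 2;
            const unsigned chunk_shift = 7;
            const uint64_t chunk_mask = (1ULL << chunk_shift) - 1;
            uint64_t bio_start = 1024;      /* logical start sector */

            /* chunk index, scaled by near_copies, spread over the disks */
            uint64_t chunk = (bio_start >> chunk_shift) * near_copies;
            uint64_t first_stripe_index = chunk / raid_disks;
            unsigned start_disk_index = chunk % raid_disks;
            uint64_t start_disk_offset = (bio_start & chunk_mask) +
                                         (first_stripe_index << chunk_shift);

            printf("stripe %llu, first disk %u, disk offset %llu\n",
                   (unsigned long long)first_stripe_index,
                   start_disk_index,
                   (unsigned long long)start_disk_offset);
            /* -> stripe 4, first disk 0, disk offset 512 */
            return 0;
    }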
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 79cd2b7d3128..1461fd55311b 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -179,5 +179,6 @@ enum r10bio_state {
R10BIO_Previous,
/* failfast devices did receive failfast requests. */
R10BIO_FailFast,
+ R10BIO_Discard,
};
#endif