author    Linus Torvalds <torvalds@linux-foundation.org>  2023-07-04 04:48:38 +0300
committer Linus Torvalds <torvalds@linux-foundation.org>  2023-07-04 04:48:38 +0300
commit    e50df24979fd02f920aa7baada714a58bc61bfd9 (patch)
tree      b0f12af3e2cb40e1ce0f3d2c4736e6bbec30ffb8 /drivers/md
parent    4f52875366bfbd6ddc19c1045b603d853e0a889c (diff)
parent    3c2f765c81be1c85782ba09f492800a99f765a2c (diff)
download  linux-e50df24979fd02f920aa7baada714a58bc61bfd9.tar.xz
Merge tag 'block-6.5-2023-07-03' of git://git.kernel.dk/linux
Pull more block updates from Jens Axboe:
 "Mostly items that came in a bit late for the initial pull request,
  wanted to make sure they had the appropriate amount of linux-next soak
  before going upstream. Outside of stragglers, just generic fixes for
  either merge window items, or longer standing bugs"

* tag 'block-6.5-2023-07-03' of git://git.kernel.dk/linux: (25 commits)
  md/raid0: add discard support for the 'original' layout
  nvme: disable controller on reset state failure
  nvme: sync timeout work on failed reset
  nvme: ensure unquiesce on teardown
  cdrom/gdrom: Fix build error
  nvme: improved uring polling
  block: add request polling helper
  nvme-mpath: fix I/O failure with EAGAIN when failing over I/O
  nvme: host: fix command name spelling
  blk-sysfs: add a new attr_group for blk_mq
  blk-iocost: move wbt_enable/disable_default() out of spinlock
  blk-wbt: cleanup rwb_enabled() and wbt_disabled()
  blk-wbt: remove dead code to handle wbt enable/disable with io inflight
  blk-wbt: don't create wbt sysfs entry if CONFIG_BLK_WBT is disabled
  blk-mq: fix two misuses on RQF_USE_SCHED
  blk-throttle: Fix io statistics for cgroup v1
  bcache: Fix bcache device claiming
  bcache: Alloc holder object before async registration
  raid10: avoid spin_lock from fastpath from raid10_unplug()
  md: fix 'delete_mutex' deadlock
  ...
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/super.c  123
-rw-r--r--  drivers/md/md.c             32
-rw-r--r--  drivers/md/md.h              4
-rw-r--r--  drivers/md/raid0.c          62
-rw-r--r--  drivers/md/raid0.h           1
-rw-r--r--  drivers/md/raid1-10.c        2
-rw-r--r--  drivers/md/raid10.c          6
7 files changed, 130 insertions, 100 deletions
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e2a803683105..0ae2b3676293 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1369,7 +1369,7 @@ static void cached_dev_free(struct closure *cl)
put_page(virt_to_page(dc->sb_disk));
if (!IS_ERR_OR_NULL(dc->bdev))
- blkdev_put(dc->bdev, bcache_kobj);
+ blkdev_put(dc->bdev, dc);
wake_up(&unregister_wait);
@@ -1453,7 +1453,6 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
memcpy(&dc->sb, sb, sizeof(struct cache_sb));
dc->bdev = bdev;
- dc->bdev->bd_holder = dc;
dc->sb_disk = sb_disk;
if (cached_dev_init(dc, sb->block_size << 9))
@@ -2218,7 +2217,7 @@ void bch_cache_release(struct kobject *kobj)
put_page(virt_to_page(ca->sb_disk));
if (!IS_ERR_OR_NULL(ca->bdev))
- blkdev_put(ca->bdev, bcache_kobj);
+ blkdev_put(ca->bdev, ca);
kfree(ca);
module_put(THIS_MODULE);
@@ -2345,7 +2344,6 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
memcpy(&ca->sb, sb, sizeof(struct cache_sb));
ca->bdev = bdev;
- ca->bdev->bd_holder = ca;
ca->sb_disk = sb_disk;
if (bdev_max_discard_sectors((bdev)))
@@ -2359,7 +2357,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
* call blkdev_put() to bdev in bch_cache_release(). So we
* explicitly call blkdev_put() here.
*/
- blkdev_put(bdev, bcache_kobj);
+ blkdev_put(bdev, ca);
if (ret == -ENOMEM)
err = "cache_alloc(): -ENOMEM";
else if (ret == -EPERM)
@@ -2448,6 +2446,7 @@ struct async_reg_args {
struct cache_sb *sb;
struct cache_sb_disk *sb_disk;
struct block_device *bdev;
+ void *holder;
};
static void register_bdev_worker(struct work_struct *work)
@@ -2455,22 +2454,13 @@ static void register_bdev_worker(struct work_struct *work)
int fail = false;
struct async_reg_args *args =
container_of(work, struct async_reg_args, reg_work.work);
- struct cached_dev *dc;
-
- dc = kzalloc(sizeof(*dc), GFP_KERNEL);
- if (!dc) {
- fail = true;
- put_page(virt_to_page(args->sb_disk));
- blkdev_put(args->bdev, bcache_kobj);
- goto out;
- }
mutex_lock(&bch_register_lock);
- if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0)
+ if (register_bdev(args->sb, args->sb_disk, args->bdev, args->holder)
+ < 0)
fail = true;
mutex_unlock(&bch_register_lock);
-out:
if (fail)
pr_info("error %s: fail to register backing device\n",
args->path);
@@ -2485,21 +2475,11 @@ static void register_cache_worker(struct work_struct *work)
int fail = false;
struct async_reg_args *args =
container_of(work, struct async_reg_args, reg_work.work);
- struct cache *ca;
-
- ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- if (!ca) {
- fail = true;
- put_page(virt_to_page(args->sb_disk));
- blkdev_put(args->bdev, bcache_kobj);
- goto out;
- }
/* blkdev_put() will be called in bch_cache_release() */
- if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0)
+ if (register_cache(args->sb, args->sb_disk, args->bdev, args->holder))
fail = true;
-out:
if (fail)
pr_info("error %s: fail to register cache device\n",
args->path);
@@ -2520,6 +2500,13 @@ static void register_device_async(struct async_reg_args *args)
queue_delayed_work(system_wq, &args->reg_work, 10);
}
+static void *alloc_holder_object(struct cache_sb *sb)
+{
+ if (SB_IS_BDEV(sb))
+ return kzalloc(sizeof(struct cached_dev), GFP_KERNEL);
+ return kzalloc(sizeof(struct cache), GFP_KERNEL);
+}
+
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
const char *buffer, size_t size)
{
@@ -2527,9 +2514,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
char *path = NULL;
struct cache_sb *sb;
struct cache_sb_disk *sb_disk;
- struct block_device *bdev;
+ struct block_device *bdev, *bdev2;
+ void *holder = NULL;
ssize_t ret;
bool async_registration = false;
+ bool quiet = false;
#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
async_registration = true;
@@ -2558,10 +2547,34 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
ret = -EINVAL;
err = "failed to open device";
- bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ | BLK_OPEN_WRITE,
- bcache_kobj, NULL);
+ bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ, NULL, NULL);
+ if (IS_ERR(bdev))
+ goto out_free_sb;
+
+ err = "failed to set blocksize";
+ if (set_blocksize(bdev, 4096))
+ goto out_blkdev_put;
+
+ err = read_super(sb, bdev, &sb_disk);
+ if (err)
+ goto out_blkdev_put;
+
+ holder = alloc_holder_object(sb);
+ if (!holder) {
+ ret = -ENOMEM;
+ err = "cannot allocate memory";
+ goto out_put_sb_page;
+ }
+
+ /* Now reopen in exclusive mode with proper holder */
+ bdev2 = blkdev_get_by_dev(bdev->bd_dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ holder, NULL);
+ blkdev_put(bdev, NULL);
+ bdev = bdev2;
if (IS_ERR(bdev)) {
- if (bdev == ERR_PTR(-EBUSY)) {
+ ret = PTR_ERR(bdev);
+ bdev = NULL;
+ if (ret == -EBUSY) {
dev_t dev;
mutex_lock(&bch_register_lock);
@@ -2571,20 +2584,14 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
else
err = "device busy";
mutex_unlock(&bch_register_lock);
- if (attr == &ksysfs_register_quiet)
- goto done;
+ if (attr == &ksysfs_register_quiet) {
+ quiet = true;
+ ret = size;
+ }
}
- goto out_free_sb;
+ goto out_free_holder;
}
- err = "failed to set blocksize";
- if (set_blocksize(bdev, 4096))
- goto out_blkdev_put;
-
- err = read_super(sb, bdev, &sb_disk);
- if (err)
- goto out_blkdev_put;
-
err = "failed to register device";
if (async_registration) {
@@ -2595,59 +2602,46 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (!args) {
ret = -ENOMEM;
err = "cannot allocate memory";
- goto out_put_sb_page;
+ goto out_free_holder;
}
args->path = path;
args->sb = sb;
args->sb_disk = sb_disk;
args->bdev = bdev;
+ args->holder = holder;
register_device_async(args);
/* No wait and returns to user space */
goto async_done;
}
if (SB_IS_BDEV(sb)) {
- struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
-
- if (!dc) {
- ret = -ENOMEM;
- err = "cannot allocate memory";
- goto out_put_sb_page;
- }
-
mutex_lock(&bch_register_lock);
- ret = register_bdev(sb, sb_disk, bdev, dc);
+ ret = register_bdev(sb, sb_disk, bdev, holder);
mutex_unlock(&bch_register_lock);
/* blkdev_put() will be called in cached_dev_free() */
if (ret < 0)
goto out_free_sb;
} else {
- struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-
- if (!ca) {
- ret = -ENOMEM;
- err = "cannot allocate memory";
- goto out_put_sb_page;
- }
-
/* blkdev_put() will be called in bch_cache_release() */
- ret = register_cache(sb, sb_disk, bdev, ca);
+ ret = register_cache(sb, sb_disk, bdev, holder);
if (ret)
goto out_free_sb;
}
-done:
kfree(sb);
kfree(path);
module_put(THIS_MODULE);
async_done:
return size;
+out_free_holder:
+ kfree(holder);
out_put_sb_page:
put_page(virt_to_page(sb_disk));
out_blkdev_put:
- blkdev_put(bdev, register_bcache);
+ if (bdev)
+ blkdev_put(bdev, holder);
out_free_sb:
kfree(sb);
out_free_path:
@@ -2656,7 +2650,8 @@ out_free_path:
out_module_put:
module_put(THIS_MODULE);
out:
- pr_info("error %s: %s\n", path?path:"", err);
+ if (!quiet)
+ pr_info("error %s: %s\n", path?path:"", err);
return ret;
}
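
The register_bcache() rework above follows a probe-then-claim pattern: the
device is first opened read-only just to read the superblock, a holder object
of the type that superblock calls for is allocated, and only then is the
device reopened read-write and claimed exclusively with that holder. Below is
a minimal userspace sketch of the same idea; flock() stands in for the
exclusive claim, and every name in it (cached_dev_stub, cache_stub,
alloc_holder, sb_is_bdev) is invented for the demo, not kernel API.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/file.h>
#include <unistd.h>

struct cached_dev_stub { char pad[128]; };	/* stands in for struct cached_dev */
struct cache_stub      { char pad[64]; };	/* stands in for struct cache */

/* Toy superblock: the first byte decides which holder type we need. */
static int sb_is_bdev(const unsigned char *sb) { return sb[0] == 'B'; }

static void *alloc_holder(const unsigned char *sb)
{
	if (sb_is_bdev(sb))
		return calloc(1, sizeof(struct cached_dev_stub));
	return calloc(1, sizeof(struct cache_stub));
}

int main(int argc, char **argv)
{
	unsigned char sb[8] = { 0 };
	void *holder;
	int fd;

	if (argc < 2)
		return 1;

	/* Stage 1: read-only probe, no exclusive claim yet. */
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (pread(fd, sb, sizeof(sb), 0) != (ssize_t)sizeof(sb)) {
		close(fd);
		return 1;
	}

	/* Holder type depends on what the superblock says. */
	holder = alloc_holder(sb);
	if (!holder) {
		close(fd);
		return 1;
	}

	/* Stage 2: claim exclusively now that the holder exists. */
	if (flock(fd, LOCK_EX | LOCK_NB) < 0) {
		perror("flock");	/* the "device busy" path */
		free(holder);
		close(fd);
		return 1;
	}

	printf("claimed %s as a %s device\n", argv[1],
	       sb_is_bdev(sb) ? "backing" : "cache");
	flock(fd, LOCK_UN);
	free(holder);
	close(fd);
	return 0;
}

Allocating the holder before the exclusive open is the point of "bcache:
Alloc holder object before async registration": the claim is made with the
real holder from the start, instead of claiming first and patching
bd_holder afterwards.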
diff --git a/drivers/md/md.c b/drivers/md/md.c
index cf3733c90c47..2e38ef421d69 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -643,7 +643,6 @@ void mddev_init(struct mddev *mddev)
{
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
- mutex_init(&mddev->delete_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
@@ -749,26 +748,15 @@ static void mddev_free(struct mddev *mddev)
static const struct attribute_group md_redundancy_group;
-static void md_free_rdev(struct mddev *mddev)
+void mddev_unlock(struct mddev *mddev)
{
struct md_rdev *rdev;
struct md_rdev *tmp;
+ LIST_HEAD(delete);
- mutex_lock(&mddev->delete_mutex);
- if (list_empty(&mddev->deleting))
- goto out;
-
- list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) {
- list_del_init(&rdev->same_set);
- kobject_del(&rdev->kobj);
- export_rdev(rdev, mddev);
- }
-out:
- mutex_unlock(&mddev->delete_mutex);
-}
+ if (!list_empty(&mddev->deleting))
+ list_splice_init(&mddev->deleting, &delete);
-void mddev_unlock(struct mddev *mddev)
-{
if (mddev->to_remove) {
/* These cannot be removed under reconfig_mutex as
* an access to the files will try to take reconfig_mutex
@@ -808,7 +796,11 @@ void mddev_unlock(struct mddev *mddev)
} else
mutex_unlock(&mddev->reconfig_mutex);
- md_free_rdev(mddev);
+ list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
+ list_del_init(&rdev->same_set);
+ kobject_del(&rdev->kobj);
+ export_rdev(rdev, mddev);
+ }
md_wakeup_thread(mddev->thread);
wake_up(&mddev->sb_wait);
@@ -2458,7 +2450,7 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
if (test_bit(AutoDetected, &rdev->flags))
md_autodetect_dev(rdev->bdev->bd_dev);
#endif
- blkdev_put(rdev->bdev, mddev->major_version == -2 ? &claim_rdev : rdev);
+ blkdev_put(rdev->bdev, mddev->external ? &claim_rdev : rdev);
rdev->bdev = NULL;
kobject_put(&rdev->kobj);
}
@@ -2488,9 +2480,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
* reconfig_mutex is held, hence it can't be called under
* reconfig_mutex and it's delayed to mddev_unlock().
*/
- mutex_lock(&mddev->delete_mutex);
list_add(&rdev->same_set, &mddev->deleting);
- mutex_unlock(&mddev->delete_mutex);
}
static void export_array(struct mddev *mddev)
@@ -6140,7 +6130,7 @@ static void md_clean(struct mddev *mddev)
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->reshape_position = MaxSector;
- mddev->external = 0;
+ /* we still need mddev->external in export_rdev, do not clear it yet */
mddev->persistent = 0;
mddev->level = LEVEL_NONE;
mddev->clevel[0] = 0;
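
The 'delete_mutex' fix above replaces a dedicated mutex with the classic
splice-under-lock pattern: mddev_unlock() detaches mddev->deleting onto a
local list head while reconfig_mutex is still held, then exports the rdevs
after the unlock, so the teardown can never recurse into the lock. A
self-contained userspace sketch of that pattern, assuming nothing beyond
pthreads (all names here are made up for the demo):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *deleting;	/* plays the role of mddev->deleting */

static void defer_delete(int id)	/* like md_kick_rdev_from_array() */
{
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return;
	n->id = id;
	pthread_mutex_lock(&lock);
	n->next = deleting;		/* queue under the lock */
	deleting = n;
	pthread_mutex_unlock(&lock);
}

static void unlock_and_reap(void)	/* like the new mddev_unlock() */
{
	struct node *batch;

	pthread_mutex_lock(&lock);
	batch = deleting;		/* list_splice_init() equivalent */
	deleting = NULL;
	pthread_mutex_unlock(&lock);

	/* Teardown runs outside the lock, so it can sleep or take other
	 * locks without the deadlock the commit removes. */
	while (batch) {
		struct node *next = batch->next;

		printf("exporting rdev %d\n", batch->id);
		free(batch);
		batch = next;
	}
}

int main(void)
{
	defer_delete(1);
	defer_delete(2);
	unlock_and_reap();
	return 0;
}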
diff --git a/drivers/md/md.h b/drivers/md/md.h
index bfd2306bc750..1aef86bf3fc3 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -531,11 +531,9 @@ struct mddev {
/*
* Temporarily store rdev that will be finally removed when
- * reconfig_mutex is unlocked.
+ * reconfig_mutex is unlocked, protected by reconfig_mutex.
*/
struct list_head deleting;
- /* Protect the deleting list */
- struct mutex delete_mutex;
bool has_superblocks:1;
bool fail_last_dev:1;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index f8ee9a95e25d..d1ac73fcd852 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -270,6 +270,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
goto abort;
}
+ if (conf->layout == RAID0_ORIG_LAYOUT) {
+ for (i = 1; i < conf->nr_strip_zones; i++) {
+ sector_t first_sector = conf->strip_zone[i-1].zone_end;
+
+ sector_div(first_sector, mddev->chunk_sectors);
+ zone = conf->strip_zone + i;
+ /* disk_shift is first disk index used in the zone */
+ zone->disk_shift = sector_div(first_sector,
+ zone->nb_dev);
+ }
+ }
+
pr_debug("md/raid0:%s: done.\n", mdname(mddev));
*private_conf = conf;
@@ -431,6 +443,20 @@ exit_acct_set:
return ret;
}
+/*
+ * Convert disk_index to the disk order in which it is read/written.
+ * For example, if we have 4 disks, they are numbered 0,1,2,3. If we
+ * write the disks starting at disk 3, then the read/write order would
+ * be disk 3, then 0, then 1, and then disk 2 and we want map_disk_shift()
+ * to map the disks as follows 0,1,2,3 => 1,2,3,0. So disk 0 would map
+ * to 1, 1 to 2, 2 to 3, and 3 to 0. That way we can compare disks in
+ * that 'output' space to understand the read/write disk ordering.
+ */
+static int map_disk_shift(int disk_index, int num_disks, int disk_shift)
+{
+ return ((disk_index + num_disks - disk_shift) % num_disks);
+}
+
static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
{
struct r0conf *conf = mddev->private;
@@ -444,7 +470,9 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
sector_t end_disk_offset;
unsigned int end_disk_index;
unsigned int disk;
+ sector_t orig_start, orig_end;
+ orig_start = start;
zone = find_zone(conf, &start);
if (bio_end_sector(bio) > zone->zone_end) {
@@ -458,6 +486,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
} else
end = bio_end_sector(bio);
+ orig_end = end;
if (zone != conf->strip_zone)
end = end - zone[-1].zone_end;
@@ -469,13 +498,26 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
last_stripe_index = end;
sector_div(last_stripe_index, stripe_size);
- start_disk_index = (int)(start - first_stripe_index * stripe_size) /
- mddev->chunk_sectors;
+ /* In the first zone the original and alternate layouts are the same */
+ if ((conf->layout == RAID0_ORIG_LAYOUT) && (zone != conf->strip_zone)) {
+ sector_div(orig_start, mddev->chunk_sectors);
+ start_disk_index = sector_div(orig_start, zone->nb_dev);
+ start_disk_index = map_disk_shift(start_disk_index,
+ zone->nb_dev,
+ zone->disk_shift);
+ sector_div(orig_end, mddev->chunk_sectors);
+ end_disk_index = sector_div(orig_end, zone->nb_dev);
+ end_disk_index = map_disk_shift(end_disk_index,
+ zone->nb_dev, zone->disk_shift);
+ } else {
+ start_disk_index = (int)(start - first_stripe_index * stripe_size) /
+ mddev->chunk_sectors;
+ end_disk_index = (int)(end - last_stripe_index * stripe_size) /
+ mddev->chunk_sectors;
+ }
start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
mddev->chunk_sectors) +
first_stripe_index * mddev->chunk_sectors;
- end_disk_index = (int)(end - last_stripe_index * stripe_size) /
- mddev->chunk_sectors;
end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
mddev->chunk_sectors) +
last_stripe_index * mddev->chunk_sectors;
@@ -483,18 +525,22 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
for (disk = 0; disk < zone->nb_dev; disk++) {
sector_t dev_start, dev_end;
struct md_rdev *rdev;
+ int compare_disk;
+
+ compare_disk = map_disk_shift(disk, zone->nb_dev,
+ zone->disk_shift);
- if (disk < start_disk_index)
+ if (compare_disk < start_disk_index)
dev_start = (first_stripe_index + 1) *
mddev->chunk_sectors;
- else if (disk > start_disk_index)
+ else if (compare_disk > start_disk_index)
dev_start = first_stripe_index * mddev->chunk_sectors;
else
dev_start = start_disk_offset;
- if (disk < end_disk_index)
+ if (compare_disk < end_disk_index)
dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
- else if (disk > end_disk_index)
+ else if (compare_disk > end_disk_index)
dev_end = last_stripe_index * mddev->chunk_sectors;
else
dev_end = end_disk_offset;
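
The mapping described in the map_disk_shift() comment can be checked with a
few lines of plain C; this standalone program reproduces the 4-disk,
disk_shift = 3 example from the comment:

#include <stdio.h>

static int map_disk_shift(int disk_index, int num_disks, int disk_shift)
{
	return (disk_index + num_disks - disk_shift) % num_disks;
}

int main(void)
{
	int num_disks = 4, disk_shift = 3;
	int disk;

	for (disk = 0; disk < num_disks; disk++)
		printf("disk %d -> order %d\n", disk,
		       map_disk_shift(disk, num_disks, disk_shift));
	/* Prints 0->1, 1->2, 2->3, 3->0: disk 3 maps to 0, i.e. it is
	 * written first, matching the comment above. */
	return 0;
}

Comparing compare_disk rather than disk against start_disk_index and
end_disk_index in raid0_handle_discard() is what lets the same
before/at/after logic work for both RAID0 layouts.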
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 3816e5477db1..8cc761ca7423 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -6,6 +6,7 @@ struct strip_zone {
sector_t zone_end; /* Start of the next zone (in sectors) */
sector_t dev_start; /* Zone offset in real dev (in sectors) */
int nb_dev; /* # of devices attached to the zone */
+ int disk_shift; /* start disk for the original layout */
};
/* Linux 3.14 (20d0189b101) made an unintended change to
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 169ebe296f2d..3f22edec70e7 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -116,7 +116,7 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp,
static inline void raid1_submit_write(struct bio *bio)
{
- struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev;
+ struct md_rdev *rdev = (void *)bio->bi_bdev;
bio->bi_next = NULL;
bio_set_dev(bio, rdev->bdev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d0de8c9fb3cf..5051149e27bb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -325,7 +325,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
bio->bi_status = BLK_STS_IOERR;
- if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
+ if (r10_bio->start_time)
bio_end_io_acct(bio, r10_bio->start_time);
bio_endio(bio);
/*
@@ -1118,7 +1118,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->pending_bio_list, &plug->pending);
spin_unlock_irq(&conf->device_lock);
- wake_up(&conf->wait_barrier);
+ wake_up_barrier(conf);
md_wakeup_thread(mddev->thread);
kfree(plug);
return;
@@ -1127,7 +1127,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
/* we aren't scheduling, so we can do the write-out directly. */
bio = bio_list_get(&plug->pending);
raid1_prepare_flush_writes(mddev->bitmap);
- wake_up(&conf->wait_barrier);
+ wake_up_barrier(conf);
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
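
The wake_up_barrier() calls above come from "raid10: avoid spin_lock from
fastpath from raid10_unplug()": the helper presumably skips the wakeup (and
the wait-queue spinlock) when nothing is sleeping on the barrier. A small
userspace sketch of that conditional-wakeup idea, with invented names and a
pthread condition variable in place of the kernel wait queue:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static atomic_int nr_waiters;	/* bumped by the (omitted) wait side */

static void wake_up_barrier_sketch(void)
{
	/* Fast path: no sleepers, return without touching the lock;
	 * this is the role wq_has_sleeper() plays on the kernel side. */
	if (atomic_load(&nr_waiters) == 0)
		return;

	pthread_mutex_lock(&lock);
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	wake_up_barrier_sketch();	/* no waiters: no lock taken */
	puts("wakeup skipped on the fast path");
	return 0;
}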