From d0dd594bedc57f9be2af2af170bf56f9c3f2376e Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 15 Jun 2023 02:49:03 -0700 Subject: nvme: Print capabilities changes just once This current dev_info() could be very verbose and being printed very frequently depending on some userspace application sending some specific commands. Just print this message once and skip it until the controller resets. Use a controller flag (NVME_CTRL_DIRTY_CAPABILITY) to track if the capability needs a reset. Signed-off-by: Breno Leitao Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 6 +++++- drivers/nvme/host/nvme.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 76e8f8b4098e..4b7f9edab5e8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1125,8 +1125,11 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, mutex_unlock(&ctrl->scan_lock); } if (effects & NVME_CMD_EFFECTS_CCC) { - dev_info(ctrl->device, + if (!test_and_set_bit(NVME_CTRL_DIRTY_CAPABILITY, + &ctrl->flags)) { + dev_info(ctrl->device, "controller capabilities changed, reset may be required to take effect.\n"); + } } if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { nvme_queue_scan(ctrl); @@ -3280,6 +3283,7 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended) return ret; } + clear_bit(NVME_CTRL_DIRTY_CAPABILITY, &ctrl->flags); ctrl->identified = true; return 0; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 78308f15e090..f3182134487a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -243,6 +243,7 @@ enum nvme_ctrl_flags { NVME_CTRL_STARTED_ONCE = 2, NVME_CTRL_STOPPED = 3, NVME_CTRL_SKIP_ID_CNS_CS = 4, + NVME_CTRL_DIRTY_CAPABILITY = 5, }; struct nvme_ctrl { -- cgit v1.2.3 From 9d16d264775b9a10f3f5b5db768d7f51294b2a63 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 27 Apr 2023 21:47:45 +0200 Subject: nvmet: Reorder fields in 'struct nvmet_ns' Group some variables based on their sizes to reduce holes. On x86_64, this shrinks the size of 'struct nvmet_ns' from 520 to 512 bytes. When such a structure is allocated in nvmet_ns_alloc(), because of the way memory allocation works, when 520 bytes were requested, 1024 bytes were allocated. So, on x86_64, this change saves 512 bytes per allocation. Signed-off-by: Christophe JAILLET Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Jens Axboe Signed-off-by: Keith Busch --- drivers/nvme/target/nvmet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 6cf723bc664e..8cfd60f3b564 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -79,8 +79,8 @@ struct nvmet_ns { struct completion disable_done; mempool_t *bvec_pool; - int use_p2pmem; struct pci_dev *p2p_dev; + int use_p2pmem; int pi_type; int metadata_size; u8 csi; -- cgit v1.2.3 From 125bfc7cd750e68c99f1d446e2c22abea08c237f Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 9 Jun 2023 17:43:20 +0800 Subject: md/raid10: fix the condition to call bio_end_io_acct() /sys/block/[device]/queue/iostats is used to control whether to count io stat. Write 0 to it will clear queue_flags QUEUE_FLAG_IO_STAT which means iostats is disabled. If we disable iostats and later endable it, the io issued during this period will be counted incorrectly, inflight will be decreased to -1. //T1 set iostats echo 0 > /sys/block/md0/queue/iostats clear QUEUE_FLAG_IO_STAT //T2 issue io if (QUEUE_FLAG_IO_STAT) -> false bio_start_io_acct inflight++ echo 1 > /sys/block/md0/queue/iostats set QUEUE_FLAG_IO_STAT //T3 io end if (QUEUE_FLAG_IO_STAT) -> true bio_end_io_acct inflight-- -> -1 Also, if iostats is enabled while issuing io but disabled while io end, inflight will never be decreased. Fix it by checking start_time when io end. If start_time is not 0, call bio_end_io_acct(). Fixes: 528bc2cf2fcc ("md/raid10: enable io accounting") Signed-off-by: Li Nan Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230609094320.2397604-1-linan666@huaweicloud.com --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d0de8c9fb3cf..79067769e44b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -325,7 +325,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio) if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) bio->bi_status = BLK_STS_IOERR; - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + if (r10_bio->start_time) bio_end_io_acct(bio, r10_bio->start_time); bio_endio(bio); /* -- cgit v1.2.3 From b5a99602b74bbfa655be509c615181dd95b0719e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 16 Jun 2023 09:21:36 +0800 Subject: md/raid1-10: fix casting from randomized structure in raid1_submit_write() Following build error triggered while build with clang version 17.0.0 with W=1(this can't be reporduced with gcc 13.1.0): drivers/md/raid1-10.c:117:25: error: casting from randomized structure pointer type 'struct block_device *' to 'struct md_rdev *' 117 | struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev; | ^ Fix this by casting 'bio->bi_bdev' to 'void *', as it used to be. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306142042.fmjfmTF8-lkp@intel.com/ Fixes: 8295efbe68c0 ("md/raid1-10: factor out a helper to submit normal write") Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230616012136.3047071-1-yukuai1@huaweicloud.com --- drivers/md/raid1-10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 169ebe296f2d..3f22edec70e7 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -116,7 +116,7 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp, static inline void raid1_submit_write(struct bio *bio) { - struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev; + struct md_rdev *rdev = (void *)bio->bi_bdev; bio->bi_next = NULL; bio_set_dev(bio, rdev->bdev); -- cgit v1.2.3 From a1d7671910965ca9f8f0377e7e3bfd1179fba4d8 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 16 Jun 2023 22:24:04 -0700 Subject: md: use mddev->external to select holder in export_rdev() mdadm test "10ddf-create-fail-rebuild" triggers warnings like the following [ 215.526357] ------------[ cut here ]------------ [ 215.527243] WARNING: CPU: 18 PID: 1264 at block/bdev.c:617 blkdev_put+0x269/0x350 [ 215.528334] Modules linked in: [ 215.528806] CPU: 18 PID: 1264 Comm: mdmon Not tainted 6.4.0-rc2+ #768 [ 215.529863] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS [ 215.531464] RIP: 0010:blkdev_put+0x269/0x350 [ 215.532167] Code: ff ff 49 8d 7d 10 e8 56 bf b8 ff 4d 8b 65 10 49 8d bc 24 58 05 00 00 e8 05 be b8 ff 41 83 ac 24 58 05 00 00 01 e9 44 ff ff ff <0f> 0b e9 52 fe ff ff 0f 0b e9 6b fe ff ff1 [ 215.534780] RSP: 0018:ffffc900040bfbf0 EFLAGS: 00010283 [ 215.535635] RAX: ffff888174001000 RBX: ffff88810b1c3b00 RCX: ffffffff819a4061 [ 215.536645] RDX: dffffc0000000000 RSI: dffffc0000000000 RDI: ffff88810b1c3ba0 [ 215.537657] RBP: ffff88810dbde800 R08: fffffbfff0fca983 R09: fffffbfff0fca983 [ 215.538674] R10: ffffc900040bfbf0 R11: fffffbfff0fca982 R12: ffff88810b1c3b38 [ 215.539687] R13: ffff88810b1c3b10 R14: ffff88810dbdecb8 R15: ffff88810b1c3b00 [ 215.540833] FS: 00007f2aabdff700(0000) GS:ffff888dfb400000(0000) knlGS:0000000000000000 [ 215.541961] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 215.542775] CR2: 00007fa19a85d934 CR3: 000000010c076006 CR4: 0000000000370ee0 [ 215.543814] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 215.544840] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 215.545885] Call Trace: [ 215.546257] [ 215.546608] export_rdev.isra.63+0x71/0xe0 [ 215.547338] mddev_unlock+0x1b1/0x2d0 [ 215.547898] array_state_store+0x28d/0x450 [ 215.548519] md_attr_store+0xd7/0x150 [ 215.549059] ? __pfx_sysfs_kf_write+0x10/0x10 [ 215.549702] kernfs_fop_write_iter+0x1b9/0x260 [ 215.550351] vfs_write+0x491/0x760 [ 215.550863] ? __pfx_vfs_write+0x10/0x10 [ 215.551445] ? __fget_files+0x156/0x230 [ 215.552053] ksys_write+0xc0/0x160 [ 215.552570] ? __pfx_ksys_write+0x10/0x10 [ 215.553141] ? ktime_get_coarse_real_ts64+0xec/0x100 [ 215.553878] do_syscall_64+0x3a/0x90 [ 215.554403] entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 215.555125] RIP: 0033:0x7f2aade11847 [ 215.555696] Code: c3 66 90 41 54 49 89 d4 55 48 89 f5 53 89 fb 48 83 ec 10 e8 1b fd ff ff 4c 89 e2 48 89 ee 89 df 41 89 c0 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 35 44 89 c7 48 89 448 [ 215.558398] RSP: 002b:00007f2aabdfeba0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 [ 215.559516] RAX: ffffffffffffffda RBX: 0000000000000010 RCX: 00007f2aade11847 [ 215.560515] RDX: 0000000000000005 RSI: 0000000000438b8b RDI: 0000000000000010 [ 215.561512] RBP: 0000000000438b8b R08: 0000000000000000 R09: 00007f2aaecf0060 [ 215.562511] R10: 000000000e3ba40b R11: 0000000000000293 R12: 0000000000000005 [ 215.563647] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000c70750 [ 215.564693] [ 215.565029] irq event stamp: 15979 [ 215.565584] hardirqs last enabled at (15991): [] __up_console_sem+0x52/0x60 [ 215.566806] hardirqs last disabled at (16000): [] __up_console_sem+0x37/0x60 [ 215.568022] softirqs last enabled at (15716): [] __do_softirq+0x3eb/0x531 [ 215.569239] softirqs last disabled at (15711): [] irq_exit_rcu+0x115/0x160 [ 215.570434] ---[ end trace 0000000000000000 ]--- This means export_rdev() calls blkdev_put with a different holder than the one used by blkdev_get_by_dev(). This is because mddev->major_version == -2 is not a good check for external metadata. Fix this by using mddev->external instead. Also, do not clear mddev->external in md_clean(), as the flag might be used later in export_rdev(). Fixes: 2736e8eeb0cc ("block: use the holder as indication for exclusive opens") Cc: Christoph Hellwig Cc: Jens Axboe Signed-off-by: Song Liu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230617052405.305871-1-song@kernel.org --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index cf3733c90c47..8e7cc2e69bc9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2458,7 +2458,7 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); #endif - blkdev_put(rdev->bdev, mddev->major_version == -2 ? &claim_rdev : rdev); + blkdev_put(rdev->bdev, mddev->external ? &claim_rdev : rdev); rdev->bdev = NULL; kobject_put(&rdev->kobj); } @@ -6140,7 +6140,7 @@ static void md_clean(struct mddev *mddev) mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; - mddev->external = 0; + /* we still need mddev->external in export_rdev, do not clear it yet */ mddev->persistent = 0; mddev->level = LEVEL_NONE; mddev->clevel[0] = 0; -- cgit v1.2.3 From 4934b6401a812f9fe368e7d2d091cd1d120ea262 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 21 Jun 2023 22:29:33 +0800 Subject: md: fix 'delete_mutex' deadlock Commit 3ce94ce5d05a ("md: fix duplicate filename for rdev") introduce a new lock 'delete_mutex', and trigger a new deadlock: t1: remove rdev t2: sysfs writer rdev_attr_store rdev_attr_store mddev_lock state_store md_kick_rdev_from_array lock delete_mutex list_add mddev->deleting unlock delete_mutex mddev_unlock mddev_lock ... lock delete_mutex kobject_del // wait for sysfs writers to be done mddev_unlock lock delete_mutex // wait for delete_mutex, deadlock 'delete_mutex' is used to protect the list 'mddev->deleting', turns out that this list can be protected by 'reconfig_mutex' directly, and this lock can be removed. Fix this problem by removing the lock, and use 'reconfig_mutex' to protect the list. mddev_unlock() will move this list to a local list to be handled after 'reconfig_mutex' is dropped. Fixes: 3ce94ce5d05a ("md: fix duplicate filename for rdev") Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621142933.1395629-1-yukuai1@huaweicloud.com --- drivers/md/md.c | 28 +++++++++------------------- drivers/md/md.h | 4 +--- 2 files changed, 10 insertions(+), 22 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8e7cc2e69bc9..2e38ef421d69 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -643,7 +643,6 @@ void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); - mutex_init(&mddev->delete_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); @@ -749,26 +748,15 @@ static void mddev_free(struct mddev *mddev) static const struct attribute_group md_redundancy_group; -static void md_free_rdev(struct mddev *mddev) +void mddev_unlock(struct mddev *mddev) { struct md_rdev *rdev; struct md_rdev *tmp; + LIST_HEAD(delete); - mutex_lock(&mddev->delete_mutex); - if (list_empty(&mddev->deleting)) - goto out; + if (!list_empty(&mddev->deleting)) + list_splice_init(&mddev->deleting, &delete); - list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) { - list_del_init(&rdev->same_set); - kobject_del(&rdev->kobj); - export_rdev(rdev, mddev); - } -out: - mutex_unlock(&mddev->delete_mutex); -} - -void mddev_unlock(struct mddev *mddev) -{ if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as * an access to the files will try to take reconfig_mutex @@ -808,7 +796,11 @@ void mddev_unlock(struct mddev *mddev) } else mutex_unlock(&mddev->reconfig_mutex); - md_free_rdev(mddev); + list_for_each_entry_safe(rdev, tmp, &delete, same_set) { + list_del_init(&rdev->same_set); + kobject_del(&rdev->kobj); + export_rdev(rdev, mddev); + } md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); @@ -2488,9 +2480,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) * reconfig_mutex is held, hence it can't be called under * reconfig_mutex and it's delayed to mddev_unlock(). */ - mutex_lock(&mddev->delete_mutex); list_add(&rdev->same_set, &mddev->deleting); - mutex_unlock(&mddev->delete_mutex); } static void export_array(struct mddev *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index bfd2306bc750..1aef86bf3fc3 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -531,11 +531,9 @@ struct mddev { /* * Temporarily store rdev that will be finally removed when - * reconfig_mutex is unlocked. + * reconfig_mutex is unlocked, protected by reconfig_mutex. */ struct list_head deleting; - /* Protect the deleting list */ - struct mutex delete_mutex; bool has_superblocks:1; bool fail_last_dev:1; -- cgit v1.2.3 From a8d5fdd4d2702d0c7ec125bd3bbce3fc589afa67 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 21 Jun 2023 18:57:28 +0800 Subject: raid10: avoid spin_lock from fastpath from raid10_unplug() Commit 0c0be98bbe67 ("md/raid10: prevent unnecessary calls to wake_up() in fast path") missed one place, for example, with: fio -direct=1 -rw=write/randwrite -iodepth=1 ... Plug and unplug are called for each io, then wake_up() from raid10_unplug() will cause lock contention as well. Avoid this contention by using wake_up_barrier() instead of wake_up(), where spin_lock is not held if waitqueue is empty. Fio test script: [global] name=random reads and writes ioengine=libaio direct=1 readwrite=randrw rwmixread=70 iodepth=64 buffered=0 filename=/dev/md0 size=1G runtime=30 time_based randrepeat=0 norandommap refill_buffers ramp_time=10 bs=4k numjobs=400 group_reporting=1 [job1] Test result with ramdisk raid10(By Ali): Before this patch With this patch READ IOPS=2033k IOPS=3642k WRITE IOPS=871k IOPS=1561K By the way, in this scenario, blk_plug_cb() will be allocated and freed for each io, this seems need to be optimized as well. Reported-and-tested-by: Ali Gholami Rudi Closes: https://lore.kernel.org/all/20231606122233@laper.mirepesht/ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621105728.1268542-1-yukuai1@huaweicloud.com --- drivers/md/raid10.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 79067769e44b..5051149e27bb 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1118,7 +1118,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); md_wakeup_thread(mddev->thread); kfree(plug); return; @@ -1127,7 +1127,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) /* we aren't scheduling, so we can do the write-out directly. */ bio = bio_list_get(&plug->pending); raid1_prepare_flush_writes(mddev->bitmap); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; -- cgit v1.2.3 From abcc0cbd49283fccd20420e86416b2475b00819c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Jun 2023 18:46:54 +0200 Subject: bcache: Alloc holder object before async registration Allocate holder object (cache or cached_dev) before offloading the rest of the startup to async work. This will allow us to open the block block device with proper holder. Signed-off-by: Jan Kara Acked-by: Coly Li Reviewed-by: Kent Overstreet Link: https://lore.kernel.org/r/20230622164658.12861-1-jack@suse.cz Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 66 ++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 41 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e2a803683105..913dd94353b6 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2448,6 +2448,7 @@ struct async_reg_args { struct cache_sb *sb; struct cache_sb_disk *sb_disk; struct block_device *bdev; + void *holder; }; static void register_bdev_worker(struct work_struct *work) @@ -2455,22 +2456,13 @@ static void register_bdev_worker(struct work_struct *work) int fail = false; struct async_reg_args *args = container_of(work, struct async_reg_args, reg_work.work); - struct cached_dev *dc; - - dc = kzalloc(sizeof(*dc), GFP_KERNEL); - if (!dc) { - fail = true; - put_page(virt_to_page(args->sb_disk)); - blkdev_put(args->bdev, bcache_kobj); - goto out; - } mutex_lock(&bch_register_lock); - if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0) + if (register_bdev(args->sb, args->sb_disk, args->bdev, args->holder) + < 0) fail = true; mutex_unlock(&bch_register_lock); -out: if (fail) pr_info("error %s: fail to register backing device\n", args->path); @@ -2485,21 +2477,11 @@ static void register_cache_worker(struct work_struct *work) int fail = false; struct async_reg_args *args = container_of(work, struct async_reg_args, reg_work.work); - struct cache *ca; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) { - fail = true; - put_page(virt_to_page(args->sb_disk)); - blkdev_put(args->bdev, bcache_kobj); - goto out; - } /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0) + if (register_cache(args->sb, args->sb_disk, args->bdev, args->holder)) fail = true; -out: if (fail) pr_info("error %s: fail to register cache device\n", args->path); @@ -2520,6 +2502,13 @@ static void register_device_async(struct async_reg_args *args) queue_delayed_work(system_wq, &args->reg_work, 10); } +static void *alloc_holder_object(struct cache_sb *sb) +{ + if (SB_IS_BDEV(sb)) + return kzalloc(sizeof(struct cached_dev), GFP_KERNEL); + return kzalloc(sizeof(struct cache), GFP_KERNEL); +} + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { @@ -2528,6 +2517,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, struct cache_sb *sb; struct cache_sb_disk *sb_disk; struct block_device *bdev; + void *holder; ssize_t ret; bool async_registration = false; @@ -2585,6 +2575,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (err) goto out_blkdev_put; + holder = alloc_holder_object(sb); + if (!holder) { + ret = -ENOMEM; + err = "cannot allocate memory"; + goto out_put_sb_page; + } + err = "failed to register device"; if (async_registration) { @@ -2595,44 +2592,29 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!args) { ret = -ENOMEM; err = "cannot allocate memory"; - goto out_put_sb_page; + goto out_free_holder; } args->path = path; args->sb = sb; args->sb_disk = sb_disk; args->bdev = bdev; + args->holder = holder; register_device_async(args); /* No wait and returns to user space */ goto async_done; } if (SB_IS_BDEV(sb)) { - struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); - - if (!dc) { - ret = -ENOMEM; - err = "cannot allocate memory"; - goto out_put_sb_page; - } - mutex_lock(&bch_register_lock); - ret = register_bdev(sb, sb_disk, bdev, dc); + ret = register_bdev(sb, sb_disk, bdev, holder); mutex_unlock(&bch_register_lock); /* blkdev_put() will be called in cached_dev_free() */ if (ret < 0) goto out_free_sb; } else { - struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); - - if (!ca) { - ret = -ENOMEM; - err = "cannot allocate memory"; - goto out_put_sb_page; - } - /* blkdev_put() will be called in bch_cache_release() */ - ret = register_cache(sb, sb_disk, bdev, ca); + ret = register_cache(sb, sb_disk, bdev, holder); if (ret) goto out_free_sb; } @@ -2644,6 +2626,8 @@ done: async_done: return size; +out_free_holder: + kfree(holder); out_put_sb_page: put_page(virt_to_page(sb_disk)); out_blkdev_put: -- cgit v1.2.3 From 2c5555983bd27d24162534b682b10654639a5576 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Jun 2023 18:46:55 +0200 Subject: bcache: Fix bcache device claiming Commit 2736e8eeb0cc ("block: use the holder as indication for exclusive opens") introduced a change that blkdev_put() has to get exclusive holder of the bdev as an argument. However it overlooked that register_bdev() and register_cache() overwrite the bdev->bd_holder field in the block device to point to the real owning object which was not available at the time we called blkdev_get_by_path(). Messing with bdev internals like this is a layering violation and it also causes blkdev_put() to issue warning about mismatching holders. Fix bcache to reopen the block device with appropriate holder once it is available which also restores the behavior that multiple bcache caches cannot claim the same device which was broken by commit 29499ab060fe ("bcache: don't pass a stack address to blkdev_get_by_path"). Fixes: 2736e8eeb0cc ("block: use the holder as indication for exclusive opens") Signed-off-by: Jan Kara Reviewed-by: Kent Overstreet Acked-by: Coly Li Link: https://lore.kernel.org/r/20230622164658.12861-2-jack@suse.cz Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 65 +++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 27 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 913dd94353b6..0ae2b3676293 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1369,7 +1369,7 @@ static void cached_dev_free(struct closure *cl) put_page(virt_to_page(dc->sb_disk)); if (!IS_ERR_OR_NULL(dc->bdev)) - blkdev_put(dc->bdev, bcache_kobj); + blkdev_put(dc->bdev, dc); wake_up(&unregister_wait); @@ -1453,7 +1453,6 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, memcpy(&dc->sb, sb, sizeof(struct cache_sb)); dc->bdev = bdev; - dc->bdev->bd_holder = dc; dc->sb_disk = sb_disk; if (cached_dev_init(dc, sb->block_size << 9)) @@ -2218,7 +2217,7 @@ void bch_cache_release(struct kobject *kobj) put_page(virt_to_page(ca->sb_disk)); if (!IS_ERR_OR_NULL(ca->bdev)) - blkdev_put(ca->bdev, bcache_kobj); + blkdev_put(ca->bdev, ca); kfree(ca); module_put(THIS_MODULE); @@ -2345,7 +2344,6 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, memcpy(&ca->sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; - ca->bdev->bd_holder = ca; ca->sb_disk = sb_disk; if (bdev_max_discard_sectors((bdev))) @@ -2359,7 +2357,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, * call blkdev_put() to bdev in bch_cache_release(). So we * explicitly call blkdev_put() here. */ - blkdev_put(bdev, bcache_kobj); + blkdev_put(bdev, ca); if (ret == -ENOMEM) err = "cache_alloc(): -ENOMEM"; else if (ret == -EPERM) @@ -2516,10 +2514,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, char *path = NULL; struct cache_sb *sb; struct cache_sb_disk *sb_disk; - struct block_device *bdev; - void *holder; + struct block_device *bdev, *bdev2; + void *holder = NULL; ssize_t ret; bool async_registration = false; + bool quiet = false; #ifdef CONFIG_BCACHE_ASYNC_REGISTRATION async_registration = true; @@ -2548,24 +2547,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ret = -EINVAL; err = "failed to open device"; - bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ | BLK_OPEN_WRITE, - bcache_kobj, NULL); - if (IS_ERR(bdev)) { - if (bdev == ERR_PTR(-EBUSY)) { - dev_t dev; - - mutex_lock(&bch_register_lock); - if (lookup_bdev(strim(path), &dev) == 0 && - bch_is_open(dev)) - err = "device already registered"; - else - err = "device busy"; - mutex_unlock(&bch_register_lock); - if (attr == &ksysfs_register_quiet) - goto done; - } + bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ, NULL, NULL); + if (IS_ERR(bdev)) goto out_free_sb; - } err = "failed to set blocksize"; if (set_blocksize(bdev, 4096)) @@ -2582,6 +2566,32 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out_put_sb_page; } + /* Now reopen in exclusive mode with proper holder */ + bdev2 = blkdev_get_by_dev(bdev->bd_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, + holder, NULL); + blkdev_put(bdev, NULL); + bdev = bdev2; + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + bdev = NULL; + if (ret == -EBUSY) { + dev_t dev; + + mutex_lock(&bch_register_lock); + if (lookup_bdev(strim(path), &dev) == 0 && + bch_is_open(dev)) + err = "device already registered"; + else + err = "device busy"; + mutex_unlock(&bch_register_lock); + if (attr == &ksysfs_register_quiet) { + quiet = true; + ret = size; + } + } + goto out_free_holder; + } + err = "failed to register device"; if (async_registration) { @@ -2619,7 +2629,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out_free_sb; } -done: kfree(sb); kfree(path); module_put(THIS_MODULE); @@ -2631,7 +2640,8 @@ out_free_holder: out_put_sb_page: put_page(virt_to_page(sb_disk)); out_blkdev_put: - blkdev_put(bdev, register_bcache); + if (bdev) + blkdev_put(bdev, holder); out_free_sb: kfree(sb); out_free_path: @@ -2640,7 +2650,8 @@ out_free_path: out_module_put: module_put(THIS_MODULE); out: - pr_info("error %s: %s\n", path?path:"", err); + if (!quiet) + pr_info("error %s: %s\n", path?path:"", err); return ret; } -- cgit v1.2.3 From 86da1bae4c64ab3dcbdda0c77ce37c9bf47a501f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 22 Jun 2023 16:53:07 +0900 Subject: nvme: host: fix command name spelling Correctly spell "Zeroes" in nvme_cmd_write_zeroes command name. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/constants.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c index bc523ca02254..44de4f6a4f4c 100644 --- a/drivers/nvme/host/constants.c +++ b/drivers/nvme/host/constants.c @@ -12,7 +12,7 @@ static const char * const nvme_ops[] = { [nvme_cmd_read] = "Read", [nvme_cmd_write_uncor] = "Write Uncorrectable", [nvme_cmd_compare] = "Compare", - [nvme_cmd_write_zeroes] = "Write Zeros", + [nvme_cmd_write_zeroes] = "Write Zeroes", [nvme_cmd_dsm] = "Dataset Management", [nvme_cmd_verify] = "Verify", [nvme_cmd_resv_register] = "Reservation Register", -- cgit v1.2.3 From 99160af413b4ff1c3b4741e8a7583f8e7197f201 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 20 Jun 2023 16:07:36 +0300 Subject: nvme-mpath: fix I/O failure with EAGAIN when failing over I/O It is possible that the next available path we failover to, happens to be frozen (for example if it is during connection establishment). If the original I/O was set with NOWAIT, this cause the I/O to unnecessarily fail because the request queue cannot be entered, hence the I/O fails with EAGAIN. The NOWAIT restriction that was originally set for the I/O is no longer relevant or needed because this is the nvme requeue context. Hence we clear the REQ_NOWAIT flag when failing over I/O. This fix a simple test case of nvme controller reset during I/O when the multipath device that has only a single path and I/O fails with "Resource temporarily unavailable" errno. Note that this reproduces with io_uring which by default sets IOCB_NOWAIT by default. Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 91a9a55227fa..3acb47760e24 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -106,6 +106,14 @@ void nvme_failover_req(struct request *req) bio->bi_opf &= ~REQ_POLLED; bio->bi_cookie = BLK_QC_T_NONE; } + /* + * The alternate request queue that we may end up submitting + * the bio to may be frozen temporarily, in this case REQ_NOWAIT + * will fail the I/O immediately with EAGAIN to the issuer. + * We are not in the issuer context which cannot block. Clear + * the flag to avoid spurious EAGAIN I/O failures. + */ + bio->bi_opf &= ~REQ_NOWAIT; } blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); -- cgit v1.2.3 From 9408d8a37e6cce8803681ab816383450a056c3a9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 12 Jun 2023 12:03:43 -0700 Subject: nvme: improved uring polling Drivers can poll requests directly, so use that. We just need to ensure the driver's request was allocated from a polled hctx, so a special driver flag is added to struct io_uring_cmd. The allows unshared and multipath namespaces to use the same polling callback, and multipath is guaranteed to get the same queue as the command was submitted on. Previously multipath polling might check a different path and poll the wrong info. The other bonus is we don't need a bio payload in order to poll, allowing commands like 'flush' and 'write zeroes' to be submitted on the same high priority queue as read and write commands. Finally, using the request based polling skips the unnecessary bio overhead. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230612190343.2087040-3-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 70 ++++++++++++------------------------------- drivers/nvme/host/multipath.c | 2 +- drivers/nvme/host/nvme.h | 2 -- include/uapi/linux/io_uring.h | 2 ++ 4 files changed, 22 insertions(+), 54 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 2130ad65b58c..5c3250f36ce7 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -505,7 +505,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, { struct io_uring_cmd *ioucmd = req->end_io_data; struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); - void *cookie = READ_ONCE(ioucmd->cookie); req->bio = pdu->bio; if (nvme_req(req)->flags & NVME_REQ_CANCELLED) @@ -518,10 +517,12 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, * For iopoll, complete it directly. * Otherwise, move the completion to task work. */ - if (cookie != NULL && blk_rq_is_poll(req)) + if (blk_rq_is_poll(req)) { + WRITE_ONCE(ioucmd->cookie, NULL); nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); - else + } else { io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + } return RQ_END_IO_FREE; } @@ -531,7 +532,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, { struct io_uring_cmd *ioucmd = req->end_io_data; struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); - void *cookie = READ_ONCE(ioucmd->cookie); req->bio = pdu->bio; pdu->req = req; @@ -540,10 +540,12 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, * For iopoll, complete it directly. * Otherwise, move the completion to task work. */ - if (cookie != NULL && blk_rq_is_poll(req)) + if (blk_rq_is_poll(req)) { + WRITE_ONCE(ioucmd->cookie, NULL); nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED); - else + } else { io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb); + } return RQ_END_IO_NONE; } @@ -599,7 +601,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (issue_flags & IO_URING_F_IOPOLL) rq_flags |= REQ_POLLED; -retry: req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags); if (IS_ERR(req)) return PTR_ERR(req); @@ -613,17 +614,11 @@ retry: return ret; } - if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) { - if (unlikely(!req->bio)) { - /* we can't poll this, so alloc regular req instead */ - blk_mq_free_request(req); - rq_flags &= ~REQ_POLLED; - goto retry; - } else { - WRITE_ONCE(ioucmd->cookie, req->bio); - req->bio->bi_opf |= REQ_POLLED; - } + if (blk_rq_is_poll(req)) { + ioucmd->flags |= IORING_URING_CMD_POLLED; + WRITE_ONCE(ioucmd->cookie, req); } + /* to free bio on completion, as req->bio will be null at that time */ pdu->bio = req->bio; pdu->meta_len = d.metadata_len; @@ -785,18 +780,16 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, struct io_comp_batch *iob, unsigned int poll_flags) { - struct bio *bio; + struct request *req; int ret = 0; - struct nvme_ns *ns; - struct request_queue *q; + + if (!(ioucmd->flags & IORING_URING_CMD_POLLED)) + return 0; rcu_read_lock(); - bio = READ_ONCE(ioucmd->cookie); - ns = container_of(file_inode(ioucmd->file)->i_cdev, - struct nvme_ns, cdev); - q = ns->queue; - if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev) - ret = bio_poll(bio, iob, poll_flags); + req = READ_ONCE(ioucmd->cookie); + if (req && blk_rq_is_poll(req)) + ret = blk_rq_poll(req, iob, poll_flags); rcu_read_unlock(); return ret; } @@ -890,31 +883,6 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, srcu_read_unlock(&head->srcu, srcu_idx); return ret; } - -int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, - struct io_comp_batch *iob, - unsigned int poll_flags) -{ - struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; - struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); - int srcu_idx = srcu_read_lock(&head->srcu); - struct nvme_ns *ns = nvme_find_path(head); - struct bio *bio; - int ret = 0; - struct request_queue *q; - - if (ns) { - rcu_read_lock(); - bio = READ_ONCE(ioucmd->cookie); - q = ns->queue; - if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio - && bio->bi_bdev) - ret = bio_poll(bio, iob, poll_flags); - rcu_read_unlock(); - } - srcu_read_unlock(&head->srcu, srcu_idx); - return ret; -} #endif /* CONFIG_NVME_MULTIPATH */ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 98001eebd275..5aa1592849af 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -470,7 +470,7 @@ static const struct file_operations nvme_ns_head_chr_fops = { .unlocked_ioctl = nvme_ns_head_chr_ioctl, .compat_ioctl = compat_ptr_ioctl, .uring_cmd = nvme_ns_head_chr_uring_cmd, - .uring_cmd_iopoll = nvme_ns_head_chr_uring_cmd_iopoll, + .uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll, }; static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9a98c14c552a..791cafd9910a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -854,8 +854,6 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, struct io_comp_batch *iob, unsigned int poll_flags); -int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, - struct io_comp_batch *iob, unsigned int poll_flags); int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f222d263bc55..08720c7bd92f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -244,8 +244,10 @@ enum io_uring_op { * sqe->uring_cmd_flags * IORING_URING_CMD_FIXED use registered buffer; pass this flag * along with setting sqe->buf_index. + * IORING_URING_CMD_POLLED driver use only */ #define IORING_URING_CMD_FIXED (1U << 0) +#define IORING_URING_CMD_POLLED (1U << 31) /* -- cgit v1.2.3 From a587b046ce921cc1805de6f0f000209b3644cadd Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 29 Jun 2023 01:30:47 -0700 Subject: cdrom/gdrom: Fix build error Commit 7ae24fcee992 ("cdrom: remove the unused mode argument to cdrom_release") was supposed to remove an unused argument from cdrom_release(). but instead removed a used argument from cdrom_open(). This results in the following build error. drivers/cdrom/gdrom.c: In function 'gdrom_bdops_open': drivers/cdrom/gdrom.c:484:15: error: too few arguments to function 'cdrom_open' drivers/cdrom/gdrom.c: In function 'gdrom_bdops_release': drivers/cdrom/gdrom.c:492:35: error: 'mode' undeclared Fix it up. Fixes: 7ae24fcee992 ("cdrom: remove the unused mode argument to cdrom_release") Cc: Christoph Hellwig Cc: Phillip Potter Cc: Hannes Reinecke Cc: Christian Brauner Cc: Jens Axboe Signed-off-by: Guenter Roeck Link: https://lore.kernel.org/r/20230629083047.3487172-1-linux@roeck-us.net Signed-off-by: Jens Axboe --- drivers/cdrom/gdrom.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 3a46e27479ff..d668b174ace9 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -481,7 +481,7 @@ static int gdrom_bdops_open(struct gendisk *disk, blk_mode_t mode) disk_check_media_change(disk); mutex_lock(&gdrom_mutex); - ret = cdrom_open(gd.cd_info); + ret = cdrom_open(gd.cd_info, mode); mutex_unlock(&gdrom_mutex); return ret; } @@ -489,7 +489,7 @@ static int gdrom_bdops_open(struct gendisk *disk, blk_mode_t mode) static void gdrom_bdops_release(struct gendisk *disk) { mutex_lock(&gdrom_mutex); - cdrom_release(gd.cd_info, mode); + cdrom_release(gd.cd_info); mutex_unlock(&gdrom_mutex); } -- cgit v1.2.3 From 2ab4e5f44a869eaf61d7520ad6296b91f67efeed Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 28 Jun 2023 07:46:56 -0700 Subject: nvme: ensure unquiesce on teardown The reset work is called on quiesced IO queues, so ensure these are unquiesced after a failed reset to flush out any pending requests. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b027e5e3f4ac..8eaa954aa6ed 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2778,6 +2778,7 @@ static void nvme_reset_work(struct work_struct *work) nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); nvme_dev_disable(dev, true); nvme_mark_namespaces_dead(&dev->ctrl); + nvme_unquiesce_io_queues(&dev->ctrl); nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); } -- cgit v1.2.3 From a2b5d5443fa7a0e9f26b31598bcc38c2b66300d9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 28 Jun 2023 07:48:15 -0700 Subject: nvme: sync timeout work on failed reset Timeouts during reset will set the controller for failure, preventing the state change to LIVE. Ensure all timeout work is synced after the controller disabling completes to ensure we don't have any other tasks messing with any namespace request_queue's. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8eaa954aa6ed..bfeadecf9e15 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2777,6 +2777,7 @@ static void nvme_reset_work(struct work_struct *work) result); nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); nvme_dev_disable(dev, true); + nvme_sync_queues(&dev->ctrl); nvme_mark_namespaces_dead(&dev->ctrl); nvme_unquiesce_io_queues(&dev->ctrl); nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); -- cgit v1.2.3 From 4e69d4dabd2379af57b0b8fb9b0d62c23f9cd3b8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 28 Jun 2023 07:51:02 -0700 Subject: nvme: disable controller on reset state failure If the controller is not in a RESETTING state at the point of reset work, we have to conclude the controller is being deleted. Go to the cleanup on this condition to ensure proper pairing of request_queue quiesce state. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bfeadecf9e15..c9224d39195e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2690,7 +2690,8 @@ static void nvme_reset_work(struct work_struct *work) if (dev->ctrl.state != NVME_CTRL_RESETTING) { dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n", dev->ctrl.state); - return; + result = -ENODEV; + goto out; } /* -- cgit v1.2.3 From e836007089ba8fdf24e636ef2b007651fb4582e6 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 23 Jun 2023 14:05:23 -0400 Subject: md/raid0: add discard support for the 'original' layout We've found that using raid0 with the 'original' layout and discard enabled with different disk sizes (such that at least two zones are created) can result in data corruption. This is due to the fact that the discard handling in 'raid0_handle_discard()' assumes the 'alternate' layout. We've seen this corruption using ext4 but other filesystems are likely susceptible as well. More specifically, while multiple zones are necessary to create the corruption, the corruption may not occur with multiple zones if they layout in such a way the layout matches what the 'alternate' layout would have produced. Thus, not all raid0 devices with the 'original' layout, different size disks and discard enabled will encounter this corruption. The 3.14 kernel inadvertently changed the raid0 disk layout for different size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the same raid0 array could corrupt data. This lead to the creation of the 'original' layout (to match the pre-3.14 layout) and the 'alternate' layout (to match the post 3.14 layout) in the 5.4 kernel time frame and an option to tell the kernel which layout to use (since it couldn't be autodetected). However, when the 'original' layout was added back to 5.4 discard support for the 'original' layout was not added leading this issue. I've been able to reliably reproduce the corruption with the following test case: 1. create raid0 array with different size disks using original layout 2. mkfs 3. mount -o discard 4. create lots of files 5. remove 1/2 the files 6. fstrim -a (or just the mount point for the raid0 array) 7. umount 8. fsck -fn /dev/md0 (spews all sorts of corruptions) Let's fix this by adding proper discard support to the 'original' layout. The fix 'maps' the 'original' layout disks to the order in which they are read/written such that we can compare the disks in the same way that the current 'alternate' layout does. A 'disk_shift' field is added to 'struct strip_zone'. This could be computed on the fly in raid0_handle_discard() but by adding this field, we save some computation in the discard path. Note we could also potentially fix this by re-ordering the disks in the zones that follow the first one, and then always read/writing them using the 'alternate' layout. However, that is seen as a more substantial change, and we are attempting the least invasive fix at this time to remedy the corruption. I've verified the change using the reproducer mentioned above. Typically, the corruption is seen after less than 3 iterations, while the patch has run 500+ iterations. Cc: NeilBrown Cc: Song Liu Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.") Cc: stable@vger.kernel.org Signed-off-by: Jason Baron Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com --- drivers/md/raid0.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++------- drivers/md/raid0.h | 1 + 2 files changed, 55 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f8ee9a95e25d..d1ac73fcd852 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -270,6 +270,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) goto abort; } + if (conf->layout == RAID0_ORIG_LAYOUT) { + for (i = 1; i < conf->nr_strip_zones; i++) { + sector_t first_sector = conf->strip_zone[i-1].zone_end; + + sector_div(first_sector, mddev->chunk_sectors); + zone = conf->strip_zone + i; + /* disk_shift is first disk index used in the zone */ + zone->disk_shift = sector_div(first_sector, + zone->nb_dev); + } + } + pr_debug("md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; @@ -431,6 +443,20 @@ exit_acct_set: return ret; } +/* + * Convert disk_index to the disk order in which it is read/written. + * For example, if we have 4 disks, they are numbered 0,1,2,3. If we + * write the disks starting at disk 3, then the read/write order would + * be disk 3, then 0, then 1, and then disk 2 and we want map_disk_shift() + * to map the disks as follows 0,1,2,3 => 1,2,3,0. So disk 0 would map + * to 1, 1 to 2, 2 to 3, and 3 to 0. That way we can compare disks in + * that 'output' space to understand the read/write disk ordering. + */ +static int map_disk_shift(int disk_index, int num_disks, int disk_shift) +{ + return ((disk_index + num_disks - disk_shift) % num_disks); +} + static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) { struct r0conf *conf = mddev->private; @@ -444,7 +470,9 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) sector_t end_disk_offset; unsigned int end_disk_index; unsigned int disk; + sector_t orig_start, orig_end; + orig_start = start; zone = find_zone(conf, &start); if (bio_end_sector(bio) > zone->zone_end) { @@ -458,6 +486,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) } else end = bio_end_sector(bio); + orig_end = end; if (zone != conf->strip_zone) end = end - zone[-1].zone_end; @@ -469,13 +498,26 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) last_stripe_index = end; sector_div(last_stripe_index, stripe_size); - start_disk_index = (int)(start - first_stripe_index * stripe_size) / - mddev->chunk_sectors; + /* In the first zone the original and alternate layouts are the same */ + if ((conf->layout == RAID0_ORIG_LAYOUT) && (zone != conf->strip_zone)) { + sector_div(orig_start, mddev->chunk_sectors); + start_disk_index = sector_div(orig_start, zone->nb_dev); + start_disk_index = map_disk_shift(start_disk_index, + zone->nb_dev, + zone->disk_shift); + sector_div(orig_end, mddev->chunk_sectors); + end_disk_index = sector_div(orig_end, zone->nb_dev); + end_disk_index = map_disk_shift(end_disk_index, + zone->nb_dev, zone->disk_shift); + } else { + start_disk_index = (int)(start - first_stripe_index * stripe_size) / + mddev->chunk_sectors; + end_disk_index = (int)(end - last_stripe_index * stripe_size) / + mddev->chunk_sectors; + } start_disk_offset = ((int)(start - first_stripe_index * stripe_size) % mddev->chunk_sectors) + first_stripe_index * mddev->chunk_sectors; - end_disk_index = (int)(end - last_stripe_index * stripe_size) / - mddev->chunk_sectors; end_disk_offset = ((int)(end - last_stripe_index * stripe_size) % mddev->chunk_sectors) + last_stripe_index * mddev->chunk_sectors; @@ -483,18 +525,22 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) for (disk = 0; disk < zone->nb_dev; disk++) { sector_t dev_start, dev_end; struct md_rdev *rdev; + int compare_disk; + + compare_disk = map_disk_shift(disk, zone->nb_dev, + zone->disk_shift); - if (disk < start_disk_index) + if (compare_disk < start_disk_index) dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; - else if (disk > start_disk_index) + else if (compare_disk > start_disk_index) dev_start = first_stripe_index * mddev->chunk_sectors; else dev_start = start_disk_offset; - if (disk < end_disk_index) + if (compare_disk < end_disk_index) dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; - else if (disk > end_disk_index) + else if (compare_disk > end_disk_index) dev_end = last_stripe_index * mddev->chunk_sectors; else dev_end = end_disk_offset; diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 3816e5477db1..8cc761ca7423 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -6,6 +6,7 @@ struct strip_zone { sector_t zone_end; /* Start of the next zone (in sectors) */ sector_t dev_start; /* Zone offset in real dev (in sectors) */ int nb_dev; /* # of devices attached to the zone */ + int disk_shift; /* start disk for the original layout */ }; /* Linux 3.14 (20d0189b101) made an unintended change to -- cgit v1.2.3