From d0dd594bedc57f9be2af2af170bf56f9c3f2376e Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Thu, 15 Jun 2023 02:49:03 -0700
Subject: nvme: Print capabilities changes just once

This current dev_info() could be very verbose and being printed very
frequently depending on some userspace application sending some specific
commands.

Just print this message once and skip it until the controller resets.
Use a controller flag (NVME_CTRL_DIRTY_CAPABILITY) to track if the
capability needs a reset.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c | 6 +++++-
 drivers/nvme/host/nvme.h | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 76e8f8b4098e..4b7f9edab5e8 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1125,8 +1125,11 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
 		mutex_unlock(&ctrl->scan_lock);
 	}
 	if (effects & NVME_CMD_EFFECTS_CCC) {
-		dev_info(ctrl->device,
+		if (!test_and_set_bit(NVME_CTRL_DIRTY_CAPABILITY,
+				      &ctrl->flags)) {
+			dev_info(ctrl->device,
 "controller capabilities changed, reset may be required to take effect.\n");
+		}
 	}
 	if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
 		nvme_queue_scan(ctrl);
@@ -3280,6 +3283,7 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
 			return ret;
 	}
 
+	clear_bit(NVME_CTRL_DIRTY_CAPABILITY, &ctrl->flags);
 	ctrl->identified = true;
 
 	return 0;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 78308f15e090..f3182134487a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -243,6 +243,7 @@ enum nvme_ctrl_flags {
 	NVME_CTRL_STARTED_ONCE		= 2,
 	NVME_CTRL_STOPPED		= 3,
 	NVME_CTRL_SKIP_ID_CNS_CS	= 4,
+	NVME_CTRL_DIRTY_CAPABILITY	= 5,
 };
 
 struct nvme_ctrl {
-- 
cgit v1.2.3


From 9d16d264775b9a10f3f5b5db768d7f51294b2a63 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 27 Apr 2023 21:47:45 +0200
Subject: nvmet: Reorder fields in 'struct nvmet_ns'

Group some variables based on their sizes to reduce holes.
On x86_64, this shrinks the size of 'struct nvmet_ns' from 520 to 512
bytes.

When such a structure is allocated in nvmet_ns_alloc(), because of the way
memory allocation works, when 520 bytes were requested, 1024 bytes were
allocated.

So, on x86_64, this change saves 512 bytes per allocation.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/target/nvmet.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 6cf723bc664e..8cfd60f3b564 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -79,8 +79,8 @@ struct nvmet_ns {
 	struct completion	disable_done;
 	mempool_t		*bvec_pool;
 
-	int			use_p2pmem;
 	struct pci_dev		*p2p_dev;
+	int			use_p2pmem;
 	int			pi_type;
 	int			metadata_size;
 	u8			csi;
-- 
cgit v1.2.3


From 125bfc7cd750e68c99f1d446e2c22abea08c237f Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Fri, 9 Jun 2023 17:43:20 +0800
Subject: md/raid10: fix the condition to call bio_end_io_acct()

/sys/block/[device]/queue/iostats is used to control whether to count io
stat. Write 0 to it will clear queue_flags QUEUE_FLAG_IO_STAT which means
iostats is disabled. If we disable iostats and later endable it, the io
issued during this period will be counted incorrectly, inflight will be
decreased to -1.

  //T1 set iostats
  echo 0 > /sys/block/md0/queue/iostats
   clear QUEUE_FLAG_IO_STAT

			//T2 issue io
			if (QUEUE_FLAG_IO_STAT) -> false
			 bio_start_io_acct
			  inflight++

  echo 1 > /sys/block/md0/queue/iostats
   set QUEUE_FLAG_IO_STAT

					//T3 io end
					if (QUEUE_FLAG_IO_STAT) -> true
					 bio_end_io_acct
					  inflight--	-> -1

Also, if iostats is enabled while issuing io but disabled while io end,
inflight will never be decreased.

Fix it by checking start_time when io end. If start_time is not 0, call
bio_end_io_acct().

Fixes: 528bc2cf2fcc ("md/raid10: enable io accounting")
Signed-off-by: Li Nan <linan122@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230609094320.2397604-1-linan666@huaweicloud.com
---
 drivers/md/raid10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d0de8c9fb3cf..79067769e44b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -325,7 +325,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
 		bio->bi_status = BLK_STS_IOERR;
 
-	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
+	if (r10_bio->start_time)
 		bio_end_io_acct(bio, r10_bio->start_time);
 	bio_endio(bio);
 	/*
-- 
cgit v1.2.3


From b5a99602b74bbfa655be509c615181dd95b0719e Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Fri, 16 Jun 2023 09:21:36 +0800
Subject: md/raid1-10: fix casting from randomized structure in
 raid1_submit_write()

Following build error triggered while build with clang version 17.0.0
with W=1(this can't be reporduced with gcc 13.1.0):

drivers/md/raid1-10.c:117:25: error: casting from randomized structure
pointer type 'struct block_device *' to 'struct md_rdev *'
     117 |         struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev;
         |                                ^

Fix this by casting 'bio->bi_bdev' to 'void *', as it used to be.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306142042.fmjfmTF8-lkp@intel.com/
Fixes: 8295efbe68c0 ("md/raid1-10: factor out a helper to submit normal write")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230616012136.3047071-1-yukuai1@huaweicloud.com
---
 drivers/md/raid1-10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 169ebe296f2d..3f22edec70e7 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -116,7 +116,7 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp,
 
 static inline void raid1_submit_write(struct bio *bio)
 {
-	struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev;
+	struct md_rdev *rdev = (void *)bio->bi_bdev;
 
 	bio->bi_next = NULL;
 	bio_set_dev(bio, rdev->bdev);
-- 
cgit v1.2.3


From a1d7671910965ca9f8f0377e7e3bfd1179fba4d8 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Fri, 16 Jun 2023 22:24:04 -0700
Subject: md: use mddev->external to select holder in export_rdev()

mdadm test "10ddf-create-fail-rebuild" triggers warnings like the following

[  215.526357] ------------[ cut here ]------------
[  215.527243] WARNING: CPU: 18 PID: 1264 at block/bdev.c:617 blkdev_put+0x269/0x350
[  215.528334] Modules linked in:
[  215.528806] CPU: 18 PID: 1264 Comm: mdmon Not tainted 6.4.0-rc2+ #768
[  215.529863] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
[  215.531464] RIP: 0010:blkdev_put+0x269/0x350
[  215.532167] Code: ff ff 49 8d 7d 10 e8 56 bf b8 ff 4d 8b 65 10 49 8d bc
24 58 05 00 00 e8 05 be b8 ff 41 83 ac 24 58 05 00 00 01 e9 44 ff ff ff
<0f> 0b e9 52 fe ff ff 0f 0b e9 6b fe ff ff1
[  215.534780] RSP: 0018:ffffc900040bfbf0 EFLAGS: 00010283
[  215.535635] RAX: ffff888174001000 RBX: ffff88810b1c3b00 RCX: ffffffff819a4061
[  215.536645] RDX: dffffc0000000000 RSI: dffffc0000000000 RDI: ffff88810b1c3ba0
[  215.537657] RBP: ffff88810dbde800 R08: fffffbfff0fca983 R09: fffffbfff0fca983
[  215.538674] R10: ffffc900040bfbf0 R11: fffffbfff0fca982 R12: ffff88810b1c3b38
[  215.539687] R13: ffff88810b1c3b10 R14: ffff88810dbdecb8 R15: ffff88810b1c3b00
[  215.540833] FS:  00007f2aabdff700(0000) GS:ffff888dfb400000(0000) knlGS:0000000000000000
[  215.541961] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  215.542775] CR2: 00007fa19a85d934 CR3: 000000010c076006 CR4: 0000000000370ee0
[  215.543814] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  215.544840] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  215.545885] Call Trace:
[  215.546257]  <TASK>
[  215.546608]  export_rdev.isra.63+0x71/0xe0
[  215.547338]  mddev_unlock+0x1b1/0x2d0
[  215.547898]  array_state_store+0x28d/0x450
[  215.548519]  md_attr_store+0xd7/0x150
[  215.549059]  ? __pfx_sysfs_kf_write+0x10/0x10
[  215.549702]  kernfs_fop_write_iter+0x1b9/0x260
[  215.550351]  vfs_write+0x491/0x760
[  215.550863]  ? __pfx_vfs_write+0x10/0x10
[  215.551445]  ? __fget_files+0x156/0x230
[  215.552053]  ksys_write+0xc0/0x160
[  215.552570]  ? __pfx_ksys_write+0x10/0x10
[  215.553141]  ? ktime_get_coarse_real_ts64+0xec/0x100
[  215.553878]  do_syscall_64+0x3a/0x90
[  215.554403]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
[  215.555125] RIP: 0033:0x7f2aade11847
[  215.555696] Code: c3 66 90 41 54 49 89 d4 55 48 89 f5 53 89 fb 48 83 ec
10 e8 1b fd ff ff 4c 89 e2 48 89 ee 89 df 41 89 c0 b8 01 00 00 00 0f 05
<48> 3d 00 f0 ff ff 77 35 44 89 c7 48 89 448
[  215.558398] RSP: 002b:00007f2aabdfeba0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
[  215.559516] RAX: ffffffffffffffda RBX: 0000000000000010 RCX: 00007f2aade11847
[  215.560515] RDX: 0000000000000005 RSI: 0000000000438b8b RDI: 0000000000000010
[  215.561512] RBP: 0000000000438b8b R08: 0000000000000000 R09: 00007f2aaecf0060
[  215.562511] R10: 000000000e3ba40b R11: 0000000000000293 R12: 0000000000000005
[  215.563647] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000c70750
[  215.564693]  </TASK>
[  215.565029] irq event stamp: 15979
[  215.565584] hardirqs last  enabled at (15991): [<ffffffff811a7432>] __up_console_sem+0x52/0x60
[  215.566806] hardirqs last disabled at (16000): [<ffffffff811a7417>] __up_console_sem+0x37/0x60
[  215.568022] softirqs last  enabled at (15716): [<ffffffff8277a2db>] __do_softirq+0x3eb/0x531
[  215.569239] softirqs last disabled at (15711): [<ffffffff810d8f45>] irq_exit_rcu+0x115/0x160
[  215.570434] ---[ end trace 0000000000000000 ]---

This means export_rdev() calls blkdev_put with a different holder than the
one used by blkdev_get_by_dev(). This is because mddev->major_version == -2
is not a good check for external metadata. Fix this by using
mddev->external instead.

Also, do not clear mddev->external in md_clean(), as the flag might be used
later in export_rdev().

Fixes: 2736e8eeb0cc ("block: use the holder as indication for exclusive opens")
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Song Liu <song@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20230617052405.305871-1-song@kernel.org
---
 drivers/md/md.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index cf3733c90c47..8e7cc2e69bc9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2458,7 +2458,7 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
 	if (test_bit(AutoDetected, &rdev->flags))
 		md_autodetect_dev(rdev->bdev->bd_dev);
 #endif
-	blkdev_put(rdev->bdev, mddev->major_version == -2 ? &claim_rdev : rdev);
+	blkdev_put(rdev->bdev, mddev->external ? &claim_rdev : rdev);
 	rdev->bdev = NULL;
 	kobject_put(&rdev->kobj);
 }
@@ -6140,7 +6140,7 @@ static void md_clean(struct mddev *mddev)
 	mddev->resync_min = 0;
 	mddev->resync_max = MaxSector;
 	mddev->reshape_position = MaxSector;
-	mddev->external = 0;
+	/* we still need mddev->external in export_rdev, do not clear it yet */
 	mddev->persistent = 0;
 	mddev->level = LEVEL_NONE;
 	mddev->clevel[0] = 0;
-- 
cgit v1.2.3


From 4934b6401a812f9fe368e7d2d091cd1d120ea262 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Wed, 21 Jun 2023 22:29:33 +0800
Subject: md: fix 'delete_mutex' deadlock

Commit 3ce94ce5d05a ("md: fix duplicate filename for rdev") introduce a
new lock 'delete_mutex', and trigger a new deadlock:

t1: remove rdev			t2: sysfs writer

rdev_attr_store			rdev_attr_store
 mddev_lock
 state_store
 md_kick_rdev_from_array
  lock delete_mutex
  list_add mddev->deleting
  unlock delete_mutex
 mddev_unlock
				 mddev_lock
				 ...
  lock delete_mutex
  kobject_del
  // wait for sysfs writers to be done
				 mddev_unlock
				 lock delete_mutex
				 // wait for delete_mutex, deadlock

'delete_mutex' is used to protect the list 'mddev->deleting', turns out
that this list can be protected by 'reconfig_mutex' directly, and this
lock can be removed.

Fix this problem by removing the lock, and use 'reconfig_mutex' to
protect the list. mddev_unlock() will move this list to a local list to
be handled after 'reconfig_mutex' is dropped.

Fixes: 3ce94ce5d05a ("md: fix duplicate filename for rdev")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230621142933.1395629-1-yukuai1@huaweicloud.com
---
 drivers/md/md.c | 28 +++++++++-------------------
 drivers/md/md.h |  4 +---
 2 files changed, 10 insertions(+), 22 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8e7cc2e69bc9..2e38ef421d69 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -643,7 +643,6 @@ void mddev_init(struct mddev *mddev)
 {
 	mutex_init(&mddev->open_mutex);
 	mutex_init(&mddev->reconfig_mutex);
-	mutex_init(&mddev->delete_mutex);
 	mutex_init(&mddev->bitmap_info.mutex);
 	INIT_LIST_HEAD(&mddev->disks);
 	INIT_LIST_HEAD(&mddev->all_mddevs);
@@ -749,26 +748,15 @@ static void mddev_free(struct mddev *mddev)
 
 static const struct attribute_group md_redundancy_group;
 
-static void md_free_rdev(struct mddev *mddev)
+void mddev_unlock(struct mddev *mddev)
 {
 	struct md_rdev *rdev;
 	struct md_rdev *tmp;
+	LIST_HEAD(delete);
 
-	mutex_lock(&mddev->delete_mutex);
-	if (list_empty(&mddev->deleting))
-		goto out;
+	if (!list_empty(&mddev->deleting))
+		list_splice_init(&mddev->deleting, &delete);
 
-	list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) {
-		list_del_init(&rdev->same_set);
-		kobject_del(&rdev->kobj);
-		export_rdev(rdev, mddev);
-	}
-out:
-	mutex_unlock(&mddev->delete_mutex);
-}
-
-void mddev_unlock(struct mddev *mddev)
-{
 	if (mddev->to_remove) {
 		/* These cannot be removed under reconfig_mutex as
 		 * an access to the files will try to take reconfig_mutex
@@ -808,7 +796,11 @@ void mddev_unlock(struct mddev *mddev)
 	} else
 		mutex_unlock(&mddev->reconfig_mutex);
 
-	md_free_rdev(mddev);
+	list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
+		list_del_init(&rdev->same_set);
+		kobject_del(&rdev->kobj);
+		export_rdev(rdev, mddev);
+	}
 
 	md_wakeup_thread(mddev->thread);
 	wake_up(&mddev->sb_wait);
@@ -2488,9 +2480,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
 	 * reconfig_mutex is held, hence it can't be called under
 	 * reconfig_mutex and it's delayed to mddev_unlock().
 	 */
-	mutex_lock(&mddev->delete_mutex);
 	list_add(&rdev->same_set, &mddev->deleting);
-	mutex_unlock(&mddev->delete_mutex);
 }
 
 static void export_array(struct mddev *mddev)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index bfd2306bc750..1aef86bf3fc3 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -531,11 +531,9 @@ struct mddev {
 
 	/*
 	 * Temporarily store rdev that will be finally removed when
-	 * reconfig_mutex is unlocked.
+	 * reconfig_mutex is unlocked, protected by reconfig_mutex.
 	 */
 	struct list_head		deleting;
-	/* Protect the deleting list */
-	struct mutex			delete_mutex;
 
 	bool	has_superblocks:1;
 	bool	fail_last_dev:1;
-- 
cgit v1.2.3


From a8d5fdd4d2702d0c7ec125bd3bbce3fc589afa67 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Wed, 21 Jun 2023 18:57:28 +0800
Subject: raid10: avoid spin_lock from fastpath from raid10_unplug()

Commit 0c0be98bbe67 ("md/raid10: prevent unnecessary calls to wake_up()
in fast path") missed one place, for example, with:

	fio -direct=1 -rw=write/randwrite -iodepth=1 ...

Plug and unplug are called for each io, then wake_up() from raid10_unplug()
will cause lock contention as well.

Avoid this contention by using wake_up_barrier() instead of wake_up(),
where spin_lock is not held if waitqueue is empty.

Fio test script:

[global]
name=random reads and writes
ioengine=libaio
direct=1
readwrite=randrw
rwmixread=70
iodepth=64
buffered=0
filename=/dev/md0
size=1G
runtime=30
time_based
randrepeat=0
norandommap
refill_buffers
ramp_time=10
bs=4k
numjobs=400
group_reporting=1
[job1]

Test result with ramdisk raid10(By Ali):

	Before this patch	With this patch
READ	IOPS=2033k		IOPS=3642k
WRITE	IOPS=871k		IOPS=1561K

By the way, in this scenario, blk_plug_cb() will be allocated and freed
for each io, this seems need to be optimized as well.

Reported-and-tested-by: Ali Gholami Rudi <aligrudi@gmail.com>
Closes: https://lore.kernel.org/all/20231606122233@laper.mirepesht/
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230621105728.1268542-1-yukuai1@huaweicloud.com
---
 drivers/md/raid10.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 79067769e44b..5051149e27bb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1118,7 +1118,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		spin_lock_irq(&conf->device_lock);
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		spin_unlock_irq(&conf->device_lock);
-		wake_up(&conf->wait_barrier);
+		wake_up_barrier(conf);
 		md_wakeup_thread(mddev->thread);
 		kfree(plug);
 		return;
@@ -1127,7 +1127,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	/* we aren't scheduling, so we can do the write-out directly. */
 	bio = bio_list_get(&plug->pending);
 	raid1_prepare_flush_writes(mddev->bitmap);
-	wake_up(&conf->wait_barrier);
+	wake_up_barrier(conf);
 
 	while (bio) { /* submit pending writes */
 		struct bio *next = bio->bi_next;
-- 
cgit v1.2.3


From abcc0cbd49283fccd20420e86416b2475b00819c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 22 Jun 2023 18:46:54 +0200
Subject: bcache: Alloc holder object before async registration

Allocate holder object (cache or cached_dev) before offloading the
rest of the startup to async work. This will allow us to open the block
block device with proper holder.

Signed-off-by: Jan Kara <jack@suse.cz>
Acked-by: Coly Li <colyli@suse.de>
Reviewed-by: Kent Overstreet <kent.overstreet@linux.dev>
Link: https://lore.kernel.org/r/20230622164658.12861-1-jack@suse.cz
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 66 ++++++++++++++++++-----------------------------
 1 file changed, 25 insertions(+), 41 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e2a803683105..913dd94353b6 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2448,6 +2448,7 @@ struct async_reg_args {
 	struct cache_sb *sb;
 	struct cache_sb_disk *sb_disk;
 	struct block_device *bdev;
+	void *holder;
 };
 
 static void register_bdev_worker(struct work_struct *work)
@@ -2455,22 +2456,13 @@ static void register_bdev_worker(struct work_struct *work)
 	int fail = false;
 	struct async_reg_args *args =
 		container_of(work, struct async_reg_args, reg_work.work);
-	struct cached_dev *dc;
-
-	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
-	if (!dc) {
-		fail = true;
-		put_page(virt_to_page(args->sb_disk));
-		blkdev_put(args->bdev, bcache_kobj);
-		goto out;
-	}
 
 	mutex_lock(&bch_register_lock);
-	if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0)
+	if (register_bdev(args->sb, args->sb_disk, args->bdev, args->holder)
+	    < 0)
 		fail = true;
 	mutex_unlock(&bch_register_lock);
 
-out:
 	if (fail)
 		pr_info("error %s: fail to register backing device\n",
 			args->path);
@@ -2485,21 +2477,11 @@ static void register_cache_worker(struct work_struct *work)
 	int fail = false;
 	struct async_reg_args *args =
 		container_of(work, struct async_reg_args, reg_work.work);
-	struct cache *ca;
-
-	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-	if (!ca) {
-		fail = true;
-		put_page(virt_to_page(args->sb_disk));
-		blkdev_put(args->bdev, bcache_kobj);
-		goto out;
-	}
 
 	/* blkdev_put() will be called in bch_cache_release() */
-	if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0)
+	if (register_cache(args->sb, args->sb_disk, args->bdev, args->holder))
 		fail = true;
 
-out:
 	if (fail)
 		pr_info("error %s: fail to register cache device\n",
 			args->path);
@@ -2520,6 +2502,13 @@ static void register_device_async(struct async_reg_args *args)
 	queue_delayed_work(system_wq, &args->reg_work, 10);
 }
 
+static void *alloc_holder_object(struct cache_sb *sb)
+{
+	if (SB_IS_BDEV(sb))
+		return kzalloc(sizeof(struct cached_dev), GFP_KERNEL);
+	return kzalloc(sizeof(struct cache), GFP_KERNEL);
+}
+
 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 			       const char *buffer, size_t size)
 {
@@ -2528,6 +2517,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	struct cache_sb *sb;
 	struct cache_sb_disk *sb_disk;
 	struct block_device *bdev;
+	void *holder;
 	ssize_t ret;
 	bool async_registration = false;
 
@@ -2585,6 +2575,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	if (err)
 		goto out_blkdev_put;
 
+	holder = alloc_holder_object(sb);
+	if (!holder) {
+		ret = -ENOMEM;
+		err = "cannot allocate memory";
+		goto out_put_sb_page;
+	}
+
 	err = "failed to register device";
 
 	if (async_registration) {
@@ -2595,44 +2592,29 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 		if (!args) {
 			ret = -ENOMEM;
 			err = "cannot allocate memory";
-			goto out_put_sb_page;
+			goto out_free_holder;
 		}
 
 		args->path	= path;
 		args->sb	= sb;
 		args->sb_disk	= sb_disk;
 		args->bdev	= bdev;
+		args->holder	= holder;
 		register_device_async(args);
 		/* No wait and returns to user space */
 		goto async_done;
 	}
 
 	if (SB_IS_BDEV(sb)) {
-		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
-
-		if (!dc) {
-			ret = -ENOMEM;
-			err = "cannot allocate memory";
-			goto out_put_sb_page;
-		}
-
 		mutex_lock(&bch_register_lock);
-		ret = register_bdev(sb, sb_disk, bdev, dc);
+		ret = register_bdev(sb, sb_disk, bdev, holder);
 		mutex_unlock(&bch_register_lock);
 		/* blkdev_put() will be called in cached_dev_free() */
 		if (ret < 0)
 			goto out_free_sb;
 	} else {
-		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-
-		if (!ca) {
-			ret = -ENOMEM;
-			err = "cannot allocate memory";
-			goto out_put_sb_page;
-		}
-
 		/* blkdev_put() will be called in bch_cache_release() */
-		ret = register_cache(sb, sb_disk, bdev, ca);
+		ret = register_cache(sb, sb_disk, bdev, holder);
 		if (ret)
 			goto out_free_sb;
 	}
@@ -2644,6 +2626,8 @@ done:
 async_done:
 	return size;
 
+out_free_holder:
+	kfree(holder);
 out_put_sb_page:
 	put_page(virt_to_page(sb_disk));
 out_blkdev_put:
-- 
cgit v1.2.3


From 2c5555983bd27d24162534b682b10654639a5576 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 22 Jun 2023 18:46:55 +0200
Subject: bcache: Fix bcache device claiming

Commit 2736e8eeb0cc ("block: use the holder as indication for exclusive
opens") introduced a change that blkdev_put() has to get exclusive
holder of the bdev as an argument. However it overlooked that
register_bdev() and register_cache() overwrite the bdev->bd_holder field
in the block device to point to the real owning object which was not
available at the time we called blkdev_get_by_path(). Messing with bdev
internals like this is a layering violation and it also causes
blkdev_put() to issue warning about mismatching holders.

Fix bcache to reopen the block device with appropriate holder once it is
available which also restores the behavior that multiple bcache caches
cannot claim the same device which was broken by commit 29499ab060fe
("bcache: don't pass a stack address to blkdev_get_by_path").

Fixes: 2736e8eeb0cc ("block: use the holder as indication for exclusive opens")
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Kent Overstreet <kent.overstreet@linux.dev>
Acked-by: Coly Li <colyli@suse.de>
Link: https://lore.kernel.org/r/20230622164658.12861-2-jack@suse.cz
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 65 +++++++++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 27 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 913dd94353b6..0ae2b3676293 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1369,7 +1369,7 @@ static void cached_dev_free(struct closure *cl)
 		put_page(virt_to_page(dc->sb_disk));
 
 	if (!IS_ERR_OR_NULL(dc->bdev))
-		blkdev_put(dc->bdev, bcache_kobj);
+		blkdev_put(dc->bdev, dc);
 
 	wake_up(&unregister_wait);
 
@@ -1453,7 +1453,6 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
 
 	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
 	dc->bdev = bdev;
-	dc->bdev->bd_holder = dc;
 	dc->sb_disk = sb_disk;
 
 	if (cached_dev_init(dc, sb->block_size << 9))
@@ -2218,7 +2217,7 @@ void bch_cache_release(struct kobject *kobj)
 		put_page(virt_to_page(ca->sb_disk));
 
 	if (!IS_ERR_OR_NULL(ca->bdev))
-		blkdev_put(ca->bdev, bcache_kobj);
+		blkdev_put(ca->bdev, ca);
 
 	kfree(ca);
 	module_put(THIS_MODULE);
@@ -2345,7 +2344,6 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
 
 	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
 	ca->bdev = bdev;
-	ca->bdev->bd_holder = ca;
 	ca->sb_disk = sb_disk;
 
 	if (bdev_max_discard_sectors((bdev)))
@@ -2359,7 +2357,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
 		 * call blkdev_put() to bdev in bch_cache_release(). So we
 		 * explicitly call blkdev_put() here.
 		 */
-		blkdev_put(bdev, bcache_kobj);
+		blkdev_put(bdev, ca);
 		if (ret == -ENOMEM)
 			err = "cache_alloc(): -ENOMEM";
 		else if (ret == -EPERM)
@@ -2516,10 +2514,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	char *path = NULL;
 	struct cache_sb *sb;
 	struct cache_sb_disk *sb_disk;
-	struct block_device *bdev;
-	void *holder;
+	struct block_device *bdev, *bdev2;
+	void *holder = NULL;
 	ssize_t ret;
 	bool async_registration = false;
+	bool quiet = false;
 
 #ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
 	async_registration = true;
@@ -2548,24 +2547,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 
 	ret = -EINVAL;
 	err = "failed to open device";
-	bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ | BLK_OPEN_WRITE,
-				  bcache_kobj, NULL);
-	if (IS_ERR(bdev)) {
-		if (bdev == ERR_PTR(-EBUSY)) {
-			dev_t dev;
-
-			mutex_lock(&bch_register_lock);
-			if (lookup_bdev(strim(path), &dev) == 0 &&
-			    bch_is_open(dev))
-				err = "device already registered";
-			else
-				err = "device busy";
-			mutex_unlock(&bch_register_lock);
-			if (attr == &ksysfs_register_quiet)
-				goto done;
-		}
+	bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ, NULL, NULL);
+	if (IS_ERR(bdev))
 		goto out_free_sb;
-	}
 
 	err = "failed to set blocksize";
 	if (set_blocksize(bdev, 4096))
@@ -2582,6 +2566,32 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 		goto out_put_sb_page;
 	}
 
+	/* Now reopen in exclusive mode with proper holder */
+	bdev2 = blkdev_get_by_dev(bdev->bd_dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+				  holder, NULL);
+	blkdev_put(bdev, NULL);
+	bdev = bdev2;
+	if (IS_ERR(bdev)) {
+		ret = PTR_ERR(bdev);
+		bdev = NULL;
+		if (ret == -EBUSY) {
+			dev_t dev;
+
+			mutex_lock(&bch_register_lock);
+			if (lookup_bdev(strim(path), &dev) == 0 &&
+			    bch_is_open(dev))
+				err = "device already registered";
+			else
+				err = "device busy";
+			mutex_unlock(&bch_register_lock);
+			if (attr == &ksysfs_register_quiet) {
+				quiet = true;
+				ret = size;
+			}
+		}
+		goto out_free_holder;
+	}
+
 	err = "failed to register device";
 
 	if (async_registration) {
@@ -2619,7 +2629,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 			goto out_free_sb;
 	}
 
-done:
 	kfree(sb);
 	kfree(path);
 	module_put(THIS_MODULE);
@@ -2631,7 +2640,8 @@ out_free_holder:
 out_put_sb_page:
 	put_page(virt_to_page(sb_disk));
 out_blkdev_put:
-	blkdev_put(bdev, register_bcache);
+	if (bdev)
+		blkdev_put(bdev, holder);
 out_free_sb:
 	kfree(sb);
 out_free_path:
@@ -2640,7 +2650,8 @@ out_free_path:
 out_module_put:
 	module_put(THIS_MODULE);
 out:
-	pr_info("error %s: %s\n", path?path:"", err);
+	if (!quiet)
+		pr_info("error %s: %s\n", path?path:"", err);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 86da1bae4c64ab3dcbdda0c77ce37c9bf47a501f Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Thu, 22 Jun 2023 16:53:07 +0900
Subject: nvme: host: fix command name spelling

Correctly spell "Zeroes" in nvme_cmd_write_zeroes command name.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/constants.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
index bc523ca02254..44de4f6a4f4c 100644
--- a/drivers/nvme/host/constants.c
+++ b/drivers/nvme/host/constants.c
@@ -12,7 +12,7 @@ static const char * const nvme_ops[] = {
 	[nvme_cmd_read] = "Read",
 	[nvme_cmd_write_uncor] = "Write Uncorrectable",
 	[nvme_cmd_compare] = "Compare",
-	[nvme_cmd_write_zeroes] = "Write Zeros",
+	[nvme_cmd_write_zeroes] = "Write Zeroes",
 	[nvme_cmd_dsm] = "Dataset Management",
 	[nvme_cmd_verify] = "Verify",
 	[nvme_cmd_resv_register] = "Reservation Register",
-- 
cgit v1.2.3


From 99160af413b4ff1c3b4741e8a7583f8e7197f201 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Tue, 20 Jun 2023 16:07:36 +0300
Subject: nvme-mpath: fix I/O failure with EAGAIN when failing over I/O

It is possible that the next available path we failover to, happens to
be frozen (for example if it is during connection establishment). If
the original I/O was set with NOWAIT, this cause the I/O to unnecessarily
fail because the request queue cannot be entered, hence the I/O fails with
EAGAIN.

The NOWAIT restriction that was originally set for the I/O is no longer
relevant or needed because this is the nvme requeue context. Hence we
clear the REQ_NOWAIT flag when failing over I/O.

This fix a simple test case of nvme controller reset during I/O when the
multipath device that has only a single path and I/O fails with "Resource
temporarily unavailable" errno. Note that this reproduces with io_uring
which by default sets IOCB_NOWAIT by default.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/multipath.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'drivers')

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 91a9a55227fa..3acb47760e24 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -106,6 +106,14 @@ void nvme_failover_req(struct request *req)
 			bio->bi_opf &= ~REQ_POLLED;
 			bio->bi_cookie = BLK_QC_T_NONE;
 		}
+		/*
+		 * The alternate request queue that we may end up submitting
+		 * the bio to may be frozen temporarily, in this case REQ_NOWAIT
+		 * will fail the I/O immediately with EAGAIN to the issuer.
+		 * We are not in the issuer context which cannot block. Clear
+		 * the flag to avoid spurious EAGAIN I/O failures.
+		 */
+		bio->bi_opf &= ~REQ_NOWAIT;
 	}
 	blk_steal_bios(&ns->head->requeue_list, req);
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
-- 
cgit v1.2.3


From 9408d8a37e6cce8803681ab816383450a056c3a9 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Mon, 12 Jun 2023 12:03:43 -0700
Subject: nvme: improved uring polling

Drivers can poll requests directly, so use that. We just need to ensure
the driver's request was allocated from a polled hctx, so a special
driver flag is added to struct io_uring_cmd.

The allows unshared and multipath namespaces to use the same polling
callback, and multipath is guaranteed to get the same queue as the
command was submitted on. Previously multipath polling might check a
different path and poll the wrong info.

The other bonus is we don't need a bio payload in order to poll,
allowing commands like 'flush' and 'write zeroes' to be submitted on the
same high priority queue as read and write commands.

Finally, using the request based polling skips the unnecessary bio
overhead.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20230612190343.2087040-3-kbusch@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/ioctl.c     | 70 ++++++++++++-------------------------------
 drivers/nvme/host/multipath.c |  2 +-
 drivers/nvme/host/nvme.h      |  2 --
 include/uapi/linux/io_uring.h |  2 ++
 4 files changed, 22 insertions(+), 54 deletions(-)

(limited to 'drivers')

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 2130ad65b58c..5c3250f36ce7 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -505,7 +505,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 {
 	struct io_uring_cmd *ioucmd = req->end_io_data;
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
-	void *cookie = READ_ONCE(ioucmd->cookie);
 
 	req->bio = pdu->bio;
 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
@@ -518,10 +517,12 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 	 * For iopoll, complete it directly.
 	 * Otherwise, move the completion to task work.
 	 */
-	if (cookie != NULL && blk_rq_is_poll(req))
+	if (blk_rq_is_poll(req)) {
+		WRITE_ONCE(ioucmd->cookie, NULL);
 		nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED);
-	else
+	} else {
 		io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+	}
 
 	return RQ_END_IO_FREE;
 }
@@ -531,7 +532,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
 {
 	struct io_uring_cmd *ioucmd = req->end_io_data;
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
-	void *cookie = READ_ONCE(ioucmd->cookie);
 
 	req->bio = pdu->bio;
 	pdu->req = req;
@@ -540,10 +540,12 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
 	 * For iopoll, complete it directly.
 	 * Otherwise, move the completion to task work.
 	 */
-	if (cookie != NULL && blk_rq_is_poll(req))
+	if (blk_rq_is_poll(req)) {
+		WRITE_ONCE(ioucmd->cookie, NULL);
 		nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED);
-	else
+	} else {
 		io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb);
+	}
 
 	return RQ_END_IO_NONE;
 }
@@ -599,7 +601,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	if (issue_flags & IO_URING_F_IOPOLL)
 		rq_flags |= REQ_POLLED;
 
-retry:
 	req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
@@ -613,17 +614,11 @@ retry:
 			return ret;
 	}
 
-	if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) {
-		if (unlikely(!req->bio)) {
-			/* we can't poll this, so alloc regular req instead */
-			blk_mq_free_request(req);
-			rq_flags &= ~REQ_POLLED;
-			goto retry;
-		} else {
-			WRITE_ONCE(ioucmd->cookie, req->bio);
-			req->bio->bi_opf |= REQ_POLLED;
-		}
+	if (blk_rq_is_poll(req)) {
+		ioucmd->flags |= IORING_URING_CMD_POLLED;
+		WRITE_ONCE(ioucmd->cookie, req);
 	}
+
 	/* to free bio on completion, as req->bio will be null at that time */
 	pdu->bio = req->bio;
 	pdu->meta_len = d.metadata_len;
@@ -785,18 +780,16 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
 				 struct io_comp_batch *iob,
 				 unsigned int poll_flags)
 {
-	struct bio *bio;
+	struct request *req;
 	int ret = 0;
-	struct nvme_ns *ns;
-	struct request_queue *q;
+
+	if (!(ioucmd->flags & IORING_URING_CMD_POLLED))
+		return 0;
 
 	rcu_read_lock();
-	bio = READ_ONCE(ioucmd->cookie);
-	ns = container_of(file_inode(ioucmd->file)->i_cdev,
-			struct nvme_ns, cdev);
-	q = ns->queue;
-	if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
-		ret = bio_poll(bio, iob, poll_flags);
+	req = READ_ONCE(ioucmd->cookie);
+	if (req && blk_rq_is_poll(req))
+		ret = blk_rq_poll(req, iob, poll_flags);
 	rcu_read_unlock();
 	return ret;
 }
@@ -890,31 +883,6 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 	srcu_read_unlock(&head->srcu, srcu_idx);
 	return ret;
 }
-
-int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
-				      struct io_comp_batch *iob,
-				      unsigned int poll_flags)
-{
-	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
-	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
-	int srcu_idx = srcu_read_lock(&head->srcu);
-	struct nvme_ns *ns = nvme_find_path(head);
-	struct bio *bio;
-	int ret = 0;
-	struct request_queue *q;
-
-	if (ns) {
-		rcu_read_lock();
-		bio = READ_ONCE(ioucmd->cookie);
-		q = ns->queue;
-		if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio
-				&& bio->bi_bdev)
-			ret = bio_poll(bio, iob, poll_flags);
-		rcu_read_unlock();
-	}
-	srcu_read_unlock(&head->srcu, srcu_idx);
-	return ret;
-}
 #endif /* CONFIG_NVME_MULTIPATH */
 
 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 98001eebd275..5aa1592849af 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -470,7 +470,7 @@ static const struct file_operations nvme_ns_head_chr_fops = {
 	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
 	.compat_ioctl	= compat_ptr_ioctl,
 	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
-	.uring_cmd_iopoll = nvme_ns_head_chr_uring_cmd_iopoll,
+	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
 };
 
 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9a98c14c552a..791cafd9910a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -854,8 +854,6 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
 int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
 		struct io_comp_batch *iob, unsigned int poll_flags);
-int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
-		struct io_comp_batch *iob, unsigned int poll_flags);
 int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
 int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f222d263bc55..08720c7bd92f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -244,8 +244,10 @@ enum io_uring_op {
  * sqe->uring_cmd_flags
  * IORING_URING_CMD_FIXED	use registered buffer; pass this flag
  *				along with setting sqe->buf_index.
+ * IORING_URING_CMD_POLLED	driver use only
  */
 #define IORING_URING_CMD_FIXED	(1U << 0)
+#define IORING_URING_CMD_POLLED	(1U << 31)
 
 
 /*
-- 
cgit v1.2.3


From a587b046ce921cc1805de6f0f000209b3644cadd Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Thu, 29 Jun 2023 01:30:47 -0700
Subject: cdrom/gdrom: Fix build error

Commit 7ae24fcee992 ("cdrom: remove the unused mode argument to
cdrom_release") was supposed to remove an unused argument from
cdrom_release(). but instead removed a used argument from
cdrom_open(). This results in the following build error.

drivers/cdrom/gdrom.c: In function 'gdrom_bdops_open':
drivers/cdrom/gdrom.c:484:15: error: too few arguments to function 'cdrom_open'

drivers/cdrom/gdrom.c: In function 'gdrom_bdops_release':
drivers/cdrom/gdrom.c:492:35: error: 'mode' undeclared

Fix it up.

Fixes: 7ae24fcee992 ("cdrom: remove the unused mode argument to cdrom_release")
Cc: Christoph Hellwig <hch@lst.de>
Cc: Phillip Potter <phil@philpotter.co.uk>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20230629083047.3487172-1-linux@roeck-us.net
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/cdrom/gdrom.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 3a46e27479ff..d668b174ace9 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -481,7 +481,7 @@ static int gdrom_bdops_open(struct gendisk *disk, blk_mode_t mode)
 	disk_check_media_change(disk);
 
 	mutex_lock(&gdrom_mutex);
-	ret = cdrom_open(gd.cd_info);
+	ret = cdrom_open(gd.cd_info, mode);
 	mutex_unlock(&gdrom_mutex);
 	return ret;
 }
@@ -489,7 +489,7 @@ static int gdrom_bdops_open(struct gendisk *disk, blk_mode_t mode)
 static void gdrom_bdops_release(struct gendisk *disk)
 {
 	mutex_lock(&gdrom_mutex);
-	cdrom_release(gd.cd_info, mode);
+	cdrom_release(gd.cd_info);
 	mutex_unlock(&gdrom_mutex);
 }
 
-- 
cgit v1.2.3


From 2ab4e5f44a869eaf61d7520ad6296b91f67efeed Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Wed, 28 Jun 2023 07:46:56 -0700
Subject: nvme: ensure unquiesce on teardown

The reset work is called on quiesced IO queues, so ensure these are
unquiesced after a failed reset to flush out any pending requests.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers')

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b027e5e3f4ac..8eaa954aa6ed 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2778,6 +2778,7 @@ static void nvme_reset_work(struct work_struct *work)
 	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
 	nvme_dev_disable(dev, true);
 	nvme_mark_namespaces_dead(&dev->ctrl);
+	nvme_unquiesce_io_queues(&dev->ctrl);
 	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
 }
 
-- 
cgit v1.2.3


From a2b5d5443fa7a0e9f26b31598bcc38c2b66300d9 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Wed, 28 Jun 2023 07:48:15 -0700
Subject: nvme: sync timeout work on failed reset

Timeouts during reset will set the controller for failure, preventing
the state change to LIVE. Ensure all timeout work is synced after the
controller disabling completes to ensure we don't have any other tasks
messing with any namespace request_queue's.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers')

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8eaa954aa6ed..bfeadecf9e15 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2777,6 +2777,7 @@ static void nvme_reset_work(struct work_struct *work)
 		 result);
 	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
 	nvme_dev_disable(dev, true);
+	nvme_sync_queues(&dev->ctrl);
 	nvme_mark_namespaces_dead(&dev->ctrl);
 	nvme_unquiesce_io_queues(&dev->ctrl);
 	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
-- 
cgit v1.2.3


From 4e69d4dabd2379af57b0b8fb9b0d62c23f9cd3b8 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Wed, 28 Jun 2023 07:51:02 -0700
Subject: nvme: disable controller on reset state failure

If the controller is not in a RESETTING state at the point of reset
work, we have to conclude the controller is being deleted. Go to the
cleanup on this condition to ensure proper pairing of request_queue
quiesce state.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index bfeadecf9e15..c9224d39195e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2690,7 +2690,8 @@ static void nvme_reset_work(struct work_struct *work)
 	if (dev->ctrl.state != NVME_CTRL_RESETTING) {
 		dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
 			 dev->ctrl.state);
-		return;
+		result = -ENODEV;
+		goto out;
 	}
 
 	/*
-- 
cgit v1.2.3


From e836007089ba8fdf24e636ef2b007651fb4582e6 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Fri, 23 Jun 2023 14:05:23 -0400
Subject: md/raid0: add discard support for the 'original' layout

We've found that using raid0 with the 'original' layout and discard
enabled with different disk sizes (such that at least two zones are
created) can result in data corruption. This is due to the fact that
the discard handling in 'raid0_handle_discard()' assumes the 'alternate'
layout. We've seen this corruption using ext4 but other filesystems are
likely susceptible as well.

More specifically, while multiple zones are necessary to create the
corruption, the corruption may not occur with multiple zones if they
layout in such a way the layout matches what the 'alternate' layout
would have produced. Thus, not all raid0 devices with the 'original'
layout, different size disks and discard enabled will encounter this
corruption.

The 3.14 kernel inadvertently changed the raid0 disk layout for different
size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the
same raid0 array could corrupt data. This lead to the creation of the
'original' layout (to match the pre-3.14 layout) and the 'alternate' layout
(to match the post 3.14 layout) in the 5.4 kernel time frame and an option
to tell the kernel which layout to use (since it couldn't be autodetected).
However, when the 'original' layout was added back to 5.4 discard support
for the 'original' layout was not added leading this issue.

I've been able to reliably reproduce the corruption with the following
test case:

1. create raid0 array with different size disks using original layout
2. mkfs
3. mount -o discard
4. create lots of files
5. remove 1/2 the files
6. fstrim -a (or just the mount point for the raid0 array)
7. umount
8. fsck -fn /dev/md0 (spews all sorts of corruptions)

Let's fix this by adding proper discard support to the 'original' layout.
The fix 'maps' the 'original' layout disks to the order in which they are
read/written such that we can compare the disks in the same way that the
current 'alternate' layout does. A 'disk_shift' field is added to
'struct strip_zone'. This could be computed on the fly in
raid0_handle_discard() but by adding this field, we save some computation
in the discard path.

Note we could also potentially fix this by re-ordering the disks in the
zones that follow the first one, and then always read/writing them using
the 'alternate' layout. However, that is seen as a more substantial change,
and we are attempting the least invasive fix at this time to remedy the
corruption.

I've verified the change using the reproducer mentioned above. Typically,
the corruption is seen after less than 3 iterations, while the patch has
run 500+ iterations.

Cc: NeilBrown <neilb@suse.de>
Cc: Song Liu <song@kernel.org>
Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com
---
 drivers/md/raid0.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++-------
 drivers/md/raid0.h |  1 +
 2 files changed, 55 insertions(+), 8 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index f8ee9a95e25d..d1ac73fcd852 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -270,6 +270,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		goto abort;
 	}
 
+	if (conf->layout == RAID0_ORIG_LAYOUT) {
+		for (i = 1; i < conf->nr_strip_zones; i++) {
+			sector_t first_sector = conf->strip_zone[i-1].zone_end;
+
+			sector_div(first_sector, mddev->chunk_sectors);
+			zone = conf->strip_zone + i;
+			/* disk_shift is first disk index used in the zone */
+			zone->disk_shift = sector_div(first_sector,
+						      zone->nb_dev);
+		}
+	}
+
 	pr_debug("md/raid0:%s: done.\n", mdname(mddev));
 	*private_conf = conf;
 
@@ -431,6 +443,20 @@ exit_acct_set:
 	return ret;
 }
 
+/*
+ * Convert disk_index to the disk order in which it is read/written.
+ *  For example, if we have 4 disks, they are numbered 0,1,2,3. If we
+ *  write the disks starting at disk 3, then the read/write order would
+ *  be disk 3, then 0, then 1, and then disk 2 and we want map_disk_shift()
+ *  to map the disks as follows 0,1,2,3 => 1,2,3,0. So disk 0 would map
+ *  to 1, 1 to 2, 2 to 3, and 3 to 0. That way we can compare disks in
+ *  that 'output' space to understand the read/write disk ordering.
+ */
+static int map_disk_shift(int disk_index, int num_disks, int disk_shift)
+{
+	return ((disk_index + num_disks - disk_shift) % num_disks);
+}
+
 static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 {
 	struct r0conf *conf = mddev->private;
@@ -444,7 +470,9 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 	sector_t end_disk_offset;
 	unsigned int end_disk_index;
 	unsigned int disk;
+	sector_t orig_start, orig_end;
 
+	orig_start = start;
 	zone = find_zone(conf, &start);
 
 	if (bio_end_sector(bio) > zone->zone_end) {
@@ -458,6 +486,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 	} else
 		end = bio_end_sector(bio);
 
+	orig_end = end;
 	if (zone != conf->strip_zone)
 		end = end - zone[-1].zone_end;
 
@@ -469,13 +498,26 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 	last_stripe_index = end;
 	sector_div(last_stripe_index, stripe_size);
 
-	start_disk_index = (int)(start - first_stripe_index * stripe_size) /
-		mddev->chunk_sectors;
+	/* In the first zone the original and alternate layouts are the same */
+	if ((conf->layout == RAID0_ORIG_LAYOUT) && (zone != conf->strip_zone)) {
+		sector_div(orig_start, mddev->chunk_sectors);
+		start_disk_index = sector_div(orig_start, zone->nb_dev);
+		start_disk_index = map_disk_shift(start_disk_index,
+						  zone->nb_dev,
+						  zone->disk_shift);
+		sector_div(orig_end, mddev->chunk_sectors);
+		end_disk_index = sector_div(orig_end, zone->nb_dev);
+		end_disk_index = map_disk_shift(end_disk_index,
+						zone->nb_dev, zone->disk_shift);
+	} else {
+		start_disk_index = (int)(start - first_stripe_index * stripe_size) /
+			mddev->chunk_sectors;
+		end_disk_index = (int)(end - last_stripe_index * stripe_size) /
+			mddev->chunk_sectors;
+	}
 	start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
 		mddev->chunk_sectors) +
 		first_stripe_index * mddev->chunk_sectors;
-	end_disk_index = (int)(end - last_stripe_index * stripe_size) /
-		mddev->chunk_sectors;
 	end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
 		mddev->chunk_sectors) +
 		last_stripe_index * mddev->chunk_sectors;
@@ -483,18 +525,22 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 	for (disk = 0; disk < zone->nb_dev; disk++) {
 		sector_t dev_start, dev_end;
 		struct md_rdev *rdev;
+		int compare_disk;
+
+		compare_disk = map_disk_shift(disk, zone->nb_dev,
+					      zone->disk_shift);
 
-		if (disk < start_disk_index)
+		if (compare_disk < start_disk_index)
 			dev_start = (first_stripe_index + 1) *
 				mddev->chunk_sectors;
-		else if (disk > start_disk_index)
+		else if (compare_disk > start_disk_index)
 			dev_start = first_stripe_index * mddev->chunk_sectors;
 		else
 			dev_start = start_disk_offset;
 
-		if (disk < end_disk_index)
+		if (compare_disk < end_disk_index)
 			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
-		else if (disk > end_disk_index)
+		else if (compare_disk > end_disk_index)
 			dev_end = last_stripe_index * mddev->chunk_sectors;
 		else
 			dev_end = end_disk_offset;
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 3816e5477db1..8cc761ca7423 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -6,6 +6,7 @@ struct strip_zone {
 	sector_t zone_end;	/* Start of the next zone (in sectors) */
 	sector_t dev_start;	/* Zone offset in real dev (in sectors) */
 	int	 nb_dev;	/* # of devices attached to the zone */
+	int	 disk_shift;	/* start disk for the original layout */
 };
 
 /* Linux 3.14 (20d0189b101) made an unintended change to
-- 
cgit v1.2.3