From 37f0dc2ec78af0c3f35dd05578763de059f6fe77 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 4 Mar 2023 07:13:45 +0800 Subject: nvme: fix handling single range discard request When investigating one customer report on warning in nvme_setup_discard, we observed the controller(nvme/tcp) actually exposes queue_max_discard_segments(req->q) == 1. Obviously the current code can't handle this situation, since contiguity merge like normal RW request is taken. Fix the issue by building range from request sector/nr_sectors directly. Fixes: b35ba01ea697 ("nvme: support ranged discard requests") Signed-off-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c2730b116dc6..d4be525f8100 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -781,16 +781,26 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, range = page_address(ns->ctrl->discard_page); } - __rq_for_each_bio(bio, req) { - u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); - u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; - - if (n < segments) { - range[n].cattr = cpu_to_le32(0); - range[n].nlb = cpu_to_le32(nlb); - range[n].slba = cpu_to_le64(slba); + if (queue_max_discard_segments(req->q) == 1) { + u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req)); + u32 nlb = blk_rq_sectors(req) >> (ns->lba_shift - 9); + + range[0].cattr = cpu_to_le32(0); + range[0].nlb = cpu_to_le32(nlb); + range[0].slba = cpu_to_le64(slba); + n = 1; + } else { + __rq_for_each_bio(bio, req) { + u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); + u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; + + if (n < segments) { + range[n].cattr = cpu_to_le32(0); + range[n].nlb = cpu_to_le32(nlb); + range[n].slba = cpu_to_le64(slba); + } + n++; } - n++; } if (WARN_ON_ONCE(n != segments)) { -- cgit v1.2.3 From a61d265533b7fe0026a02a49916aa564ffe38e4c Mon Sep 17 00:00:00 2001 From: Irvin Cote Date: Wed, 8 Mar 2023 18:05:08 -0300 Subject: nvme-pci: fixing memory leak in probe teardown path In case the nvme_probe teardown path is triggered the ctrl ref count does not reach 0 thus creating a memory leak upon failure of nvme_probe. Signed-off-by: Irvin Cote Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5b95c94ee40f..e77a8a873b1a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3073,6 +3073,7 @@ out_dev_unmap: nvme_dev_unmap(dev); out_uninit_ctrl: nvme_uninit_ctrl(&dev->ctrl); + nvme_put_ctrl(&dev->ctrl); return result; } -- cgit v1.2.3 From 9630d80655bfe7e62e4aff2889dc4eae7ceeb887 Mon Sep 17 00:00:00 2001 From: Elmer Miroslav Mosher Golovin Date: Wed, 8 Mar 2023 19:19:29 +0300 Subject: nvme-pci: add NVME_QUIRK_BOGUS_NID for Netac NV3000 Added a quirk to fix the Netac NV3000 SSD reporting duplicate NGUIDs. Cc: Signed-off-by: Elmer Miroslav Mosher Golovin Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e77a8a873b1a..8a536d5300a4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3416,6 +3416,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x2646, 0x501E), /* KINGSTON OM3PGP4xxxxQ OS21011 NVMe SSD */ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x1f40, 0x1202), /* Netac Technologies Co. NV3000 NVMe SSD */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1f40, 0x5236), /* Netac Technologies Co. NV7000 NVMe SSD */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1e4B, 0x1001), /* MAXIO MAP1001 */ -- cgit v1.2.3 From b65d44fa0fe072c91bf41cd8756baa2b4c77eff2 Mon Sep 17 00:00:00 2001 From: Philipp Geulen Date: Mon, 13 Mar 2023 11:11:50 +0100 Subject: nvme-pci: add NVME_QUIRK_BOGUS_NID for Lexar NM620 Added a quirk to fix Lexar NM620 1TB SSD reporting duplicate NGUIDs. Signed-off-by: Philipp Geulen Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8a536d5300a4..b615906263f3 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3438,6 +3438,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1d97, 0x2263), /* Lexar NM610 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1d97, 0x1d97), /* Lexar NM620 */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1d97, 0x2269), /* Lexar NM760 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061), -- cgit v1.2.3 From a3406352c54fbc476f4f6b98159c3ea1c7dbb6fc Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 13 Mar 2023 10:56:22 +0200 Subject: nvme-tcp: fix opcode reporting in the timeout handler For non in-capsule writes we reuse the request pdu space for a h2cdata pdu in order to avoid over allocating space (either preallocate or dynamically upon receving an r2t pdu). However if the request times out the core expects to find the opcode in the start of the request, which we override. In order to prevent that, without sacrificing additional 24 bytes per request, we just use the tail of the command pdu space instead (last 24 bytes from the 72 bytes command pdu). That should make the command opcode always available, and we get away from allocating more space. If in the future we would need the last 24 bytes of the nvme command available we would need to allocate a dedicated space for it in the request, but until then we can avoid doing so. Reported-by: Akinobu Mita Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by: Akinobu Mita Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 7723a4989524..2e174fad57d7 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -208,6 +208,18 @@ static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue) return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; } +static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req) +{ + return req->pdu; +} + +static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req) +{ + /* use the pdu space in the back for the data pdu */ + return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) - + sizeof(struct nvme_tcp_data_pdu); +} + static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req) { if (nvme_is_fabrics(req->req.cmd)) @@ -614,7 +626,7 @@ static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req) { - struct nvme_tcp_data_pdu *data = req->pdu; + struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req); struct nvme_tcp_queue *queue = req->queue; struct request *rq = blk_mq_rq_from_pdu(req); u32 h2cdata_sent = req->pdu_len; @@ -1038,7 +1050,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); bool inline_data = nvme_tcp_has_inline_data(req); u8 hdgst = nvme_tcp_hdgst_len(queue); int len = sizeof(*pdu) + hdgst - req->offset; @@ -1077,7 +1089,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; - struct nvme_tcp_data_pdu *pdu = req->pdu; + struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req); u8 hdgst = nvme_tcp_hdgst_len(queue); int len = sizeof(*pdu) - req->offset + hdgst; int ret; @@ -2284,7 +2296,7 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); u8 opc = pdu->cmd.common.opcode, fctype = pdu->cmd.fabrics.fctype; int qid = nvme_tcp_queue_id(req->queue); @@ -2323,7 +2335,7 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue, struct request *rq) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); struct nvme_command *c = &pdu->cmd; c->common.flags |= NVME_CMD_SGL_METABUF; @@ -2343,7 +2355,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, struct request *rq) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); struct nvme_tcp_queue *queue = req->queue; u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0; blk_status_t ret; -- cgit v1.2.3 From 7e87965d3807ab1f518ef2365f91d5ba6b0c5abe Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 13 Mar 2023 10:56:23 +0200 Subject: nvme-tcp: add nvme-tcp pdu size build protection Make sure that we don't somehow mess up the wire structures in the spec. Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2e174fad57d7..42c0598c31f2 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2694,6 +2694,15 @@ static struct nvmf_transport_ops nvme_tcp_transport = { static int __init nvme_tcp_init_module(void) { + BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8); + BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72); + BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24); + BUILD_BUG_ON(sizeof(struct nvme_tcp_rsp_pdu) != 24); + BUILD_BUG_ON(sizeof(struct nvme_tcp_r2t_pdu) != 24); + BUILD_BUG_ON(sizeof(struct nvme_tcp_icreq_pdu) != 128); + BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128); + BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24); + nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!nvme_tcp_wq) -- cgit v1.2.3 From 6173a77b7e9d3e202bdb9897b23f2a8afe7bf286 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 6 Mar 2023 10:13:13 +0900 Subject: nvmet: avoid potential UAF in nvmet_req_complete() An nvme target ->queue_response() operation implementation may free the request passed as argument. Such implementation potentially could result in a use after free of the request pointer when percpu_ref_put() is called in nvmet_req_complete(). Avoid such problem by using a local variable to save the sq pointer before calling __nvmet_req_complete(), thus avoiding dereferencing the req pointer after that function call. Fixes: a07b4970f464 ("nvmet: add a generic NVMe target") Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index f66ed13d7c11..3935165048e7 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -756,8 +756,10 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) void nvmet_req_complete(struct nvmet_req *req, u16 status) { + struct nvmet_sq *sq = req->sq; + __nvmet_req_complete(req, status); - percpu_ref_put(&req->sq->ref); + percpu_ref_put(&sq->ref); } EXPORT_SYMBOL_GPL(nvmet_req_complete); -- cgit v1.2.3 From 5f27571382ca42daa3e3d40d1b252bf18c2b61d2 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 23 Feb 2023 17:12:26 +0800 Subject: block: count 'ios' and 'sectors' when io is done for bio-based device While using iostat for raid, I observed very strange 'await' occasionally, and turns out it's due to that 'ios' and 'sectors' is counted in bdev_start_io_acct(), while 'nsecs' is counted in bdev_end_io_acct(). I'm not sure why they are ccounted like that but I think this behaviour is obviously wrong because user will get wrong disk stats. Fix the problem by counting 'ios' and 'sectors' when io is done, like what rq-based device does. Fixes: 394ffa503bc4 ("blk: introduce generic io stat accounting help function") Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230223091226.1135678-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-core.c | 16 ++++++---------- drivers/md/dm.c | 6 +++--- drivers/nvme/host/multipath.c | 8 ++++---- include/linux/blkdev.h | 5 ++--- 4 files changed, 15 insertions(+), 20 deletions(-) (limited to 'drivers/nvme') diff --git a/block/blk-core.c b/block/blk-core.c index 9e5e0277a4d9..42926e6cb83c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -959,16 +959,11 @@ again: } } -unsigned long bdev_start_io_acct(struct block_device *bdev, - unsigned int sectors, enum req_op op, +unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time) { - const int sgrp = op_stat_group(op); - part_stat_lock(); update_io_ticks(bdev, start_time, false); - part_stat_inc(bdev, ios[sgrp]); - part_stat_add(bdev, sectors[sgrp], sectors); part_stat_local_inc(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); @@ -984,13 +979,12 @@ EXPORT_SYMBOL(bdev_start_io_acct); */ unsigned long bio_start_io_acct(struct bio *bio) { - return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio), - bio_op(bio), jiffies); + return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies); } EXPORT_SYMBOL_GPL(bio_start_io_acct); void bdev_end_io_acct(struct block_device *bdev, enum req_op op, - unsigned long start_time) + unsigned int sectors, unsigned long start_time) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); @@ -998,6 +992,8 @@ void bdev_end_io_acct(struct block_device *bdev, enum req_op op, part_stat_lock(); update_io_ticks(bdev, now, true); + part_stat_inc(bdev, ios[sgrp]); + part_stat_add(bdev, sectors[sgrp], sectors); part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration)); part_stat_local_dec(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); @@ -1007,7 +1003,7 @@ EXPORT_SYMBOL(bdev_end_io_acct); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev) { - bdev_end_io_acct(orig_bdev, bio_op(bio), start_time); + bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time); } EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index eace45a18d45..f5cc330bb549 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -512,10 +512,10 @@ static void dm_io_acct(struct dm_io *io, bool end) sectors = io->sectors; if (!end) - bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio), - start_time); + bdev_start_io_acct(bio->bi_bdev, bio_op(bio), start_time); else - bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time); + bdev_end_io_acct(bio->bi_bdev, bio_op(bio), sectors, + start_time); if (static_branch_unlikely(&stats_enabled) && unlikely(dm_stats_used(&md->stats))) { diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index fc39d01e7b63..9171452e2f6d 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -123,9 +123,8 @@ void nvme_mpath_start_request(struct request *rq) return; nvme_req(rq)->flags |= NVME_MPATH_IO_STATS; - nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, - blk_rq_bytes(rq) >> SECTOR_SHIFT, - req_op(rq), jiffies); + nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq), + jiffies); } EXPORT_SYMBOL_GPL(nvme_mpath_start_request); @@ -136,7 +135,8 @@ void nvme_mpath_end_request(struct request *rq) if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS)) return; bdev_end_io_acct(ns->head->disk->part0, req_op(rq), - nvme_req(rq)->start_time); + blk_rq_bytes(rq) >> SECTOR_SHIFT, + nvme_req(rq)->start_time); } void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d1aee08f8c18..941304f17492 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1446,11 +1446,10 @@ static inline void blk_wake_io_task(struct task_struct *waiter) wake_up_process(waiter); } -unsigned long bdev_start_io_acct(struct block_device *bdev, - unsigned int sectors, enum req_op op, +unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time); void bdev_end_io_acct(struct block_device *bdev, enum req_op op, - unsigned long start_time); + unsigned int sectors, unsigned long start_time); unsigned long bio_start_io_acct(struct bio *bio); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, -- cgit v1.2.3 From 9d2789ac9d60c049d26ef6d3005d9c94c5a559e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Mar 2023 20:01:25 -0600 Subject: block/io_uring: pass in issue_flags for uring_cmd task_work handling io_uring_cmd_done() currently assumes that the uring_lock is held when invoked, and while it generally is, this is not guaranteed. Pass in the issue_flags associated with it, so that we have IO_URING_F_UNLOCKED available to be able to lock the CQ ring appropriately when completing events. Cc: stable@vger.kernel.org Fixes: ee692a21e9bf ("fs,io_uring: add infrastructure for uring-cmd") Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 31 ++++++++++++++++++------------- drivers/nvme/host/ioctl.c | 14 ++++++++------ include/linux/io_uring.h | 11 ++++++----- io_uring/uring_cmd.c | 10 ++++++---- 4 files changed, 38 insertions(+), 28 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index fb5a557afde8..c73cc57ec547 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -715,7 +715,8 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, } } -static void ubq_complete_io_cmd(struct ublk_io *io, int res) +static void ubq_complete_io_cmd(struct ublk_io *io, int res, + unsigned issue_flags) { /* mark this cmd owned by ublksrv */ io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV; @@ -727,7 +728,7 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res) io->flags &= ~UBLK_IO_FLAG_ACTIVE; /* tell ublksrv one io request is coming */ - io_uring_cmd_done(io->cmd, res, 0); + io_uring_cmd_done(io->cmd, res, 0, issue_flags); } #define UBLK_REQUEUE_DELAY_MS 3 @@ -744,7 +745,8 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); } -static inline void __ublk_rq_task_work(struct request *req) +static inline void __ublk_rq_task_work(struct request *req, + unsigned issue_flags) { struct ublk_queue *ubq = req->mq_hctx->driver_data; int tag = req->tag; @@ -782,7 +784,7 @@ static inline void __ublk_rq_task_work(struct request *req) pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n", __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags); - ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA); + ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags); return; } /* @@ -820,17 +822,18 @@ static inline void __ublk_rq_task_work(struct request *req) mapped_bytes >> 9; } - ubq_complete_io_cmd(io, UBLK_IO_RES_OK); + ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags); } -static inline void ublk_forward_io_cmds(struct ublk_queue *ubq) +static inline void ublk_forward_io_cmds(struct ublk_queue *ubq, + unsigned issue_flags) { struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); struct ublk_rq_data *data, *tmp; io_cmds = llist_reverse_order(io_cmds); llist_for_each_entry_safe(data, tmp, io_cmds, node) - __ublk_rq_task_work(blk_mq_rq_from_pdu(data)); + __ublk_rq_task_work(blk_mq_rq_from_pdu(data), issue_flags); } static inline void ublk_abort_io_cmds(struct ublk_queue *ubq) @@ -842,12 +845,12 @@ static inline void ublk_abort_io_cmds(struct ublk_queue *ubq) __ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data)); } -static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) +static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags) { struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->ubq; - ublk_forward_io_cmds(ubq); + ublk_forward_io_cmds(ubq, issue_flags); } static void ublk_rq_task_work_fn(struct callback_head *work) @@ -856,8 +859,9 @@ static void ublk_rq_task_work_fn(struct callback_head *work) struct ublk_rq_data, work); struct request *req = blk_mq_rq_from_pdu(data); struct ublk_queue *ubq = req->mq_hctx->driver_data; + unsigned issue_flags = IO_URING_F_UNLOCKED; - ublk_forward_io_cmds(ubq); + ublk_forward_io_cmds(ubq, issue_flags); } static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) @@ -1111,7 +1115,8 @@ static void ublk_cancel_queue(struct ublk_queue *ubq) struct ublk_io *io = &ubq->ios[i]; if (io->flags & UBLK_IO_FLAG_ACTIVE) - io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0); + io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, + IO_URING_F_UNLOCKED); } /* all io commands are canceled */ @@ -1351,7 +1356,7 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EIOCBQUEUED; out: - io_uring_cmd_done(cmd, ret, 0); + io_uring_cmd_done(cmd, ret, 0, issue_flags); pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", __func__, cmd_op, tag, ret, io->flags); return -EIOCBQUEUED; @@ -2234,7 +2239,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (ub) ublk_put_device(ub); out: - io_uring_cmd_done(cmd, ret, 0); + io_uring_cmd_done(cmd, ret, 0, issue_flags); pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id); return -EIOCBQUEUED; diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 723e7d5b778f..d24ea2e05156 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -464,7 +464,8 @@ static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; } -static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd) +static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd, + unsigned issue_flags) { struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); struct request *req = pdu->req; @@ -485,17 +486,18 @@ static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd) blk_rq_unmap_user(req->bio); blk_mq_free_request(req); - io_uring_cmd_done(ioucmd, status, result); + io_uring_cmd_done(ioucmd, status, result, issue_flags); } -static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd) +static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, + unsigned issue_flags) { struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); if (pdu->bio) blk_rq_unmap_user(pdu->bio); - io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result); + io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result, issue_flags); } static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, @@ -517,7 +519,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, * Otherwise, move the completion to task work. */ if (cookie != NULL && blk_rq_is_poll(req)) - nvme_uring_task_cb(ioucmd); + nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); else io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb); @@ -539,7 +541,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, * Otherwise, move the completion to task work. */ if (cookie != NULL && blk_rq_is_poll(req)) - nvme_uring_task_meta_cb(ioucmd); + nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED); else io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb); diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 934e5dd4ccc0..35b9328ca335 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -27,7 +27,7 @@ struct io_uring_cmd { const void *cmd; union { /* callback to defer completions to task context */ - void (*task_work_cb)(struct io_uring_cmd *cmd); + void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); /* used for polled completion */ void *cookie; }; @@ -39,9 +39,10 @@ struct io_uring_cmd { #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd); -void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2); +void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, + unsigned issue_flags); void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *)); + void (*task_work_cb)(struct io_uring_cmd *, unsigned)); struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); @@ -72,11 +73,11 @@ static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, return -EOPNOTSUPP; } static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, - ssize_t ret2) + ssize_t ret2, unsigned issue_flags) { } static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *)) + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) { } static inline struct sock *io_uring_get_socket(struct file *file) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 446a189b78b0..e535e8db01e3 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -15,12 +15,13 @@ static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; - ioucmd->task_work_cb(ioucmd); + ioucmd->task_work_cb(ioucmd, issue_flags); } void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *)) + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); @@ -42,7 +43,8 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req, * Called by consumers of io_uring_cmd, if they originally returned * -EIOCBQUEUED upon receiving the command. */ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, + unsigned issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); @@ -56,7 +58,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); else - io_req_complete_post(req, 0); + io_req_complete_post(req, issue_flags); } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -- cgit v1.2.3 From def84ab600b71ea3fcc422a876d5d0d0daa7d4f3 Mon Sep 17 00:00:00 2001 From: Martin George Date: Thu, 16 Mar 2023 17:20:09 +0530 Subject: nvme: send Identify with CNS 06h only to I/O controllers Identify CNS 06h (I/O Command Set Specific Identify Controller data structure) is supported only on i/o controllers. But nvme_init_non_mdts_limits() currently invokes this on all controllers. Correct this by ensuring this is sent to I/O controllers only. Signed-off-by: Martin George Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d4be525f8100..53ef028596c6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3063,7 +3063,8 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) else ctrl->max_zeroes_sectors = 0; - if (nvme_ctrl_limited_cns(ctrl)) + if (ctrl->subsys->subtype != NVME_NQN_NVME || + nvme_ctrl_limited_cns(ctrl)) return 0; id = kzalloc(sizeof(*id), GFP_KERNEL); -- cgit v1.2.3 From 1231363aec86704a6b0467a12e3ca7bdf890e01d Mon Sep 17 00:00:00 2001 From: Juraj Pecigos Date: Sun, 26 Mar 2023 11:29:49 +0200 Subject: nvme-pci: mark Lexar NM760 as IGNORE_DEV_SUBNQN A system with more than one of these SSDs will only have one usable. The kernel fails to detect more than one nvme device due to duplicate cntlids. before: [ 9.395229] nvme 0000:01:00.0: platform quirk: setting simple suspend [ 9.395262] nvme nvme0: pci function 0000:01:00.0 [ 9.395282] nvme 0000:03:00.0: platform quirk: setting simple suspend [ 9.395305] nvme nvme1: pci function 0000:03:00.0 [ 9.409873] nvme nvme0: Duplicate cntlid 1 with nvme1, subsys nqn.2022-07.com.siliconmotion:nvm-subsystem-sn- , rejecting [ 9.409982] nvme nvme0: Removing after probe failure status: -22 [ 9.427487] nvme nvme1: allocated 64 MiB host memory buffer. [ 9.445088] nvme nvme1: 16/0/0 default/read/poll queues [ 9.449898] nvme nvme1: Ignoring bogus Namespace Identifiers after: [ 1.161890] nvme 0000:01:00.0: platform quirk: setting simple suspend [ 1.162660] nvme nvme0: pci function 0000:01:00.0 [ 1.162684] nvme 0000:03:00.0: platform quirk: setting simple suspend [ 1.162707] nvme nvme1: pci function 0000:03:00.0 [ 1.191354] nvme nvme0: allocated 64 MiB host memory buffer. [ 1.193378] nvme nvme1: allocated 64 MiB host memory buffer. [ 1.211044] nvme nvme1: 16/0/0 default/read/poll queues [ 1.211080] nvme nvme0: 16/0/0 default/read/poll queues [ 1.216145] nvme nvme0: Ignoring bogus Namespace Identifiers [ 1.216261] nvme nvme1: Ignoring bogus Namespace Identifiers Adding the NVME_QUIRK_IGNORE_DEV_SUBNQN quirk to resolves the issue. Signed-off-by: Juraj Pecigos Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b615906263f3..282d808400c5 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3441,7 +3441,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(0x1d97, 0x1d97), /* Lexar NM620 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1d97, 0x2269), /* Lexar NM760 */ - .driver_data = NVME_QUIRK_BOGUS_NID, }, + .driver_data = NVME_QUIRK_BOGUS_NID | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061), .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065), -- cgit v1.2.3 From 88eaba80328b31ef81813a1207b4056efd7006a6 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 20 Mar 2023 15:33:34 +0200 Subject: nvme-tcp: fix a possible UAF when failing to allocate an io queue When we allocate a nvme-tcp queue, we set the data_ready callback before we actually need to use it. This creates the potential that if a stray controller sends us data on the socket before we connect, we can trigger the io_work and start consuming the socket. In this case reported: we failed to allocate one of the io queues, and as we start releasing the queues that we already allocated, we get a UAF [1] from the io_work which is running before it should really. Fix this by setting the socket ops callbacks only before we start the queue, so that we can't accidentally schedule the io_work in the initialization phase before the queue started. While we are at it, rename nvme_tcp_restore_sock_calls to pair with nvme_tcp_setup_sock_ops. [1]: [16802.107284] nvme nvme4: starting error recovery [16802.109166] nvme nvme4: Reconnecting in 10 seconds... [16812.173535] nvme nvme4: failed to connect socket: -111 [16812.173745] nvme nvme4: Failed reconnect attempt 1 [16812.173747] nvme nvme4: Reconnecting in 10 seconds... [16822.413555] nvme nvme4: failed to connect socket: -111 [16822.413762] nvme nvme4: Failed reconnect attempt 2 [16822.413765] nvme nvme4: Reconnecting in 10 seconds... [16832.661274] nvme nvme4: creating 32 I/O queues. [16833.919887] BUG: kernel NULL pointer dereference, address: 0000000000000088 [16833.920068] nvme nvme4: Failed reconnect attempt 3 [16833.920094] #PF: supervisor write access in kernel mode [16833.920261] nvme nvme4: Reconnecting in 10 seconds... [16833.920368] #PF: error_code(0x0002) - not-present page [16833.921086] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] [16833.921191] RIP: 0010:_raw_spin_lock_bh+0x17/0x30 ... [16833.923138] Call Trace: [16833.923271] [16833.923402] lock_sock_nested+0x1e/0x50 [16833.923545] nvme_tcp_try_recv+0x40/0xa0 [nvme_tcp] [16833.923685] nvme_tcp_io_work+0x68/0xa0 [nvme_tcp] [16833.923824] process_one_work+0x1e8/0x390 [16833.923969] worker_thread+0x53/0x3d0 [16833.924104] ? process_one_work+0x390/0x390 [16833.924240] kthread+0x124/0x150 [16833.924376] ? set_kthread_struct+0x50/0x50 [16833.924518] ret_from_fork+0x1f/0x30 [16833.924655] Reported-by: Yanjun Zhang Signed-off-by: Sagi Grimberg Tested-by: Yanjun Zhang Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 42c0598c31f2..49c9e7bc9116 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1620,22 +1620,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) if (ret) goto err_init_connect; - queue->rd_enabled = true; set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags); - nvme_tcp_init_recv_ctx(queue); - - write_lock_bh(&queue->sock->sk->sk_callback_lock); - queue->sock->sk->sk_user_data = queue; - queue->state_change = queue->sock->sk->sk_state_change; - queue->data_ready = queue->sock->sk->sk_data_ready; - queue->write_space = queue->sock->sk->sk_write_space; - queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; - queue->sock->sk->sk_state_change = nvme_tcp_state_change; - queue->sock->sk->sk_write_space = nvme_tcp_write_space; -#ifdef CONFIG_NET_RX_BUSY_POLL - queue->sock->sk->sk_ll_usec = 1; -#endif - write_unlock_bh(&queue->sock->sk->sk_callback_lock); return 0; @@ -1655,7 +1640,7 @@ err_destroy_mutex: return ret; } -static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue) +static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue) { struct socket *sock = queue->sock; @@ -1670,7 +1655,7 @@ static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue) static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue) { kernel_sock_shutdown(queue->sock, SHUT_RDWR); - nvme_tcp_restore_sock_calls(queue); + nvme_tcp_restore_sock_ops(queue); cancel_work_sync(&queue->io_work); } @@ -1688,21 +1673,42 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) mutex_unlock(&queue->queue_lock); } +static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue) +{ + write_lock_bh(&queue->sock->sk->sk_callback_lock); + queue->sock->sk->sk_user_data = queue; + queue->state_change = queue->sock->sk->sk_state_change; + queue->data_ready = queue->sock->sk->sk_data_ready; + queue->write_space = queue->sock->sk->sk_write_space; + queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; + queue->sock->sk->sk_state_change = nvme_tcp_state_change; + queue->sock->sk->sk_write_space = nvme_tcp_write_space; +#ifdef CONFIG_NET_RX_BUSY_POLL + queue->sock->sk->sk_ll_usec = 1; +#endif + write_unlock_bh(&queue->sock->sk->sk_callback_lock); +} + static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[idx]; int ret; + queue->rd_enabled = true; + nvme_tcp_init_recv_ctx(queue); + nvme_tcp_setup_sock_ops(queue); + if (idx) ret = nvmf_connect_io_queue(nctrl, idx); else ret = nvmf_connect_admin_queue(nctrl); if (!ret) { - set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags); + set_bit(NVME_TCP_Q_LIVE, &queue->flags); } else { - if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags)) - __nvme_tcp_stop_queue(&ctrl->queues[idx]); + if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) + __nvme_tcp_stop_queue(queue); dev_err(nctrl->device, "failed to connect queue: %d ret=%d\n", idx, ret); } -- cgit v1.2.3