From 2b3f056f72e56fa07df69b4705e0b46a6c08e77c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Oct 2022 15:57:17 +0200 Subject: blk-mq: move the call to blk_put_queue out of blk_mq_destroy_queue The fact that blk_mq_destroy_queue also drops a queue reference leads to various places having to grab an extra reference. Move the call to blk_put_queue into the callers to allow removing the extra references. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20221018135720.670094-2-hch@lst.de [axboe: fix fabrics_q vs admin_q conflict in nvme core.c] Signed-off-by: Jens Axboe --- drivers/nvme/host/apple.c | 1 + drivers/nvme/host/core.c | 10 ++++++++-- drivers/nvme/host/pci.c | 1 + 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index ff8b083dc5c6..7b8068aab7fc 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1510,6 +1510,7 @@ static int apple_nvme_probe(struct platform_device *pdev) if (!blk_get_queue(anv->ctrl.admin_q)) { nvme_start_admin_queue(&anv->ctrl); blk_mq_destroy_queue(anv->ctrl.admin_q); + blk_put_queue(anv->ctrl.admin_q); anv->ctrl.admin_q = NULL; ret = -ENODEV; goto put_dev; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dc4220600585..0090dc0b3ae6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4851,6 +4851,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, out_cleanup_admin_q: blk_mq_destroy_queue(ctrl->admin_q); + blk_put_queue(ctrl->admin_q); out_free_tagset: blk_mq_free_tag_set(ctrl->admin_tagset); return ret; @@ -4860,8 +4861,11 @@ EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set); void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl) { blk_mq_destroy_queue(ctrl->admin_q); - if (ctrl->ops->flags & NVME_F_FABRICS) + blk_put_queue(ctrl->admin_q); + if (ctrl->ops->flags & NVME_F_FABRICS) { blk_mq_destroy_queue(ctrl->fabrics_q); + blk_put_queue(ctrl->fabrics_q); + } blk_mq_free_tag_set(ctrl->admin_tagset); } EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set); @@ -4907,8 +4911,10 @@ EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set); void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl) { - if (ctrl->ops->flags & NVME_F_FABRICS) + if (ctrl->ops->flags & NVME_F_FABRICS) { blk_mq_destroy_queue(ctrl->connect_q); + blk_put_queue(ctrl->connect_q); + } blk_mq_free_tag_set(ctrl->tagset); } EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 31e577b01257..7d37c81fbe70 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1749,6 +1749,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) */ nvme_start_admin_queue(&dev->ctrl); blk_mq_destroy_queue(dev->ctrl.admin_q); + blk_put_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } } -- cgit v1.2.3 From 7dcebef90d35de13a326f765dd787538880566f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Oct 2022 15:57:19 +0200 Subject: nvme-pci: remove an extra queue reference Now that blk_mq_destroy_queue does not release the queue reference, there is no need for a second admin queue reference to be held by the nvme_dev. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20221018135720.670094-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7d37c81fbe70..5f1d71ac0086 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1749,7 +1749,6 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) */ nvme_start_admin_queue(&dev->ctrl); blk_mq_destroy_queue(dev->ctrl.admin_q); - blk_put_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } } @@ -1778,11 +1777,6 @@ static int nvme_pci_alloc_admin_tag_set(struct nvme_dev *dev) dev->ctrl.admin_q = NULL; return -ENOMEM; } - if (!blk_get_queue(dev->ctrl.admin_q)) { - nvme_dev_remove_admin(dev); - dev->ctrl.admin_q = NULL; - return -ENODEV; - } return 0; } -- cgit v1.2.3 From 941f7298c70c7668416e7845fa76eb72c07d966b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Oct 2022 15:57:20 +0200 Subject: nvme-apple: remove an extra queue reference Now that blk_mq_destroy_queue does not release the queue reference, there is no need for a second admin queue reference to be held by the apple_nvme structure. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Sven Peter Reviewed-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20221018135720.670094-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/apple.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 7b8068aab7fc..a89b06a7099f 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1507,15 +1507,6 @@ static int apple_nvme_probe(struct platform_device *pdev) goto put_dev; } - if (!blk_get_queue(anv->ctrl.admin_q)) { - nvme_start_admin_queue(&anv->ctrl); - blk_mq_destroy_queue(anv->ctrl.admin_q); - blk_put_queue(anv->ctrl.admin_q); - anv->ctrl.admin_q = NULL; - ret = -ENODEV; - goto put_dev; - } - nvme_reset_ctrl(&anv->ctrl); async_schedule(apple_nvme_async_probe, anv); -- cgit v1.2.3 From 71b26083d59cd4ab22489829ffe7d4ead93f5546 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:37 +0100 Subject: block: set the disk capacity to 0 in blk_mark_disk_dead nvme and xen-blkfront are already doing this to stop buffered writes from creating dirty pages that can't be written out later. Move it to the common code. This also removes the comment about the ordering from nvme, as bd_mutex not only is gone entirely, but also hasn't been used for locking updates to the disk size long before that, and thus the ordering requirement documented there doesn't apply any more. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Ming Lei Reviewed-by: Chao Leng Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 5 +++++ drivers/block/xen-blkfront.c | 1 - drivers/nvme/host/core.c | 7 +------ 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'drivers/nvme') diff --git a/block/genhd.c b/block/genhd.c index 493b93faee9c..e7bd036024fa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -555,6 +555,11 @@ void blk_mark_disk_dead(struct gendisk *disk) { set_bit(GD_DEAD, &disk->state); blk_queue_start_drain(disk->queue); + + /* + * Stop buffered writers from dirtying pages that can't be written out. + */ + set_capacity_and_notify(disk, 0); } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 35b9bcad9db9..b28489290323 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2129,7 +2129,6 @@ static void blkfront_closing(struct blkfront_info *info) if (info->rq && info->gd) { blk_mq_stop_hw_queues(info->rq); blk_mark_disk_dead(info->gd); - set_capacity(info->gd, 0); } for_each_rinfo(info, rinfo, i) { diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0090dc0b3ae6..aea0f89acf40 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5116,10 +5116,7 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns) /* * Prepare a queue for teardown. * - * This must forcibly unquiesce queues to avoid blocking dispatch, and only set - * the capacity to 0 after that to avoid blocking dispatchers that may be - * holding bd_butex. This will end buffered writers dirtying pages that can't - * be synced. + * This must forcibly unquiesce queues to avoid blocking dispatch. */ static void nvme_set_queue_dying(struct nvme_ns *ns) { @@ -5128,8 +5125,6 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) blk_mark_disk_dead(ns->disk); nvme_start_ns_queue(ns); - - set_capacity_and_notify(ns->disk, 0); } /** -- cgit v1.2.3 From 0ffc7e98bfaa45380b800deeb9b65ce0371c652d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:38 +0100 Subject: nvme-pci: refactor the tagset handling in nvme_reset_work The code to create, update or delete a tagset and namespaces in nvme_reset_work is a bit convoluted. Refactor it with a two high-level conditionals for first probe vs reset and I/O queues vs no I/O queues to make the code flow more clear. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-3-hch@lst.de [axboe: fix whitespace issue] Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5f1d71ac0086..6ab7be07b727 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2898,24 +2898,36 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - /* - * Keep the controller around but remove all namespaces if we don't have - * any working I/O queue. - */ - if (dev->online_queues < 2) { - dev_warn(dev->ctrl.device, "IO queues not created\n"); - nvme_kill_queues(&dev->ctrl); - nvme_remove_namespaces(&dev->ctrl); - nvme_free_tagset(dev); + if (dev->ctrl.tagset) { + /* + * This is a controller reset and we already have a tagset. + * Freeze and update the number of I/O queues as thos might have + * changed. If there are no I/O queues left after this reset, + * keep the controller around but remove all namespaces. + */ + if (dev->online_queues > 1) { + nvme_start_queues(&dev->ctrl); + nvme_wait_freeze(&dev->ctrl); + nvme_pci_update_nr_queues(dev); + nvme_dbbuf_set(dev); + nvme_unfreeze(&dev->ctrl); + } else { + dev_warn(dev->ctrl.device, "IO queues lost\n"); + nvme_kill_queues(&dev->ctrl); + nvme_remove_namespaces(&dev->ctrl); + nvme_free_tagset(dev); + } } else { - nvme_start_queues(&dev->ctrl); - nvme_wait_freeze(&dev->ctrl); - if (!dev->ctrl.tagset) + /* + * First probe. Still allow the controller to show up even if + * there are no namespaces. + */ + if (dev->online_queues > 1) { nvme_pci_alloc_tag_set(dev); - else - nvme_pci_update_nr_queues(dev); - nvme_dbbuf_set(dev); - nvme_unfreeze(&dev->ctrl); + nvme_dbbuf_set(dev); + } else { + dev_warn(dev->ctrl.device, "IO queues not created\n"); + } } /* -- cgit v1.2.3 From 23a908647efade186576c9628dd7bb560f6e759b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:39 +0100 Subject: nvme: don't remove namespaces in nvme_passthru_end The call to nvme_remove_invalid_namespaces made sense when nvme_passthru_end revalidated all namespaces and had to remove those that didn't exist any more. Since we don't revalidate from nvme_passthru_end now, this call is entirely spurious. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index aea0f89acf40..35386aa24f58 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1118,7 +1118,6 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, nvme_unfreeze(ctrl); nvme_mpath_unfreeze(ctrl->subsys); mutex_unlock(&ctrl->subsys->lock); - nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); mutex_unlock(&ctrl->scan_lock); } if (effects & NVME_CMD_EFFECTS_CCC) -- cgit v1.2.3 From 4f17344e9daeb6e9f89976d811a5373710ed1f04 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:40 +0100 Subject: nvme: remove the NVME_NS_DEAD check in nvme_remove_invalid_namespaces The NVME_NS_DEAD check only made sense when we revalidated namespaces in nvme_passthrough_end for commands that affected the namespace inventory. These days NVME_NS_DEAD is only set during reset or when tearing down namespaces, and we always remove all namespaces right after that. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 35386aa24f58..390f2a2db323 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4407,7 +4407,7 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, down_write(&ctrl->namespaces_rwsem); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { - if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags)) + if (ns->head->ns_id > nsid) list_move_tail(&ns->list, &rm_list); } up_write(&ctrl->namespaces_rwsem); -- cgit v1.2.3 From fde776afdd8467a09395a7aebdb2499f86315945 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:41 +0100 Subject: nvme: remove the NVME_NS_DEAD check in nvme_validate_ns At the point where namespaces are marked dead, the controller is in a non-live state and we won't get pass the identify commands. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 390f2a2db323..871a8ab7ec19 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4333,10 +4333,6 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info) { int ret = NVME_SC_INVALID_NS | NVME_SC_DNR; - if (test_bit(NVME_NS_DEAD, &ns->flags)) - goto out; - - ret = NVME_SC_INVALID_NS | NVME_SC_DNR; if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) { dev_err(ns->ctrl->device, "identifiers changed for nsid %d\n", ns->head->ns_id); -- cgit v1.2.3 From 6bcd5089ee1302e9ad7072ca0866f0c5a1158359 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:42 +0100 Subject: nvme: don't unquiesce the admin queue in nvme_kill_queues None of the callers of nvme_kill_queues needs it to unquiesce the admin queues, as all of them already do it themselves: 1) nvme_reset_work explicit call nvme_start_admin_queue toward the beginning of the function. The extra call to nvme_start_admin_queue in nvme_reset_work this won't do anything as NVME_CTRL_ADMIN_Q_STOPPED will already be cleared. 2) nvme_remove calls nvme_dev_disable with shutdown flag set to true at the very beginning of the function if the PCIe device was not present, which is the precondition for the call to nvme_kill_queues. nvme_dev_disable already calls nvme_start_admin_queue toward the end of the function when the shutdown flag is set to true, so the admin queue is already enabled at this point. 3) nvme_remove_dead_ctrl schedules a workqueue to unbind the driver, which will end up in nvme_remove, which calls nvme_dev_disable with the shutdown flag. This case will call nvme_start_admin_queue a bit later than before. 4) apple_nvme_remove uses the same sequence as nvme_remove_dead_ctrl above. 5) nvme_remove_namespaces only calls nvme_kill_queues when the controller is in the DEAD state. That can only happen in the PCIe driver, and only from nvme_remove. See item 2) above for the conditions there. So it is safe to just remove the call to nvme_start_admin_queue in nvme_kill_queues without replacement. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 871a8ab7ec19..bb62803de5b2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5135,10 +5135,6 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) down_read(&ctrl->namespaces_rwsem); - /* Forcibly unquiesce queues to avoid blocking dispatch */ - if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q)) - nvme_start_admin_queue(ctrl); - list_for_each_entry(ns, &ctrl->namespaces, list) nvme_set_queue_dying(ns); -- cgit v1.2.3 From cd50f9b24726e9e195a0682c8d8d952396d57aef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:43 +0100 Subject: nvme: split nvme_kill_queues nvme_kill_queues does two things: 1) mark the gendisk of all namespaces dead 2) unquiesce all I/O queues These used to be be intertwined due to block layer issues, but aren't any more. So move the unquiscing of the I/O queues into the callers, and rename the rest of the function to the now more descriptive nvme_mark_namespaces_dead. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/apple.c | 3 ++- drivers/nvme/host/core.c | 36 ++++++++---------------------------- drivers/nvme/host/nvme.h | 3 +-- drivers/nvme/host/pci.c | 6 ++++-- 4 files changed, 15 insertions(+), 33 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index a89b06a7099f..6c09703ffe92 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1153,7 +1153,8 @@ out: nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_DELETING); nvme_get_ctrl(&anv->ctrl); apple_nvme_disable(anv, false); - nvme_kill_queues(&anv->ctrl); + nvme_mark_namespaces_dead(&anv->ctrl); + nvme_start_queues(&anv->ctrl); if (!queue_work(nvme_wq, &anv->remove_work)) nvme_put_ctrl(&anv->ctrl); } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bb62803de5b2..ed06fcb87f93 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4561,8 +4561,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) * removing the namespaces' disks; fail all the queues now to avoid * potentially having to clean up the failed sync later. */ - if (ctrl->state == NVME_CTRL_DEAD) - nvme_kill_queues(ctrl); + if (ctrl->state == NVME_CTRL_DEAD) { + nvme_mark_namespaces_dead(ctrl); + nvme_start_queues(ctrl); + } /* this is a no-op when called from the controller reset handler */ nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); @@ -5108,39 +5110,17 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns) blk_mq_wait_quiesce_done(ns->queue); } -/* - * Prepare a queue for teardown. - * - * This must forcibly unquiesce queues to avoid blocking dispatch. - */ -static void nvme_set_queue_dying(struct nvme_ns *ns) -{ - if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) - return; - - blk_mark_disk_dead(ns->disk); - nvme_start_ns_queue(ns); -} - -/** - * nvme_kill_queues(): Ends all namespace queues - * @ctrl: the dead controller that needs to end - * - * Call this function when the driver determines it is unable to get the - * controller in a state capable of servicing IO. - */ -void nvme_kill_queues(struct nvme_ctrl *ctrl) +/* let I/O to all namespaces fail in preparation for surprise removal */ +void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - nvme_set_queue_dying(ns); - + blk_mark_disk_dead(ns->disk); up_read(&ctrl->namespaces_rwsem); } -EXPORT_SYMBOL_GPL(nvme_kill_queues); +EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead); void nvme_unfreeze(struct nvme_ctrl *ctrl) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a29877217ee6..1cd716800243 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -483,7 +483,6 @@ struct nvme_ns { unsigned long features; unsigned long flags; #define NVME_NS_REMOVING 0 -#define NVME_NS_DEAD 1 #define NVME_NS_ANA_PENDING 2 #define NVME_NS_FORCE_RO 3 #define NVME_NS_READY 4 @@ -758,7 +757,7 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); void nvme_stop_admin_queue(struct nvme_ctrl *ctrl); void nvme_start_admin_queue(struct nvme_ctrl *ctrl); -void nvme_kill_queues(struct nvme_ctrl *ctrl); +void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_sync_io_queues(struct nvme_ctrl *ctrl); void nvme_unfreeze(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6ab7be07b727..f72143945ab5 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2788,7 +2788,8 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev) nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); nvme_get_ctrl(&dev->ctrl); nvme_dev_disable(dev, false); - nvme_kill_queues(&dev->ctrl); + nvme_mark_namespaces_dead(&dev->ctrl); + nvme_start_queues(&dev->ctrl); if (!queue_work(nvme_wq, &dev->remove_work)) nvme_put_ctrl(&dev->ctrl); } @@ -2913,7 +2914,8 @@ static void nvme_reset_work(struct work_struct *work) nvme_unfreeze(&dev->ctrl); } else { dev_warn(dev->ctrl.device, "IO queues lost\n"); - nvme_kill_queues(&dev->ctrl); + nvme_mark_namespaces_dead(&dev->ctrl); + nvme_start_queues(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); nvme_free_tagset(dev); } -- cgit v1.2.3 From bad3e021ae2bb5ac9d650c9a04788efe753367f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:44 +0100 Subject: nvme-pci: don't unquiesce the I/O queues in nvme_remove_dead_ctrl nvme_remove_dead_ctrl schedules nvme_remove to be called, which will call nvme_dev_disable and unquiesce the I/O queues. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f72143945ab5..208c387f1558 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2789,7 +2789,6 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev) nvme_get_ctrl(&dev->ctrl); nvme_dev_disable(dev, false); nvme_mark_namespaces_dead(&dev->ctrl); - nvme_start_queues(&dev->ctrl); if (!queue_work(nvme_wq, &dev->remove_work)) nvme_put_ctrl(&dev->ctrl); } -- cgit v1.2.3 From 2b4c2355c5e155cdf341d9ce2c2355b4b26c32c9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:45 +0100 Subject: nvme-apple: don't unquiesce the I/O queues in apple_nvme_reset_work apple_nvme_reset_work schedules apple_nvme_remove, to be called, which will call apple_nvme_disable and unquiesce the I/O queues. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/apple.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 6c09703ffe92..24e224c279a4 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1154,7 +1154,6 @@ out: nvme_get_ctrl(&anv->ctrl); apple_nvme_disable(anv, false); nvme_mark_namespaces_dead(&anv->ctrl); - nvme_start_queues(&anv->ctrl); if (!queue_work(nvme_wq, &anv->remove_work)) nvme_put_ctrl(&anv->ctrl); } -- cgit v1.2.3 From 483239c75ba768e0e2c0e0c503e5fc13c3d5773a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:48 +0100 Subject: blk-mq: pass a tagset to blk_mq_wait_quiesce_done Nothing in blk_mq_wait_quiesce_done needs the request_queue now, so just pass the tagset, and move the non-mq check into the only caller that needs it. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chao Leng Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-13-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 +++++++++------- drivers/nvme/host/core.c | 4 ++-- drivers/scsi/scsi_lib.c | 2 +- include/linux/blk-mq.h | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) (limited to 'drivers/nvme') diff --git a/block/blk-mq.c b/block/blk-mq.c index bee728dac9cd..b7abfda1ea69 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -254,15 +254,17 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); /** * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done - * @q: request queue. + * @set: tag_set to wait on * * Note: it is driver's responsibility for making sure that quiesce has - * been started. + * been started on or more of the request_queues of the tag_set. This + * function only waits for the quiesce on those request_queues that had + * the quiesce flag set using blk_mq_quiesce_queue_nowait. */ -void blk_mq_wait_quiesce_done(struct request_queue *q) +void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set) { - if (q->tag_set->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(q->tag_set->srcu); + if (set->flags & BLK_MQ_F_BLOCKING) + synchronize_srcu(set->srcu); else synchronize_rcu(); } @@ -282,7 +284,7 @@ void blk_mq_quiesce_queue(struct request_queue *q) blk_mq_quiesce_queue_nowait(q); /* nothing to wait for non-mq queues */ if (queue_is_mq(q)) - blk_mq_wait_quiesce_done(q); + blk_mq_wait_quiesce_done(q->tag_set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); @@ -1623,7 +1625,7 @@ static void blk_mq_timeout_work(struct work_struct *work) * uses srcu or rcu, wait for a synchronization point to * ensure all running submits have finished */ - blk_mq_wait_quiesce_done(q); + blk_mq_wait_quiesce_done(q->tag_set); expired.next = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ed06fcb87f93..66b0b6e11002 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5107,7 +5107,7 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns) if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags)) blk_mq_quiesce_queue(ns->queue); else - blk_mq_wait_quiesce_done(ns->queue); + blk_mq_wait_quiesce_done(ns->queue->tag_set); } /* let I/O to all namespaces fail in preparation for surprise removal */ @@ -5197,7 +5197,7 @@ void nvme_stop_admin_queue(struct nvme_ctrl *ctrl) if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) blk_mq_quiesce_queue(ctrl->admin_q); else - blk_mq_wait_quiesce_done(ctrl->admin_q); + blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set); } EXPORT_SYMBOL_GPL(nvme_stop_admin_queue); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 8b89fab7c420..249757ddd8fe 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2735,7 +2735,7 @@ static void scsi_stop_queue(struct scsi_device *sdev, bool nowait) blk_mq_quiesce_queue(sdev->request_queue); } else { if (!nowait) - blk_mq_wait_quiesce_done(sdev->request_queue); + blk_mq_wait_quiesce_done(sdev->request_queue->tag_set); } } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f059edebb11d..061ea6e7af01 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -880,7 +880,7 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); -void blk_mq_wait_quiesce_done(struct request_queue *q); +void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); -- cgit v1.2.3 From 98d81f0df70ce6fc48517d938026e3c684b9051a Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Tue, 1 Nov 2022 16:00:50 +0100 Subject: nvme: use blk_mq_[un]quiesce_tagset All controller namespaces share the same tagset, so we can use this interface which does the optimal operation for parallel quiesce based on the tagset type(e.g. blocking tagsets and non-blocking tagsets). nvme connect_q should not be quiesced when quiesce tagset, so set the QUEUE_FLAG_SKIP_TAGSET_QUIESCE to skip it when init connect_q. Currently we use NVME_NS_STOPPED to ensure pairing quiescing and unquiescing. If use blk_mq_[un]quiesce_tagset, NVME_NS_STOPPED will be invalided, so introduce NVME_CTRL_STOPPED to replace NVME_NS_STOPPED. In addition, we never really quiesce a single namespace. It is a better choice to move the flag from ns to ctrl. Signed-off-by: Chao Leng [hch: rebased on top of prep patches] Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chao Leng Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-15-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 34 ++++++++-------------------------- drivers/nvme/host/nvme.h | 2 +- 2 files changed, 9 insertions(+), 27 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 66b0b6e11002..f94b05c585cb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4895,6 +4895,8 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, ret = PTR_ERR(ctrl->connect_q); goto out_free_tag_set; } + blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, + ctrl->connect_q); } ctrl->tagset = set; @@ -5096,20 +5098,6 @@ out: } EXPORT_SYMBOL_GPL(nvme_init_ctrl); -static void nvme_start_ns_queue(struct nvme_ns *ns) -{ - if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags)) - blk_mq_unquiesce_queue(ns->queue); -} - -static void nvme_stop_ns_queue(struct nvme_ns *ns) -{ - if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags)) - blk_mq_quiesce_queue(ns->queue); - else - blk_mq_wait_quiesce_done(ns->queue->tag_set); -} - /* let I/O to all namespaces fail in preparation for surprise removal */ void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl) { @@ -5172,23 +5160,17 @@ EXPORT_SYMBOL_GPL(nvme_start_freeze); void nvme_stop_queues(struct nvme_ctrl *ctrl) { - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - nvme_stop_ns_queue(ns); - up_read(&ctrl->namespaces_rwsem); + if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags)) + blk_mq_quiesce_tagset(ctrl->tagset); + else + blk_mq_wait_quiesce_done(ctrl->tagset); } EXPORT_SYMBOL_GPL(nvme_stop_queues); void nvme_start_queues(struct nvme_ctrl *ctrl) { - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - nvme_start_ns_queue(ns); - up_read(&ctrl->namespaces_rwsem); + if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags)) + blk_mq_unquiesce_tagset(ctrl->tagset); } EXPORT_SYMBOL_GPL(nvme_start_queues); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 1cd716800243..f9df10653f3c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -237,6 +237,7 @@ enum nvme_ctrl_flags { NVME_CTRL_FAILFAST_EXPIRED = 0, NVME_CTRL_ADMIN_Q_STOPPED = 1, NVME_CTRL_STARTED_ONCE = 2, + NVME_CTRL_STOPPED = 3, }; struct nvme_ctrl { @@ -486,7 +487,6 @@ struct nvme_ns { #define NVME_NS_ANA_PENDING 2 #define NVME_NS_FORCE_RO 3 #define NVME_NS_READY 4 -#define NVME_NS_STOPPED 5 struct cdev cdev; struct device cdev_device; -- cgit v1.2.3 From bbf5410bc69e131c82ad970ce7ee28b5906a6cc5 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 20 Oct 2022 17:35:40 +0200 Subject: nvmet: use try_cmpxchg in nvmet_update_sq_head Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in nvmet_update_sq_head. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails. There is no need to re-read the value in the loop. Note that the value from *ptr should be read using READ_ONCE to prevent the compiler from merging, refetching or reordering the read. No functional change intended. Signed-off-by: Uros Bizjak Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index aecb5853f8da..560b6bbb6c44 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -695,11 +695,10 @@ static void nvmet_update_sq_head(struct nvmet_req *req) if (req->sq->size) { u32 old_sqhd, new_sqhd; + old_sqhd = READ_ONCE(req->sq->sqhd); do { - old_sqhd = req->sq->sqhd; new_sqhd = (old_sqhd + 1) % req->sq->size; - } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) != - old_sqhd); + } while (!try_cmpxchg(&req->sq->sqhd, &old_sqhd, new_sqhd)); } req->cqe->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF); } -- cgit v1.2.3 From 2be2cd5287152a6284b45244b6e5c2f7e0a218bd Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 25 Oct 2022 17:50:08 +0200 Subject: nvmet: force reconnect when number of queue changes In order to test queue number changes we need to make sure that the host reconnects. Because only when the host disconnects from the target the number of queues are allowed to change according the spec. The initial idea was to disable and re-enable the ports and have the host wait until the KATO timer expires, triggering error recovery. Though the host would see a DNR reply when trying to reconnect. Because of the DNR bit the connection is dropped completely. There is no point in trying to reconnect with the same parameters according the spec. We can force to reconnect the host is by deleting all controllers. The host will observe any newly posted request to fail and thus starts the error recovery but this time without the DNR bit set. Signed-off-by: Daniel Wagner Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Acked-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 9443ee1d4ae3..051a420d818e 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1290,6 +1290,8 @@ static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item, static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item, const char *page, size_t cnt) { + struct nvmet_subsys *subsys = to_subsys(item); + struct nvmet_ctrl *ctrl; u16 qid_max; if (sscanf(page, "%hu\n", &qid_max) != 1) @@ -1299,8 +1301,13 @@ static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item, return -EINVAL; down_write(&nvmet_config_sem); - to_subsys(item)->max_qid = qid_max; + subsys->max_qid = qid_max; + + /* Force reconnect */ + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + ctrl->ops->delete_ctrl(ctrl); up_write(&nvmet_config_sem); + return cnt; } CONFIGFS_ATTR(nvmet_subsys_, attr_qid_max); -- cgit v1.2.3 From fa8f9ac42350edd3ce82d0d148a60f0fa088f995 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Nov 2022 14:01:24 +0100 Subject: nvmet: only allocate a single slab for bvecs There is no need to have a separate slab cache for each namespace, and having separate ones creates duplicate debugs file names as well. Fixes: d5eff33ee6f8 ("nvmet: add simple file backed ns support") Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen --- drivers/nvme/target/core.c | 22 ++++++++++++++-------- drivers/nvme/target/io-cmd-file.c | 16 +++------------- drivers/nvme/target/nvmet.h | 3 ++- 3 files changed, 19 insertions(+), 22 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 560b6bbb6c44..171493bcf0fb 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -15,6 +15,7 @@ #include "nvmet.h" +struct kmem_cache *nvmet_bvec_cache; struct workqueue_struct *buffered_io_wq; struct workqueue_struct *zbd_wq; static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; @@ -1630,26 +1631,28 @@ void nvmet_subsys_put(struct nvmet_subsys *subsys) static int __init nvmet_init(void) { - int error; + int error = -ENOMEM; nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1; + nvmet_bvec_cache = kmem_cache_create("nvmet-bvec", + NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!nvmet_bvec_cache) + return -ENOMEM; + zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0); if (!zbd_wq) - return -ENOMEM; + goto out_destroy_bvec_cache; buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq", WQ_MEM_RECLAIM, 0); - if (!buffered_io_wq) { - error = -ENOMEM; + if (!buffered_io_wq) goto out_free_zbd_work_queue; - } nvmet_wq = alloc_workqueue("nvmet-wq", WQ_MEM_RECLAIM, 0); - if (!nvmet_wq) { - error = -ENOMEM; + if (!nvmet_wq) goto out_free_buffered_work_queue; - } error = nvmet_init_discovery(); if (error) @@ -1668,6 +1671,8 @@ out_free_buffered_work_queue: destroy_workqueue(buffered_io_wq); out_free_zbd_work_queue: destroy_workqueue(zbd_wq); +out_destroy_bvec_cache: + kmem_cache_destroy(nvmet_bvec_cache); return error; } @@ -1679,6 +1684,7 @@ static void __exit nvmet_exit(void) destroy_workqueue(nvmet_wq); destroy_workqueue(buffered_io_wq); destroy_workqueue(zbd_wq); + kmem_cache_destroy(nvmet_bvec_cache); BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024); BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024); diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 64b47e2a4633..e55ec6fefd7f 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -11,7 +11,6 @@ #include #include "nvmet.h" -#define NVMET_MAX_MPOOL_BVEC 16 #define NVMET_MIN_MPOOL_OBJ 16 void nvmet_file_ns_revalidate(struct nvmet_ns *ns) @@ -26,8 +25,6 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns) flush_workqueue(buffered_io_wq); mempool_destroy(ns->bvec_pool); ns->bvec_pool = NULL; - kmem_cache_destroy(ns->bvec_cache); - ns->bvec_cache = NULL; fput(ns->file); ns->file = NULL; } @@ -59,16 +56,8 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns) ns->blksize_shift = min_t(u8, file_inode(ns->file)->i_blkbits, 12); - ns->bvec_cache = kmem_cache_create("nvmet-bvec", - NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!ns->bvec_cache) { - ret = -ENOMEM; - goto err; - } - ns->bvec_pool = mempool_create(NVMET_MIN_MPOOL_OBJ, mempool_alloc_slab, - mempool_free_slab, ns->bvec_cache); + mempool_free_slab, nvmet_bvec_cache); if (!ns->bvec_pool) { ret = -ENOMEM; @@ -77,9 +66,10 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns) return ret; err: + fput(ns->file); + ns->file = NULL; ns->size = 0; ns->blksize_shift = 0; - nvmet_file_ns_disable(ns); return ret; } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index dfe3894205aa..bda1c1f71f39 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -77,7 +77,6 @@ struct nvmet_ns { struct completion disable_done; mempool_t *bvec_pool; - struct kmem_cache *bvec_cache; int use_p2pmem; struct pci_dev *p2p_dev; @@ -393,6 +392,8 @@ struct nvmet_req { u64 error_slba; }; +#define NVMET_MAX_MPOOL_BVEC 16 +extern struct kmem_cache *nvmet_bvec_cache; extern struct workqueue_struct *buffered_io_wq; extern struct workqueue_struct *zbd_wq; extern struct workqueue_struct *nvmet_wq; -- cgit v1.2.3 From cf3d00840170ebf372bcacc5d5c27f5ed9c1b976 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 2 Oct 2022 11:59:45 +0200 Subject: nvme-fc: improve memory usage in nvme_fc_rcv_ls_req() sizeof( struct nvmefc_ls_rcv_op ) = 64 sizeof( union nvmefc_ls_requests ) = 1024 sizeof( union nvmefc_ls_responses ) = 128 So, in nvme_fc_rcv_ls_req(), 1216 bytes of memory are requested when kzalloc() is called. Because of the way memory allocations are performed, 2048 bytes are allocated. So about 800 bytes are wasted for each request. Switch to 3 distinct memory allocations, in order to: - save these 800 bytes - avoid zeroing this extra memory - make sure that memory is properly aligned in case of DMA access ("fc_dma_map_single(lsop->rspbuf)" just a few lines below) Signed-off-by: Christophe JAILLET Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 5d57a042dbca..2d3c54838496 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1475,6 +1475,8 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) fc_dma_unmap_single(lport->dev, lsop->rspdma, sizeof(*lsop->rspbuf), DMA_TO_DEVICE); + kfree(lsop->rspbuf); + kfree(lsop->rqstbuf); kfree(lsop); nvme_fc_rport_put(rport); @@ -1751,20 +1753,17 @@ nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr, goto out_put; } - lsop = kzalloc(sizeof(*lsop) + - sizeof(union nvmefc_ls_requests) + - sizeof(union nvmefc_ls_responses), - GFP_KERNEL); - if (!lsop) { + lsop = kzalloc(sizeof(*lsop), GFP_KERNEL); + lsop->rqstbuf = kzalloc(sizeof(*lsop->rqstbuf), GFP_KERNEL); + lsop->rspbuf = kzalloc(sizeof(*lsop->rspbuf), GFP_KERNEL); + if (!lsop || !lsop->rqstbuf || !lsop->rspbuf) { dev_info(lport->dev, "RCV %s LS failed: No memory\n", (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? nvmefc_ls_names[w0->ls_cmd] : ""); ret = -ENOMEM; - goto out_put; + goto out_free; } - lsop->rqstbuf = (union nvmefc_ls_requests *)&lsop[1]; - lsop->rspbuf = (union nvmefc_ls_responses *)&lsop->rqstbuf[1]; lsop->rspdma = fc_dma_map_single(lport->dev, lsop->rspbuf, sizeof(*lsop->rspbuf), @@ -1801,6 +1800,8 @@ out_unmap: fc_dma_unmap_single(lport->dev, lsop->rspdma, sizeof(*lsop->rspbuf), DMA_TO_DEVICE); out_free: + kfree(lsop->rspbuf); + kfree(lsop->rqstbuf); kfree(lsop); out_put: nvme_fc_rport_put(rport); -- cgit v1.2.3 From 855b7717f44b13e0990aa5ad36bbf9aa35051516 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Mon, 31 Oct 2022 21:53:50 +0530 Subject: nvme: fine-granular CAP_SYS_ADMIN for nvme io commands Currently both io and admin commands are kept under a coarse-granular CAP_SYS_ADMIN check, disregarding file mode completely. $ ls -l /dev/ng* crw-rw-rw- 1 root root 242, 0 Sep 9 19:20 /dev/ng0n1 crw------- 1 root root 242, 1 Sep 9 19:20 /dev/ng0n2 In the example above, ng0n1 appears as if it may allow unprivileged read/write operation but it does not and behaves same as ng0n2. This patch implements a shift from CAP_SYS_ADMIN to more fine-granular control for io-commands. If CAP_SYS_ADMIN is present, nothing else is checked as before. Otherwise, following rules are in place - any admin-cmd is not allowed - vendor-specific and fabric commmand are not allowed - io-commands that can write are allowed if matching FMODE_WRITE permission is present - io-commands that read are allowed Add a helper nvme_cmd_allowed that implements above policy. Change all the callers of CAP_SYS_ADMIN to go through nvme_cmd_allowed for any decision making. Since file open mode is counted for any approval/denial, change at various places to keep file-mode information handy. Signed-off-by: Kanchan Joshi Reviewed-by: Jens Axboe Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/ioctl.c | 102 +++++++++++++++++++++++++++++++--------------- include/linux/nvme.h | 1 + 2 files changed, 70 insertions(+), 33 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 81f5550b670d..1d68f161064a 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -8,6 +8,34 @@ #include #include "nvme.h" +static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, + fmode_t mode) +{ + if (capable(CAP_SYS_ADMIN)) + return true; + + /* + * Do not allow unprivileged processes to send vendor specific or fabrics + * commands as we can't be sure about their effects. + */ + if (c->common.opcode >= nvme_cmd_vendor_start || + c->common.opcode == nvme_fabrics_command) + return false; + + /* do not allow unprivileged admin commands */ + if (!ns) + return false; + + /* + * Only allow I/O commands that transfer data to the controller if the + * special file is open for writing, but always allow I/O commands that + * transfer data from the controller. + */ + if (nvme_is_write(c)) + return mode & FMODE_WRITE; + return true; +} + /* * Convert integer values from ioctl structures to user pointers, silently * ignoring the upper bits in the compat case to match behaviour of 32-bit @@ -261,7 +289,7 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, } static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) + struct nvme_passthru_cmd __user *ucmd, fmode_t mode) { struct nvme_passthru_cmd cmd; struct nvme_command c; @@ -269,8 +297,6 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u64 result; int status; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; if (cmd.flags) @@ -291,6 +317,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(cmd.cdw14); c.common.cdw15 = cpu_to_le32(cmd.cdw15); + if (!nvme_cmd_allowed(ns, &c, mode)) + return -EACCES; + if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); @@ -308,15 +337,14 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, } static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd, bool vec) + struct nvme_passthru_cmd64 __user *ucmd, bool vec, + fmode_t mode) { struct nvme_passthru_cmd64 cmd; struct nvme_command c; unsigned timeout = 0; int status; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; if (cmd.flags) @@ -337,6 +365,9 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(cmd.cdw14); c.common.cdw15 = cpu_to_le32(cmd.cdw15); + if (!nvme_cmd_allowed(ns, &c, mode)) + return -EACCES; + if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); @@ -483,9 +514,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, void *meta = NULL; int ret; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - c.common.opcode = READ_ONCE(cmd->opcode); c.common.flags = READ_ONCE(cmd->flags); if (c.common.flags) @@ -507,6 +535,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); + if (!nvme_cmd_allowed(ns, &c, ioucmd->file->f_mode)) + return -EACCES; + d.metadata = READ_ONCE(cmd->metadata); d.addr = READ_ONCE(cmd->addr); d.data_len = READ_ONCE(cmd->data_len); @@ -570,13 +601,13 @@ static bool is_ctrl_ioctl(unsigned int cmd) } static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, - void __user *argp) + void __user *argp, fmode_t mode) { switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); + return nvme_user_cmd(ctrl, NULL, argp, mode); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp, false); + return nvme_user_cmd64(ctrl, NULL, argp, false, mode); default: return sed_ioctl(ctrl->opal_dev, cmd, argp); } @@ -601,14 +632,14 @@ struct nvme_user_io32 { #endif /* COMPAT_FOR_U64_ALIGNMENT */ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp) + void __user *argp, fmode_t mode) { switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); return ns->head->ns_id; case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->ctrl, ns, argp); + return nvme_user_cmd(ns->ctrl, ns, argp, mode); /* * struct nvme_user_io can have different padding on some 32-bit ABIs. * Just accept the compat version as all fields that are used are the @@ -620,19 +651,20 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, argp); case NVME_IOCTL_IO64_CMD: - return nvme_user_cmd64(ns->ctrl, ns, argp, false); + return nvme_user_cmd64(ns->ctrl, ns, argp, false, mode); case NVME_IOCTL_IO64_CMD_VEC: - return nvme_user_cmd64(ns->ctrl, ns, argp, true); + return nvme_user_cmd64(ns->ctrl, ns, argp, true, mode); default: return -ENOTTY; } } -static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg) +static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg, + fmode_t mode) { - if (is_ctrl_ioctl(cmd)) - return nvme_ctrl_ioctl(ns->ctrl, cmd, arg); - return nvme_ns_ioctl(ns, cmd, arg); + if (is_ctrl_ioctl(cmd)) + return nvme_ctrl_ioctl(ns->ctrl, cmd, arg, mode); + return nvme_ns_ioctl(ns, cmd, arg, mode); } int nvme_ioctl(struct block_device *bdev, fmode_t mode, @@ -640,7 +672,7 @@ int nvme_ioctl(struct block_device *bdev, fmode_t mode, { struct nvme_ns *ns = bdev->bd_disk->private_data; - return __nvme_ioctl(ns, cmd, (void __user *)arg); + return __nvme_ioctl(ns, cmd, (void __user *)arg, mode); } long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -648,7 +680,7 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct nvme_ns *ns = container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); - return __nvme_ioctl(ns, cmd, (void __user *)arg); + return __nvme_ioctl(ns, cmd, (void __user *)arg, file->f_mode); } static int nvme_uring_cmd_checks(unsigned int issue_flags) @@ -716,7 +748,8 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, } #ifdef CONFIG_NVME_MULTIPATH static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp, struct nvme_ns_head *head, int srcu_idx) + void __user *argp, struct nvme_ns_head *head, int srcu_idx, + fmode_t mode) __releases(&head->srcu) { struct nvme_ctrl *ctrl = ns->ctrl; @@ -724,7 +757,7 @@ static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, nvme_get_ctrl(ns->ctrl); srcu_read_unlock(&head->srcu, srcu_idx); - ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp); + ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, mode); nvme_put_ctrl(ctrl); return ret; @@ -749,9 +782,10 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, * deadlock when deleting namespaces using the passthrough interface. */ if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, + mode); - ret = nvme_ns_ioctl(ns, cmd, argp); + ret = nvme_ns_ioctl(ns, cmd, argp, mode); out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; @@ -773,9 +807,10 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, goto out_unlock; if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, + file->f_mode); - ret = nvme_ns_ioctl(ns, cmd, argp); + ret = nvme_ns_ioctl(ns, cmd, argp, file->f_mode); out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; @@ -849,7 +884,8 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) return ret; } -static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, + fmode_t mode) { struct nvme_ns *ns; int ret; @@ -873,7 +909,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) kref_get(&ns->kref); up_read(&ctrl->namespaces_rwsem); - ret = nvme_user_cmd(ctrl, ns, argp); + ret = nvme_user_cmd(ctrl, ns, argp, mode); nvme_put_ns(ns); return ret; @@ -890,11 +926,11 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); + return nvme_user_cmd(ctrl, NULL, argp, file->f_mode); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp, false); + return nvme_user_cmd64(ctrl, NULL, argp, false, file->f_mode); case NVME_IOCTL_IO_CMD: - return nvme_dev_user_cmd(ctrl, argp); + return nvme_dev_user_cmd(ctrl, argp, file->f_mode); case NVME_IOCTL_RESET: if (!capable(CAP_SYS_ADMIN)) return -EACCES; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 050d7d0cd81b..1d102b662e88 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -797,6 +797,7 @@ enum nvme_opcode { nvme_cmd_zone_mgmt_send = 0x79, nvme_cmd_zone_mgmt_recv = 0x7a, nvme_cmd_zone_append = 0x7d, + nvme_cmd_vendor_start = 0x80, }; #define nvme_opcode_name(opcode) { opcode, #opcode } -- cgit v1.2.3 From e4fbcf32c860f98103ca7f1dc8c0dc69e2219ec6 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 1 Nov 2022 10:43:07 +0100 Subject: nvme: identify-namespace without CAP_SYS_ADMIN Allow all identify-namespace variants (CNS 00h, 05h and 08h) without requiring CAP_SYS_ADMIN. The information (retrieved using id-ns) is needed to form IO commands for passthrough interface. Signed-off-by: Kanchan Joshi Reviewed-by: Jens Axboe Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/ioctl.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 1d68f161064a..9550a69029b3 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -22,9 +22,23 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, c->common.opcode == nvme_fabrics_command) return false; - /* do not allow unprivileged admin commands */ - if (!ns) + /* + * Do not allow unprivileged passthrough of admin commands except + * for a subset of identify commands that contain information required + * to form proper I/O commands in userspace and do not expose any + * potentially sensitive information. + */ + if (!ns) { + if (c->common.opcode == nvme_admin_identify) { + switch (c->identify.cns) { + case NVME_ID_CNS_NS: + case NVME_ID_CNS_CS_NS: + case NVME_ID_CNS_NS_CS_INDEP: + return true; + } + } return false; + } /* * Only allow I/O commands that transfer data to the controller if the -- cgit v1.2.3 From 1b96f862ecccb3e6f950eba584bebf22955cecc5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 30 Oct 2022 16:50:15 +0100 Subject: nvme: implement the DEAC bit for the Write Zeroes command While the specification allows devices to either deallocate data or to actually write zeroes on any Write Zeroes command, many SSDs only do the sensible thing and deallocate data when the DEAC bit is specific. Set it when it is supported and the caller doesn't explicitly opt out of deallocation. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen --- drivers/nvme/host/core.c | 13 ++++++++++++- drivers/nvme/host/nvme.h | 1 + include/linux/nvme.h | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f94b05c585cb..1a87a072fbed 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -850,8 +850,11 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, cmnd->write_zeroes.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC)) + cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC); + if (nvme_ns_has_pi(ns)) { - cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT); + cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT); switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE1: @@ -2003,6 +2006,14 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, } } + /* + * Only set the DEAC bit if the device guarantees that reads from + * deallocated data return zeroes. While the DEAC bit does not + * require that, it must be a no-op if reads from deallocated data + * do not return zeroes. + */ + if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) + ns->features |= NVME_NS_DEAC; set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); set_bit(NVME_NS_READY, &ns->flags); blk_mq_unfreeze_queue(ns->disk->queue); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f9df10653f3c..16b34a491495 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -455,6 +455,7 @@ static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head) enum nvme_ns_features { NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */ NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */ + NVME_NS_DEAC, /* DEAC bit in Write Zeores supported */ }; struct nvme_ns { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1d102b662e88..d6be2a686100 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -964,6 +964,7 @@ enum { NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, NVME_RW_PRINFO_PRACT = 1 << 13, NVME_RW_DTYPE_STREAMS = 1 << 4, + NVME_WZ_DEAC = 1 << 9, }; struct nvme_dsm_cmd { -- cgit v1.2.3 From 1e37a307f1481058da852accb37e0e1a3e137e9e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 15:46:45 +0100 Subject: nvme: don't call nvme_init_ctrl_finish from nvme_passthru_end nvme_passthrough_end can race with a reset, which can lead to racing stores to the cels xarray as well as further shengians with upcoming more complicated initialization. So drop the call and just log that the controller capabilities might have changed and a reset could be required to use the new controller capabilities. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1a87a072fbed..ce8314aee1dd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1123,8 +1123,10 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, mutex_unlock(&ctrl->subsys->lock); mutex_unlock(&ctrl->scan_lock); } - if (effects & NVME_CMD_EFFECTS_CCC) - nvme_init_ctrl_finish(ctrl); + if (effects & NVME_CMD_EFFECTS_CCC) { + dev_info(ctrl->device, +"controller capabilities changed, reset may be required to take effect.\n"); + } if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { nvme_queue_scan(ctrl); flush_work(&ctrl->scan_work); -- cgit v1.2.3 From 94cc781f69f49f665383dd87aef973b7896153d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 15:48:27 +0100 Subject: nvme: move OPAL setup from PCIe to core Nothing about the TCG Opal support is PCIe transport specific, so move it to the core code. For this nvme_init_ctrl_finish grows a new was_suspended argument that allows the transport driver to tell the OPAL code if the controller came out of a suspend cycle. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: James Smart Tested-by Gerd Bayer --- drivers/nvme/host/apple.c | 2 +- drivers/nvme/host/core.c | 25 ++++++++++++++++++++++--- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/nvme.h | 5 +---- drivers/nvme/host/pci.c | 14 +------------- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/host/tcp.c | 2 +- drivers/nvme/target/loop.c | 2 +- 8 files changed, 29 insertions(+), 25 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 24e224c279a4..a85349a7e938 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1102,7 +1102,7 @@ static void apple_nvme_reset_work(struct work_struct *work) goto out; } - ret = nvme_init_ctrl_finish(&anv->ctrl); + ret = nvme_init_ctrl_finish(&anv->ctrl, false); if (ret) goto out; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ce8314aee1dd..aedacf2fba69 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2192,7 +2192,7 @@ const struct pr_ops nvme_pr_ops = { }; #ifdef CONFIG_BLK_SED_OPAL -int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, +static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send) { struct nvme_ctrl *ctrl = data; @@ -2209,7 +2209,23 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, NVME_QID_ANY, 1, 0); } -EXPORT_SYMBOL_GPL(nvme_sec_submit); + +static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended) +{ + if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) { + if (!ctrl->opal_dev) + ctrl->opal_dev = init_opal_dev(ctrl, &nvme_sec_submit); + else if (was_suspended) + opal_unlock_from_suspend(ctrl->opal_dev); + } else { + free_opal_dev(ctrl->opal_dev); + ctrl->opal_dev = NULL; + } +} +#else +static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended) +{ +} #endif /* CONFIG_BLK_SED_OPAL */ #ifdef CONFIG_BLK_DEV_ZONED @@ -3242,7 +3258,7 @@ out_free: * register in our nvme_ctrl structure. This should be called as soon as * the admin queue is fully up and running. */ -int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended) { int ret; @@ -3273,6 +3289,8 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) if (ret < 0) return ret; + nvme_configure_opal(ctrl, was_suspended); + if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) { /* * Do not return errors unless we are in a controller reset, @@ -5007,6 +5025,7 @@ static void nvme_free_ctrl(struct device *dev) nvme_auth_stop(ctrl); nvme_auth_free(ctrl); __free_page(ctrl->discard_page); + free_opal_dev(ctrl->opal_dev); if (subsys) { mutex_lock(&nvme_subsystems_lock); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 2d3c54838496..1f9f4075794b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3107,7 +3107,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) nvme_start_admin_queue(&ctrl->ctrl); - ret = nvme_init_ctrl_finish(&ctrl->ctrl); + ret = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) goto out_disconnect_admin_queue; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 16b34a491495..306a120d49ab 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -736,7 +736,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); void nvme_start_ctrl(struct nvme_ctrl *ctrl); void nvme_stop_ctrl(struct nvme_ctrl *ctrl); -int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl); +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended); int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int flags, unsigned int cmd_size); @@ -748,9 +748,6 @@ void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); -int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, - bool send); - void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, volatile union nvme_result *res); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 208c387f1558..e4f084e12b96 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2772,7 +2772,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) nvme_free_tagset(dev); if (dev->ctrl.admin_q) blk_put_queue(dev->ctrl.admin_q); - free_opal_dev(dev->ctrl.opal_dev); mempool_destroy(dev->iod_mempool); put_device(dev->dev); kfree(dev->queues); @@ -2866,21 +2865,10 @@ static void nvme_reset_work(struct work_struct *work) */ dev->ctrl.max_integrity_segments = 1; - result = nvme_init_ctrl_finish(&dev->ctrl); + result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend); if (result) goto out; - if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) { - if (!dev->ctrl.opal_dev) - dev->ctrl.opal_dev = - init_opal_dev(&dev->ctrl, &nvme_sec_submit); - else if (was_suspend) - opal_unlock_from_suspend(dev->ctrl.opal_dev); - } else { - free_opal_dev(dev->ctrl.opal_dev); - dev->ctrl.opal_dev = NULL; - } - if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) { result = nvme_dbbuf_dma_alloc(dev); if (result) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 6e079abb22ee..ccd45e5b3298 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -871,7 +871,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, nvme_start_admin_queue(&ctrl->ctrl); - error = nvme_init_ctrl_finish(&ctrl->ctrl); + error = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (error) goto out_quiesce_queue; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 1eed0fc26b3a..4f8584657bb7 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1949,7 +1949,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) nvme_start_admin_queue(ctrl); - error = nvme_init_ctrl_finish(ctrl); + error = nvme_init_ctrl_finish(ctrl, false); if (error) goto out_quiesce_queue; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index b45fe3adf015..893c50f365c4 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -377,7 +377,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) nvme_start_admin_queue(&ctrl->ctrl); - error = nvme_init_ctrl_finish(&ctrl->ctrl); + error = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (error) goto out_cleanup_tagset; -- cgit v1.2.3 From 86adbf0cdb9ec6533234696c3e243184d4d0d040 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Oct 2022 02:34:13 -0700 Subject: nvme: simplify transport specific device attribute handling Allow the transport driver to override the attribute groups for the control device, so that the PCIe driver doesn't manually have to add a group after device creation and keep track of it. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/core.c | 8 ++++++-- drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/pci.c | 23 ++++++++--------------- 3 files changed, 16 insertions(+), 17 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index aedacf2fba69..6040a13d3e2d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3906,10 +3906,11 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return a->mode; } -static const struct attribute_group nvme_dev_attrs_group = { +const struct attribute_group nvme_dev_attrs_group = { .attrs = nvme_dev_attrs, .is_visible = nvme_dev_attrs_are_visible, }; +EXPORT_SYMBOL_GPL(nvme_dev_attrs_group); static const struct attribute_group *nvme_dev_attr_groups[] = { &nvme_dev_attrs_group, @@ -5091,7 +5092,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->instance); ctrl->device->class = nvme_class; ctrl->device->parent = ctrl->dev; - ctrl->device->groups = nvme_dev_attr_groups; + if (ops->dev_attr_groups) + ctrl->device->groups = ops->dev_attr_groups; + else + ctrl->device->groups = nvme_dev_attr_groups; ctrl->device->release = nvme_free_ctrl; dev_set_drvdata(ctrl->device, ctrl); ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 306a120d49ab..924ff80d85f6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -508,6 +508,7 @@ struct nvme_ctrl_ops { unsigned int flags; #define NVME_F_FABRICS (1 << 0) #define NVME_F_METADATA_SUPPORTED (1 << 1) + const struct attribute_group **dev_attr_groups; int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); @@ -854,6 +855,7 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct pr_ops nvme_pr_ops; extern const struct block_device_operations nvme_ns_head_ops; +extern const struct attribute_group nvme_dev_attrs_group; struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); #ifdef CONFIG_NVME_MULTIPATH diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e4f084e12b96..c8f6ce5eee1c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -158,8 +158,6 @@ struct nvme_dev { unsigned int nr_allocated_queues; unsigned int nr_write_queues; unsigned int nr_poll_queues; - - bool attrs_added; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) @@ -2234,11 +2232,17 @@ static struct attribute *nvme_pci_attrs[] = { NULL, }; -static const struct attribute_group nvme_pci_attr_group = { +static const struct attribute_group nvme_pci_dev_attrs_group = { .attrs = nvme_pci_attrs, .is_visible = nvme_pci_attrs_are_visible, }; +static const struct attribute_group *nvme_pci_dev_attr_groups[] = { + &nvme_dev_attrs_group, + &nvme_pci_dev_attrs_group, + NULL, +}; + /* * nirqs is the number of interrupts available for write and read * queues. The core already reserved an interrupt for the admin queue. @@ -2930,10 +2934,6 @@ static void nvme_reset_work(struct work_struct *work) goto out; } - if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj, - &nvme_pci_attr_group)) - dev->attrs_added = true; - nvme_start_ctrl(&dev->ctrl); return; @@ -3006,6 +3006,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .name = "pcie", .module = THIS_MODULE, .flags = NVME_F_METADATA_SUPPORTED, + .dev_attr_groups = nvme_pci_dev_attr_groups, .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, @@ -3204,13 +3205,6 @@ static void nvme_shutdown(struct pci_dev *pdev) nvme_disable_prepare_reset(dev, true); } -static void nvme_remove_attrs(struct nvme_dev *dev) -{ - if (dev->attrs_added) - sysfs_remove_group(&dev->ctrl.device->kobj, - &nvme_pci_attr_group); -} - /* * The driver's remove may be called on a device in a partially initialized * state. This function must not have any dependencies on the device state in @@ -3232,7 +3226,6 @@ static void nvme_remove(struct pci_dev *pdev) nvme_stop_ctrl(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); nvme_dev_disable(dev, true); - nvme_remove_attrs(dev); nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); -- cgit v1.2.3 From 96ef1be53663a9343dffcf106e2f1b59da4b8799 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 09:10:21 +0100 Subject: nvme-pci: put the admin queue in nvme_dev_remove_admin Once the controller is shutdown no one can access the admin queue. Tear it down in nvme_dev_remove_admin, which matches the flow in the other drivers. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c8f6ce5eee1c..f526ad578088 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1747,6 +1747,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) */ nvme_start_admin_queue(&dev->ctrl); blk_mq_destroy_queue(dev->ctrl.admin_q); + blk_put_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } } @@ -2774,8 +2775,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) nvme_dbbuf_dma_free(dev); nvme_free_tagset(dev); - if (dev->ctrl.admin_q) - blk_put_queue(dev->ctrl.admin_q); mempool_destroy(dev->iod_mempool); put_device(dev->dev); kfree(dev->queues); -- cgit v1.2.3 From c11b7716d6c96e82d2b404c4520237d968357a0d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 09:11:13 +0100 Subject: nvme-pci: move more teardown work to nvme_remove nvme_dbbuf_dma_free frees dma coherent memory, so it must not be called after ->remove has returned. Fortunately there is no way to use it after shutdown as no more I/O is possible so it can be moved. Similarly the iod_mempool can't be used for a device kept alive after shutdown, so move it next to freeing the PRP pools. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f526ad578088..b638f43f2df2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2773,9 +2773,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); - nvme_dbbuf_dma_free(dev); nvme_free_tagset(dev); - mempool_destroy(dev->iod_mempool); put_device(dev->dev); kfree(dev->queues); kfree(dev); @@ -3227,7 +3225,9 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_disable(dev, true); nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); + nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); + mempool_destroy(dev->iod_mempool); nvme_release_prp_pools(dev); nvme_dev_unmap(dev); nvme_uninit_ctrl(&dev->ctrl); -- cgit v1.2.3 From 081a7d958ce4b65f9aab6e70e65b0b2e0b92297c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 09:41:41 +0100 Subject: nvme-pci: factor the iod mempool creation into a helper Add a helper to create the iod mempool. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/pci.c | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b638f43f2df2..f7dab65bf504 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -390,14 +390,6 @@ static int nvme_pci_npages_sgl(void) PAGE_SIZE); } -static size_t nvme_pci_iod_alloc_size(void) -{ - size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); - - return sizeof(__le64 *) * npages + - sizeof(struct scatterlist) * NVME_MAX_SEGS; -} - static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -2762,6 +2754,22 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) dma_pool_destroy(dev->prp_small_pool); } +static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) +{ + size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); + size_t alloc_size = sizeof(__le64 *) * npages + + sizeof(struct scatterlist) * NVME_MAX_SEGS; + + WARN_ON_ONCE(alloc_size > PAGE_SIZE); + dev->iod_mempool = mempool_create_node(1, + mempool_kmalloc, mempool_kfree, + (void *)alloc_size, GFP_KERNEL, + dev_to_node(dev->dev)); + if (!dev->iod_mempool) + return -ENOMEM; + return 0; +} + static void nvme_free_tagset(struct nvme_dev *dev) { if (dev->tagset.tags) @@ -3087,7 +3095,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) int node, result = -ENOMEM; struct nvme_dev *dev; unsigned long quirks = id->driver_data; - size_t alloc_size; node = dev_to_node(&pdev->dev); if (node == NUMA_NO_NODE) @@ -3132,21 +3139,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) quirks |= NVME_QUIRK_SIMPLE_SUSPEND; } - /* - * Double check that our mempool alloc size will cover the biggest - * command we support. - */ - alloc_size = nvme_pci_iod_alloc_size(); - WARN_ON_ONCE(alloc_size > PAGE_SIZE); - - dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, - mempool_kfree, - (void *) alloc_size, - GFP_KERNEL, node); - if (!dev->iod_mempool) { - result = -ENOMEM; + result = nvme_pci_alloc_iod_mempool(dev); + if (result) goto release_pools; - } result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, quirks); -- cgit v1.2.3 From 2e87570be9d2746e7c4e7ab1cc18fd3ca7de2768 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 09:44:00 +0100 Subject: nvme-pci: factor out a nvme_pci_alloc_dev helper Add a helper that allocates the nvme_dev structure up to the point where we can call nvme_init_ctrl. This pairs with the free_ctrl method and can thus be used to cleanup the teardown path and make it more symmetric. Note that this now calls nvme_init_ctrl a lot earlier during probing, which also means the per-controller character device shows up earlier. Due to the controller state no commnds can be send on it, but it might make sense to delay the cdev registration until nvme_init_ctrl_finish. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/pci.c | 81 ++++++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 35 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f7dab65bf504..03c83cd724ec 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2777,6 +2777,7 @@ static void nvme_free_tagset(struct nvme_dev *dev) dev->ctrl.tagset = NULL; } +/* pairs with nvme_pci_alloc_dev */ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); @@ -3090,19 +3091,23 @@ static void nvme_async_probe(void *data, async_cookie_t cookie) nvme_put_ctrl(&dev->ctrl); } -static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) +static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, + const struct pci_device_id *id) { - int node, result = -ENOMEM; - struct nvme_dev *dev; unsigned long quirks = id->driver_data; + int node = dev_to_node(&pdev->dev); + struct nvme_dev *dev; + int ret = -ENOMEM; - node = dev_to_node(&pdev->dev); if (node == NUMA_NO_NODE) set_dev_node(&pdev->dev, first_memory_node); dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) - return -ENOMEM; + return NULL; + INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); + INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); + mutex_init(&dev->shutdown_lock); dev->nr_write_queues = write_queues; dev->nr_poll_queues = poll_queues; @@ -3110,25 +3115,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev->queues = kcalloc_node(dev->nr_allocated_queues, sizeof(struct nvme_queue), GFP_KERNEL, node); if (!dev->queues) - goto free; + goto out_free_dev; dev->dev = get_device(&pdev->dev); - pci_set_drvdata(pdev, dev); - - result = nvme_dev_map(dev); - if (result) - goto put_pci; - - INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); - INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); - mutex_init(&dev->shutdown_lock); - - result = nvme_setup_prp_pools(dev); - if (result) - goto unmap; quirks |= check_vendor_combination_bug(pdev); - if (!noacpi && acpi_storage_d3(&pdev->dev)) { /* * Some systems use a bios work around to ask for D3 on @@ -3138,34 +3129,54 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) "platform quirk: setting simple suspend\n"); quirks |= NVME_QUIRK_SIMPLE_SUSPEND; } + ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, + quirks); + if (ret) + goto out_put_device; + return dev; - result = nvme_pci_alloc_iod_mempool(dev); +out_put_device: + put_device(dev->dev); + kfree(dev->queues); +out_free_dev: + kfree(dev); + return ERR_PTR(ret); +} + +static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct nvme_dev *dev; + int result = -ENOMEM; + + dev = nvme_pci_alloc_dev(pdev, id); + if (!dev) + return -ENOMEM; + + result = nvme_dev_map(dev); if (result) - goto release_pools; + goto out_uninit_ctrl; - result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, - quirks); + result = nvme_setup_prp_pools(dev); + if (result) + goto out_dev_unmap; + + result = nvme_pci_alloc_iod_mempool(dev); if (result) - goto release_mempool; + goto out_release_prp_pools; dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); + pci_set_drvdata(pdev, dev); nvme_reset_ctrl(&dev->ctrl); async_schedule(nvme_async_probe, dev); - return 0; - release_mempool: - mempool_destroy(dev->iod_mempool); - release_pools: +out_release_prp_pools: nvme_release_prp_pools(dev); - unmap: +out_dev_unmap: nvme_dev_unmap(dev); - put_pci: - put_device(dev->dev); - free: - kfree(dev->queues); - kfree(dev); +out_uninit_ctrl: + nvme_uninit_ctrl(&dev->ctrl); return result; } -- cgit v1.2.3 From 3f30a79c2e2c7d5ad14dfc372adb248fc239c6c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 09:48:43 +0100 Subject: nvme-pci: set constant paramters in nvme_pci_alloc_ctrl Move setting of low-level constant parameters from nvme_reset_work to nvme_pci_alloc_ctrl. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/pci.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 03c83cd724ec..9dcb35f14800 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2841,21 +2841,6 @@ static void nvme_reset_work(struct work_struct *work) nvme_start_admin_queue(&dev->ctrl); } - dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); - - /* - * Limit the max command size to prevent iod->sg allocations going - * over a single page. - */ - dev->ctrl.max_hw_sectors = min_t(u32, - NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9); - dev->ctrl.max_segments = NVME_MAX_SEGS; - - /* - * Don't limit the IOMMU merged segment size. - */ - dma_set_max_seg_size(dev->dev, 0xffffffff); - mutex_unlock(&dev->shutdown_lock); /* @@ -2869,12 +2854,6 @@ static void nvme_reset_work(struct work_struct *work) goto out; } - /* - * We do not support an SGL for metadata (yet), so we are limited to a - * single integrity segment for the separate metadata pointer. - */ - dev->ctrl.max_integrity_segments = 1; - result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend); if (result) goto out; @@ -3133,6 +3112,23 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, quirks); if (ret) goto out_put_device; + + dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1); + dma_set_max_seg_size(&pdev->dev, 0xffffffff); + + /* + * Limit the max command size to prevent iod->sg allocations going + * over a single page. + */ + dev->ctrl.max_hw_sectors = min_t(u32, + NVME_MAX_KB_SZ << 1, dma_max_mapping_size(&pdev->dev) >> 9); + dev->ctrl.max_segments = NVME_MAX_SEGS; + + /* + * There is no support for SGLs for metadata (yet), so we are limited to + * a single integrity segment for the separate metadata pointer. + */ + dev->ctrl.max_integrity_segments = 1; return dev; out_put_device: -- cgit v1.2.3 From a6ee7f19ebfd158ffb3a6ebacf20ae43549bed05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Oct 2022 03:28:16 -0700 Subject: nvme-pci: call nvme_pci_configure_admin_queue from nvme_pci_enable nvme_pci_configure_admin_queue is called right after nvme_pci_enable, and it's work is undone by nvme_dev_disable. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/pci.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9dcb35f14800..c2e3a87237da 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2639,7 +2639,8 @@ static int nvme_pci_enable(struct nvme_dev *dev) pci_enable_pcie_error_reporting(pdev); pci_save_state(pdev); - return 0; + + return nvme_pci_configure_admin_queue(dev); disable: pci_disable_device(pdev); @@ -2829,10 +2830,6 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out_unlock; - result = nvme_pci_configure_admin_queue(dev); - if (result) - goto out_unlock; - if (!dev->ctrl.admin_q) { result = nvme_pci_alloc_admin_tag_set(dev); if (result) -- cgit v1.2.3 From 65a54646420e1409760c2b9f0e1a5e5feca1364e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 10:56:49 +0100 Subject: nvme-pci: simplify nvme_dbbuf_dma_alloc Move the OACS check and the error checking into nvme_dbbuf_dma_alloc so that an upcoming second caller doesn't have to duplicate this boilerplate code. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/pci.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c2e3a87237da..4da339690ec6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -239,10 +239,13 @@ static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) return dev->nr_allocated_queues * 8 * dev->db_stride; } -static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) +static void nvme_dbbuf_dma_alloc(struct nvme_dev *dev) { unsigned int mem_size = nvme_dbbuf_size(dev); + if (!(dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP)) + return; + if (dev->dbbuf_dbs) { /* * Clear the dbbuf memory so the driver doesn't observe stale @@ -250,25 +253,27 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) */ memset(dev->dbbuf_dbs, 0, mem_size); memset(dev->dbbuf_eis, 0, mem_size); - return 0; + return; } dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size, &dev->dbbuf_dbs_dma_addr, GFP_KERNEL); if (!dev->dbbuf_dbs) - return -ENOMEM; + goto fail; dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size, &dev->dbbuf_eis_dma_addr, GFP_KERNEL); - if (!dev->dbbuf_eis) { - dma_free_coherent(dev->dev, mem_size, - dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr); - dev->dbbuf_dbs = NULL; - return -ENOMEM; - } + if (!dev->dbbuf_eis) + goto fail_free_dbbuf_dbs; + return; - return 0; +fail_free_dbbuf_dbs: + dma_free_coherent(dev->dev, mem_size, dev->dbbuf_dbs, + dev->dbbuf_dbs_dma_addr); + dev->dbbuf_dbs = NULL; +fail: + dev_warn(dev->dev, "unable to allocate dma for dbbuf\n"); } static void nvme_dbbuf_dma_free(struct nvme_dev *dev) @@ -2855,12 +2860,7 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) { - result = nvme_dbbuf_dma_alloc(dev); - if (result) - dev_warn(dev->dev, - "unable to allocate dma for dbbuf\n"); - } + nvme_dbbuf_dma_alloc(dev); if (dev->ctrl.hmpre) { result = nvme_setup_host_mem(dev); -- cgit v1.2.3 From acb71e53bb47a08731aa77497cd3f1871cba59c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 13 Nov 2022 12:05:58 +0100 Subject: nvme-pci: move the HMPRE check into nvme_setup_host_mem Check that a HMB is wanted into the allocation helper instead of the caller. This makes life simpler for an upcoming second caller. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/pci.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4da339690ec6..57fb88396fd8 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2102,6 +2102,9 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) u32 enable_bits = NVME_HOST_MEM_ENABLE; int ret; + if (!dev->ctrl.hmpre) + return 0; + preferred = min(preferred, max); if (min > max) { dev_warn(dev->ctrl.device, @@ -2862,11 +2865,9 @@ static void nvme_reset_work(struct work_struct *work) nvme_dbbuf_dma_alloc(dev); - if (dev->ctrl.hmpre) { - result = nvme_setup_host_mem(dev); - if (result < 0) - goto out; - } + result = nvme_setup_host_mem(dev); + if (result < 0) + goto out; result = nvme_setup_io_queues(dev); if (result) -- cgit v1.2.3 From eac3ef262941f62fe01bc357911fea4847183333 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 13 Nov 2022 12:07:04 +0100 Subject: nvme-pci: split the initial probe from the rest path nvme_reset_work is a little fragile as it needs to handle both resetting a live controller and initializing one during probe. Split out the initial probe and open code it in nvme_probe and leave nvme_reset_work to just do the live controller reset. This fixes a recently introduced bug where nvme_dev_disable causes a NULL pointer dereferences in blk_mq_quiesce_tagset because the tagset pointer is not set when the reset state is entered directly from the new state. The separate probe code can skip the reset state and probe directly and fixes this. To make sure the system isn't single threaded on enabling nvme controllers, set the PROBE_PREFER_ASYNCHRONOUS flag in the device_driver structure so that the driver core probes in parallel. Fixes: 98d81f0df70c ("nvme: use blk_mq_[un]quiesce_tagset") Reported-by: Gerd Bayer Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/pci.c | 133 +++++++++++++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 53 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 57fb88396fd8..2521c00a0af6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2837,15 +2837,7 @@ static void nvme_reset_work(struct work_struct *work) result = nvme_pci_enable(dev); if (result) goto out_unlock; - - if (!dev->ctrl.admin_q) { - result = nvme_pci_alloc_admin_tag_set(dev); - if (result) - goto out_unlock; - } else { - nvme_start_admin_queue(&dev->ctrl); - } - + nvme_start_admin_queue(&dev->ctrl); mutex_unlock(&dev->shutdown_lock); /* @@ -2873,37 +2865,23 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - if (dev->ctrl.tagset) { - /* - * This is a controller reset and we already have a tagset. - * Freeze and update the number of I/O queues as thos might have - * changed. If there are no I/O queues left after this reset, - * keep the controller around but remove all namespaces. - */ - if (dev->online_queues > 1) { - nvme_start_queues(&dev->ctrl); - nvme_wait_freeze(&dev->ctrl); - nvme_pci_update_nr_queues(dev); - nvme_dbbuf_set(dev); - nvme_unfreeze(&dev->ctrl); - } else { - dev_warn(dev->ctrl.device, "IO queues lost\n"); - nvme_mark_namespaces_dead(&dev->ctrl); - nvme_start_queues(&dev->ctrl); - nvme_remove_namespaces(&dev->ctrl); - nvme_free_tagset(dev); - } + /* + * Freeze and update the number of I/O queues as thos might have + * changed. If there are no I/O queues left after this reset, keep the + * controller around but remove all namespaces. + */ + if (dev->online_queues > 1) { + nvme_start_queues(&dev->ctrl); + nvme_wait_freeze(&dev->ctrl); + nvme_pci_update_nr_queues(dev); + nvme_dbbuf_set(dev); + nvme_unfreeze(&dev->ctrl); } else { - /* - * First probe. Still allow the controller to show up even if - * there are no namespaces. - */ - if (dev->online_queues > 1) { - nvme_pci_alloc_tag_set(dev); - nvme_dbbuf_set(dev); - } else { - dev_warn(dev->ctrl.device, "IO queues not created\n"); - } + dev_warn(dev->ctrl.device, "IO queues lost\n"); + nvme_mark_namespaces_dead(&dev->ctrl); + nvme_start_queues(&dev->ctrl); + nvme_remove_namespaces(&dev->ctrl); + nvme_free_tagset(dev); } /* @@ -3059,15 +3037,6 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return 0; } -static void nvme_async_probe(void *data, async_cookie_t cookie) -{ - struct nvme_dev *dev = data; - - flush_work(&dev->ctrl.reset_work); - flush_work(&dev->ctrl.scan_work); - nvme_put_ctrl(&dev->ctrl); -} - static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -3159,12 +3128,69 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto out_release_prp_pools; dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); + + result = nvme_pci_enable(dev); + if (result) + goto out_release_iod_mempool; + + result = nvme_pci_alloc_admin_tag_set(dev); + if (result) + goto out_disable; + + /* + * Mark the controller as connecting before sending admin commands to + * allow the timeout handler to do the right thing. + */ + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { + dev_warn(dev->ctrl.device, + "failed to mark controller CONNECTING\n"); + result = -EBUSY; + goto out_disable; + } + + result = nvme_init_ctrl_finish(&dev->ctrl, false); + if (result) + goto out_disable; + + nvme_dbbuf_dma_alloc(dev); + + result = nvme_setup_host_mem(dev); + if (result < 0) + goto out_disable; + + result = nvme_setup_io_queues(dev); + if (result) + goto out_disable; + + if (dev->online_queues > 1) { + nvme_pci_alloc_tag_set(dev); + nvme_dbbuf_set(dev); + } else { + dev_warn(dev->ctrl.device, "IO queues not created\n"); + } + + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { + dev_warn(dev->ctrl.device, + "failed to mark controller live state\n"); + result = -ENODEV; + goto out_disable; + } + pci_set_drvdata(pdev, dev); - nvme_reset_ctrl(&dev->ctrl); - async_schedule(nvme_async_probe, dev); + nvme_start_ctrl(&dev->ctrl); + nvme_put_ctrl(&dev->ctrl); return 0; +out_disable: + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + nvme_dev_disable(dev, true); + nvme_free_host_mem(dev); + nvme_dev_remove_admin(dev); + nvme_dbbuf_dma_free(dev); + nvme_free_queues(dev, 0); +out_release_iod_mempool: + mempool_destroy(dev->iod_mempool); out_release_prp_pools: nvme_release_prp_pools(dev); out_dev_unmap: @@ -3560,11 +3586,12 @@ static struct pci_driver nvme_driver = { .probe = nvme_probe, .remove = nvme_remove, .shutdown = nvme_shutdown, -#ifdef CONFIG_PM_SLEEP .driver = { - .pm = &nvme_dev_pm_ops, - }, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, +#ifdef CONFIG_PM_SLEEP + .pm = &nvme_dev_pm_ops, #endif + }, .sriov_configure = pci_sriov_configure_simple, .err_handler = &nvme_err_handler, }; -- cgit v1.2.3 From c7c16c5b196772aba1d99c4c0b505fe99a6fbba8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 15:42:57 +0100 Subject: nvme-pci: don't unbind the driver on reset failure Unbind a device driver when a reset fails is very unusual behavior. Just shut the controller down and leave it in dead state if we fail to reset it. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/pci.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2521c00a0af6..d394498d96de 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -130,7 +130,6 @@ struct nvme_dev { u32 db_stride; void __iomem *bar; unsigned long bar_mapped_size; - struct work_struct remove_work; struct mutex shutdown_lock; bool subsystem; u64 cmb_size; @@ -2797,20 +2796,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) kfree(dev); } -static void nvme_remove_dead_ctrl(struct nvme_dev *dev) -{ - /* - * Set state to deleting now to avoid blocking nvme_wait_reset(), which - * may be holding this pci_dev's device lock. - */ - nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); - nvme_get_ctrl(&dev->ctrl); - nvme_dev_disable(dev, false); - nvme_mark_namespaces_dead(&dev->ctrl); - if (!queue_work(nvme_wq, &dev->remove_work)) - nvme_put_ctrl(&dev->ctrl); -} - static void nvme_reset_work(struct work_struct *work) { struct nvme_dev *dev = @@ -2901,20 +2886,16 @@ static void nvme_reset_work(struct work_struct *work) out_unlock: mutex_unlock(&dev->shutdown_lock); out: - if (result) - dev_warn(dev->ctrl.device, - "Removing after probe failure status: %d\n", result); - nvme_remove_dead_ctrl(dev); -} - -static void nvme_remove_dead_ctrl_work(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); - struct pci_dev *pdev = to_pci_dev(dev->dev); - - if (pci_get_drvdata(pdev)) - device_release_driver(&pdev->dev); - nvme_put_ctrl(&dev->ctrl); + /* + * Set state to deleting now to avoid blocking nvme_wait_reset(), which + * may be holding this pci_dev's device lock. + */ + dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n", + result); + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + nvme_dev_disable(dev, true); + nvme_mark_namespaces_dead(&dev->ctrl); + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); } static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) @@ -3052,7 +3033,6 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, if (!dev) return NULL; INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); - INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); mutex_init(&dev->shutdown_lock); dev->nr_write_queues = write_queues; -- cgit v1.2.3 From 0a7ce375f83f4ade7c2a835444093b6870fb8257 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:05 +0200 Subject: nvme-auth: rename __nvme_auth_[reset|free] to nvme_auth[reset|free]_dhchap nvme_auth_[reset|free] operate on the controller while __nvme_auth_[reset|free] operate on a chap struct (which maps to a queue context). Rename it for clarity. Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index c8a6db7c4498..d45333268fcf 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -654,7 +654,7 @@ gen_sesskey: return 0; } -static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap) +static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap) { kfree_sensitive(chap->host_response); chap->host_response = NULL; @@ -676,9 +676,9 @@ static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap) memset(chap->c2, 0, sizeof(chap->c2)); } -static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap) +static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) { - __nvme_auth_reset(chap); + nvme_auth_reset_dhchap(chap); if (chap->shash_tfm) crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) @@ -868,7 +868,7 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) dev_dbg(ctrl->device, "qid %d: re-using context\n", qid); mutex_unlock(&ctrl->dhchap_auth_mutex); flush_work(&chap->auth_work); - __nvme_auth_reset(chap); + nvme_auth_reset_dhchap(chap); queue_work(nvme_wq, &chap->auth_work); return 0; } @@ -928,7 +928,7 @@ void nvme_auth_reset(struct nvme_ctrl *ctrl) list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { mutex_unlock(&ctrl->dhchap_auth_mutex); flush_work(&chap->auth_work); - __nvme_auth_reset(chap); + nvme_auth_reset_dhchap(chap); } mutex_unlock(&ctrl->dhchap_auth_mutex); } @@ -1002,7 +1002,7 @@ void nvme_auth_free(struct nvme_ctrl *ctrl) list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) { list_del_init(&chap->entry); flush_work(&chap->auth_work); - __nvme_auth_free(chap); + nvme_auth_free_dhchap(chap); } mutex_unlock(&ctrl->dhchap_auth_mutex); if (ctrl->host_key) { -- cgit v1.2.3 From 0c999e69c40a87285f910c400b550fad866e99d0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:06 +0200 Subject: nvme-auth: rename authentication work elements Use nvme_ctrl_auth_work and nvme_queue_auth_work for better readability. Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index d45333268fcf..e3e801e2b78d 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -691,7 +691,7 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) kfree(chap); } -static void __nvme_auth_work(struct work_struct *work) +static void nvme_queue_auth_work(struct work_struct *work) { struct nvme_dhchap_queue_context *chap = container_of(work, struct nvme_dhchap_queue_context, auth_work); @@ -893,7 +893,7 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) return -ENOMEM; } - INIT_WORK(&chap->auth_work, __nvme_auth_work); + INIT_WORK(&chap->auth_work, nvme_queue_auth_work); list_add(&chap->entry, &ctrl->dhchap_auth_list); mutex_unlock(&ctrl->dhchap_auth_mutex); queue_work(nvme_wq, &chap->auth_work); @@ -934,7 +934,7 @@ void nvme_auth_reset(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_auth_reset); -static void nvme_dhchap_auth_work(struct work_struct *work) +static void nvme_ctrl_auth_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, dhchap_auth_work); @@ -973,7 +973,7 @@ static void nvme_dhchap_auth_work(struct work_struct *work) void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) { INIT_LIST_HEAD(&ctrl->dhchap_auth_list); - INIT_WORK(&ctrl->dhchap_auth_work, nvme_dhchap_auth_work); + INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work); mutex_init(&ctrl->dhchap_auth_mutex); if (!ctrl->opts) return; -- cgit v1.2.3 From 100b555bc204fc754108351676297805f5affa49 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:07 +0200 Subject: nvme-auth: remove symbol export from nvme_auth_reset Only the nvme module calls it. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index e3e801e2b78d..2f823c6b84fd 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -932,7 +932,6 @@ void nvme_auth_reset(struct nvme_ctrl *ctrl) } mutex_unlock(&ctrl->dhchap_auth_mutex); } -EXPORT_SYMBOL_GPL(nvme_auth_reset); static void nvme_ctrl_auth_work(struct work_struct *work) { -- cgit v1.2.3 From c7390f132a896ff1a3fa26ea2b0be4f9ceb9041e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:08 +0200 Subject: nvme-auth: don't re-authenticate if the controller is not LIVE The connect sequence will re-authenticate. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 2f823c6b84fd..4f2c8d0567bd 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -939,6 +939,13 @@ static void nvme_ctrl_auth_work(struct work_struct *work) container_of(work, struct nvme_ctrl, dhchap_auth_work); int ret, q; + /* + * If the ctrl is no connected, bail as reconnect will handle + * authentication. + */ + if (ctrl->state != NVME_CTRL_LIVE) + return; + /* Authenticate admin queue first */ ret = nvme_auth_negotiate(ctrl, 0); if (ret) { -- cgit v1.2.3 From f6b182fbd5c608bd6cbaaaee35b1325443f48043 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:09 +0200 Subject: nvme-auth: remove redundant buffer deallocations host_response, host_key, ctrl_key and sess_key are freed in nvme_auth_reset_dhchap which is called from nvme_auth_free_dhchap. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 4f2c8d0567bd..0d0542e33484 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -683,10 +683,6 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) crypto_free_kpp(chap->dh_tfm); - kfree_sensitive(chap->ctrl_key); - kfree_sensitive(chap->host_key); - kfree_sensitive(chap->sess_key); - kfree_sensitive(chap->host_response); kfree(chap->buf); kfree(chap); } -- cgit v1.2.3 From 193a8c7e5f1a8481841636cec9c185543ec5c759 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:10 +0200 Subject: nvme-auth: don't ignore key generation failures when initializing ctrl keys nvme_auth_generate_key can fail, don't ignore it upon initialization. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 19 +++++++++++++++---- drivers/nvme/host/core.c | 6 +++++- drivers/nvme/host/nvme.h | 7 +++++-- 3 files changed, 25 insertions(+), 7 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 0d0542e33484..d62862ef5b3f 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -972,15 +972,26 @@ static void nvme_ctrl_auth_work(struct work_struct *work) */ } -void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) +int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) { + int ret; + INIT_LIST_HEAD(&ctrl->dhchap_auth_list); INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work); mutex_init(&ctrl->dhchap_auth_mutex); if (!ctrl->opts) - return; - nvme_auth_generate_key(ctrl->opts->dhchap_secret, &ctrl->host_key); - nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key); + return 0; + ret = nvme_auth_generate_key(ctrl->opts->dhchap_secret, + &ctrl->host_key); + if (ret) + return ret; + ret = nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, + &ctrl->ctrl_key); + if (ret) { + nvme_auth_free_key(ctrl->host_key); + ctrl->host_key = NULL; + } + return ret; } EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6040a13d3e2d..3d6751cbf40e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5119,9 +5119,13 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); nvme_mpath_init_ctrl(ctrl); - nvme_auth_init_ctrl(ctrl); + ret = nvme_auth_init_ctrl(ctrl); + if (ret) + goto out_free_cdev; return 0; +out_free_cdev: + cdev_device_del(&ctrl->cdev, ctrl->device); out_free_name: nvme_put_ctrl(ctrl); kfree_const(ctrl->device->kobj.name); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 924ff80d85f6..47f96ab14c6a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1018,14 +1018,17 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) } #ifdef CONFIG_NVME_AUTH -void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl); +int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl); void nvme_auth_stop(struct nvme_ctrl *ctrl); int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid); int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid); void nvme_auth_reset(struct nvme_ctrl *ctrl); void nvme_auth_free(struct nvme_ctrl *ctrl); #else -static inline void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) {}; +static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) +{ + return 0; +} static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {}; static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) { -- cgit v1.2.3 From 01604350e14560d4d69323eb1ba12a257a643ea8 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:11 +0200 Subject: nvme-auth: don't override ctrl keys before validation Replace ctrl ctrl_key/host_key only after nvme_auth_generate_key is successful. Also, this fixes a bug where the keys are leaked. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3d6751cbf40e..bad55fe0de7d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3776,13 +3776,17 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, memcpy(dhchap_secret, buf, count); nvme_auth_stop(ctrl); if (strcmp(dhchap_secret, opts->dhchap_secret)) { + struct nvme_dhchap_key *key, *host_key; int ret; - ret = nvme_auth_generate_key(dhchap_secret, &ctrl->host_key); + ret = nvme_auth_generate_key(dhchap_secret, &key); if (ret) return ret; kfree(opts->dhchap_secret); opts->dhchap_secret = dhchap_secret; + host_key = ctrl->host_key; + ctrl->host_key = key; + nvme_auth_free_key(host_key); /* Key has changed; re-authentication with new key */ nvme_auth_reset(ctrl); } @@ -3826,13 +3830,17 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, memcpy(dhchap_secret, buf, count); nvme_auth_stop(ctrl); if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) { + struct nvme_dhchap_key *key, *ctrl_key; int ret; - ret = nvme_auth_generate_key(dhchap_secret, &ctrl->ctrl_key); + ret = nvme_auth_generate_key(dhchap_secret, &key); if (ret) return ret; kfree(opts->dhchap_ctrl_secret); opts->dhchap_ctrl_secret = dhchap_secret; + ctrl_key = ctrl->ctrl_key; + ctrl->ctrl_key = key; + nvme_auth_free_key(ctrl_key); /* Key has changed; re-authentication with new key */ nvme_auth_reset(ctrl); } -- cgit v1.2.3 From bfc4068e1e55e30a86f0e82e15163a60f99a894d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:12 +0200 Subject: nvme-auth: remove redundant if statement No one passes NVME_QID_ANY to nvme_auth_negotiate. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index d62862ef5b3f..e7e4a00ee37e 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -874,7 +874,7 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) mutex_unlock(&ctrl->dhchap_auth_mutex); return -ENOMEM; } - chap->qid = (qid == NVME_QID_ANY) ? 0 : qid; + chap->qid = qid; chap->ctrl = ctrl; /* -- cgit v1.2.3 From b7d604cae8f6edde53ac8aa9038ee154be562eb5 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:13 +0200 Subject: nvme-auth: don't keep long lived 4k dhchap buffer dhchap structure is per-queue, it is wasteful to keep it for the entire lifetime of the queue. Allocate it dynamically and get rid of it after authentication. We don't need kzalloc because all accessors are clearing it before writing to it. Also, remove redundant chap buf_size which is always 4096, use a define instead. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index e7e4a00ee37e..0812eb9a5d0b 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -13,6 +13,8 @@ #include "fabrics.h" #include +#define CHAP_BUF_SIZE 4096 + struct nvme_dhchap_queue_context { struct list_head entry; struct work_struct auth_work; @@ -20,7 +22,6 @@ struct nvme_dhchap_queue_context { struct crypto_shash *shash_tfm; struct crypto_kpp *dh_tfm; void *buf; - size_t buf_size; int qid; int error; u32 s1; @@ -112,7 +113,7 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl, struct nvmf_auth_dhchap_negotiate_data *data = chap->buf; size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol); - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return -EINVAL; } @@ -147,7 +148,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, const char *gid_name = nvme_auth_dhgroup_name(data->dhgid); const char *hmac_name, *kpp_name; - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return NVME_SC_INVALID_FIELD; } @@ -302,7 +303,7 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, if (chap->host_key_len) size += chap->host_key_len; - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return -EINVAL; } @@ -347,7 +348,7 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl, if (ctrl->ctrl_key) size += chap->hash_len; - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return NVME_SC_INVALID_FIELD; } @@ -674,6 +675,8 @@ static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap) chap->transaction = 0; memset(chap->c1, 0, sizeof(chap->c1)); memset(chap->c2, 0, sizeof(chap->c2)); + kfree(chap->buf); + chap->buf = NULL; } static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) @@ -683,7 +686,6 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) crypto_free_kpp(chap->dh_tfm); - kfree(chap->buf); kfree(chap); } @@ -695,6 +697,16 @@ static void nvme_queue_auth_work(struct work_struct *work) size_t tl; int ret = 0; + /* + * Allocate a large enough buffer for the entire negotiation: + * 4k is enough to ffdhe8192. + */ + chap->buf = kmalloc(CHAP_BUF_SIZE, GFP_KERNEL); + if (!chap->buf) { + chap->error = -ENOMEM; + return; + } + chap->transaction = ctrl->transaction++; /* DH-HMAC-CHAP Step 1: send negotiate */ @@ -716,8 +728,9 @@ static void nvme_queue_auth_work(struct work_struct *work) dev_dbg(ctrl->device, "%s: qid %d receive challenge\n", __func__, chap->qid); - memset(chap->buf, 0, chap->buf_size); - ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false); + memset(chap->buf, 0, CHAP_BUF_SIZE); + ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, CHAP_BUF_SIZE, + false); if (ret) { dev_warn(ctrl->device, "qid %d failed to receive challenge, %s %d\n", @@ -779,8 +792,9 @@ static void nvme_queue_auth_work(struct work_struct *work) dev_dbg(ctrl->device, "%s: qid %d receive success1\n", __func__, chap->qid); - memset(chap->buf, 0, chap->buf_size); - ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false); + memset(chap->buf, 0, CHAP_BUF_SIZE); + ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, CHAP_BUF_SIZE, + false); if (ret) { dev_warn(ctrl->device, "qid %d failed to receive success1, %s %d\n", @@ -876,19 +890,6 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) } chap->qid = qid; chap->ctrl = ctrl; - - /* - * Allocate a large enough buffer for the entire negotiation: - * 4k should be enough to ffdhe8192. - */ - chap->buf_size = 4096; - chap->buf = kzalloc(chap->buf_size, GFP_KERNEL); - if (!chap->buf) { - mutex_unlock(&ctrl->dhchap_auth_mutex); - kfree(chap); - return -ENOMEM; - } - INIT_WORK(&chap->auth_work, nvme_queue_auth_work); list_add(&chap->entry, &ctrl->dhchap_auth_list); mutex_unlock(&ctrl->dhchap_auth_mutex); -- cgit v1.2.3 From e481fc0a377798976d5c3044c7f10c86a8372b92 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 15 Nov 2022 17:08:06 +0100 Subject: nvme-auth: guarantee dhchap buffers under memory pressure We want to guarantee that we have chap buffers when a controller reconnects under memory pressure. Add a mempool specifically for that. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 30 ++++++++++++++++++++++++++++-- drivers/nvme/host/core.c | 6 ++++++ drivers/nvme/host/nvme.h | 9 +++++++++ 3 files changed, 43 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 0812eb9a5d0b..1b44676b6155 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -14,6 +14,8 @@ #include #define CHAP_BUF_SIZE 4096 +static struct kmem_cache *nvme_chap_buf_cache; +static mempool_t *nvme_chap_buf_pool; struct nvme_dhchap_queue_context { struct list_head entry; @@ -675,7 +677,7 @@ static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap) chap->transaction = 0; memset(chap->c1, 0, sizeof(chap->c1)); memset(chap->c2, 0, sizeof(chap->c2)); - kfree(chap->buf); + mempool_free(chap->buf, nvme_chap_buf_pool); chap->buf = NULL; } @@ -701,7 +703,7 @@ static void nvme_queue_auth_work(struct work_struct *work) * Allocate a large enough buffer for the entire negotiation: * 4k is enough to ffdhe8192. */ - chap->buf = kmalloc(CHAP_BUF_SIZE, GFP_KERNEL); + chap->buf = mempool_alloc(nvme_chap_buf_pool, GFP_KERNEL); if (!chap->buf) { chap->error = -ENOMEM; return; @@ -1029,3 +1031,27 @@ void nvme_auth_free(struct nvme_ctrl *ctrl) } } EXPORT_SYMBOL_GPL(nvme_auth_free); + +int __init nvme_init_auth(void) +{ + nvme_chap_buf_cache = kmem_cache_create("nvme-chap-buf-cache", + CHAP_BUF_SIZE, 0, SLAB_HWCACHE_ALIGN, NULL); + if (!nvme_chap_buf_cache) + return -ENOMEM; + + nvme_chap_buf_pool = mempool_create(16, mempool_alloc_slab, + mempool_free_slab, nvme_chap_buf_cache); + if (!nvme_chap_buf_pool) + goto err_destroy_chap_buf_cache; + + return 0; +err_destroy_chap_buf_cache: + kmem_cache_destroy(nvme_chap_buf_cache); + return -ENOMEM; +} + +void __exit nvme_exit_auth(void) +{ + mempool_destroy(nvme_chap_buf_pool); + kmem_cache_destroy(nvme_chap_buf_cache); +} diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bad55fe0de7d..cb5e6d07a5f8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5347,8 +5347,13 @@ static int __init nvme_core_init(void) goto unregister_generic_ns; } + result = nvme_init_auth(); + if (result) + goto destroy_ns_chr; return 0; +destroy_ns_chr: + class_destroy(nvme_ns_chr_class); unregister_generic_ns: unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); destroy_subsys_class: @@ -5369,6 +5374,7 @@ out: static void __exit nvme_core_exit(void) { + nvme_exit_auth(); class_destroy(nvme_ns_chr_class); class_destroy(nvme_subsys_class); class_destroy(nvme_class); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 47f96ab14c6a..ebd67e7dc11e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1018,6 +1018,8 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) } #ifdef CONFIG_NVME_AUTH +int __init nvme_init_auth(void); +void __exit nvme_exit_auth(void); int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl); void nvme_auth_stop(struct nvme_ctrl *ctrl); int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid); @@ -1029,6 +1031,13 @@ static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) { return 0; } +static inline int __init nvme_init_auth(void) +{ + return 0; +} +static inline void __exit nvme_exit_auth(void) +{ +} static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {}; static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) { -- cgit v1.2.3 From 8d1c1904e94757b78c28fbbef9285e4101d86ee9 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:15 +0200 Subject: nvme-auth: clear sensitive info right after authentication completes We don't want to keep authentication sensitive info in memory for unlimited amount of time. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 1b44676b6155..04cf183d9519 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -912,6 +912,8 @@ int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid) mutex_unlock(&ctrl->dhchap_auth_mutex); flush_work(&chap->auth_work); ret = chap->error; + /* clear sensitive info */ + nvme_auth_reset_dhchap(chap); return ret; } mutex_unlock(&ctrl->dhchap_auth_mutex); -- cgit v1.2.3 From 96df31839354c2bb9d2f0d51eb6c6f6b762fd150 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:16 +0200 Subject: nvme-auth: remove redundant deallocations These are now redundant as the dhchap context is removed after authentication completes. Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 04cf183d9519..3bffd22221c9 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -200,12 +200,6 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, return NVME_SC_AUTH_REQUIRED; } - /* Reset host response if the hash had been changed */ - if (chap->hash_id != data->hashid) { - kfree(chap->host_response); - chap->host_response = NULL; - } - chap->hash_id = data->hashid; chap->hash_len = data->hl; dev_dbg(ctrl->device, "qid %d: selected hash %s\n", @@ -222,14 +216,6 @@ select_kpp: return NVME_SC_AUTH_REQUIRED; } - /* Clear host and controller key to avoid accidental reuse */ - kfree_sensitive(chap->host_key); - chap->host_key = NULL; - chap->host_key_len = 0; - kfree_sensitive(chap->ctrl_key); - chap->ctrl_key = NULL; - chap->ctrl_key_len = 0; - if (chap->dhgroup_id == data->dhgid && (data->dhgid == NVME_AUTH_DHGROUP_NULL || chap->dh_tfm)) { dev_dbg(ctrl->device, @@ -624,9 +610,6 @@ static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl, if (ret) { dev_dbg(ctrl->device, "failed to generate public key, error %d\n", ret); - kfree(chap->host_key); - chap->host_key = NULL; - chap->host_key_len = 0; chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return ret; } @@ -646,9 +629,6 @@ gen_sesskey: if (ret) { dev_dbg(ctrl->device, "failed to generate shared secret, error %d\n", ret); - kfree_sensitive(chap->sess_key); - chap->sess_key = NULL; - chap->sess_key_len = 0; chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return ret; } -- cgit v1.2.3 From e8a420efb637f52c586596283d6fd96f2a7ecb5c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:17 +0200 Subject: nvme-auth: no need to reset chap contexts on re-authentication Now that the chap context is reset upon completion, this is no longer needed. Also remove nvme_auth_reset as no callers are left. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 13 ------------- drivers/nvme/host/core.c | 4 ---- drivers/nvme/host/nvme.h | 1 - 3 files changed, 18 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 3bffd22221c9..d1d89920d03c 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -901,19 +901,6 @@ int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid) } EXPORT_SYMBOL_GPL(nvme_auth_wait); -void nvme_auth_reset(struct nvme_ctrl *ctrl) -{ - struct nvme_dhchap_queue_context *chap; - - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { - mutex_unlock(&ctrl->dhchap_auth_mutex); - flush_work(&chap->auth_work); - nvme_auth_reset_dhchap(chap); - } - mutex_unlock(&ctrl->dhchap_auth_mutex); -} - static void nvme_ctrl_auth_work(struct work_struct *work) { struct nvme_ctrl *ctrl = diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cb5e6d07a5f8..2944d9b565a2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3787,8 +3787,6 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, host_key = ctrl->host_key; ctrl->host_key = key; nvme_auth_free_key(host_key); - /* Key has changed; re-authentication with new key */ - nvme_auth_reset(ctrl); } /* Start re-authentication */ dev_info(ctrl->device, "re-authenticating controller\n"); @@ -3841,8 +3839,6 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, ctrl_key = ctrl->ctrl_key; ctrl->ctrl_key = key; nvme_auth_free_key(ctrl_key); - /* Key has changed; re-authentication with new key */ - nvme_auth_reset(ctrl); } /* Start re-authentication */ dev_info(ctrl->device, "re-authenticating controller\n"); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ebd67e7dc11e..e431ec348d52 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1024,7 +1024,6 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl); void nvme_auth_stop(struct nvme_ctrl *ctrl); int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid); int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid); -void nvme_auth_reset(struct nvme_ctrl *ctrl); void nvme_auth_free(struct nvme_ctrl *ctrl); #else static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) -- cgit v1.2.3 From 546dea18c99928bb81392de63092da0e25d07b10 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:18 +0200 Subject: nvme-auth: check chap ctrl_key once constructed ctrl ctrl_key member may be overwritten from a sysfs context driven by the user. Once a queue local copy was created, use that instead to minimize checks on a shared resource. Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index d1d89920d03c..781a6003109d 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -333,7 +333,7 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl, struct nvmf_auth_dhchap_success1_data *data = chap->buf; size_t size = sizeof(*data); - if (ctrl->ctrl_key) + if (chap->ctrl_key) size += chap->hash_len; if (size > CHAP_BUF_SIZE) { @@ -811,7 +811,7 @@ static void nvme_queue_auth_work(struct work_struct *work) goto fail2; } - if (ctrl->ctrl_key) { + if (chap->ctrl_key) { /* DH-HMAC-CHAP Step 5: send success2 */ dev_dbg(ctrl->device, "%s: qid %d send success2\n", __func__, chap->qid); -- cgit v1.2.3 From aa36d711e945e65fa87410927800f01878a8faed Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:20 +0200 Subject: nvme-auth: convert dhchap_auth_list to an array We know exactly how many dhchap contexts we will need, there is no need to hold a list that we need to protect with a mutex. Convert to a dynamically allocated array. And dhchap_context access state is maintained by the chap itself. Make dhchap_auth_mutex protect only the ctrl host_key and ctrl_key in a fine-grained lock such that there is no long lasting acquisition of the lock and no need to take/release this lock when flushing authentication works. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 118 +++++++++++++++++++++++++---------------------- drivers/nvme/host/core.c | 4 ++ drivers/nvme/host/nvme.h | 2 +- 3 files changed, 69 insertions(+), 55 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 781a6003109d..24726683d4bc 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -50,6 +50,12 @@ struct nvme_dhchap_queue_context { #define nvme_auth_queue_from_qid(ctrl, qid) \ (qid == 0) ? (ctrl)->fabrics_q : (ctrl)->connect_q +static inline int ctrl_max_dhchaps(struct nvme_ctrl *ctrl) +{ + return ctrl->opts->nr_io_queues + ctrl->opts->nr_write_queues + + ctrl->opts->nr_poll_queues + 1; +} + static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid, void *data, size_t data_len, bool auth_send) { @@ -510,6 +516,7 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, ret = PTR_ERR(ctrl_response); return ret; } + ret = crypto_shash_setkey(chap->shash_tfm, ctrl_response, ctrl->ctrl_key->len); if (ret) { @@ -668,7 +675,6 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) crypto_free_kpp(chap->dh_tfm); - kfree(chap); } static void nvme_queue_auth_work(struct work_struct *work) @@ -748,11 +754,14 @@ static void nvme_queue_auth_work(struct work_struct *work) dev_dbg(ctrl->device, "%s: qid %d host response\n", __func__, chap->qid); + mutex_lock(&ctrl->dhchap_auth_mutex); ret = nvme_auth_dhchap_setup_host_response(ctrl, chap); if (ret) { + mutex_unlock(&ctrl->dhchap_auth_mutex); chap->error = ret; goto fail2; } + mutex_unlock(&ctrl->dhchap_auth_mutex); /* DH-HMAC-CHAP Step 3: send reply */ dev_dbg(ctrl->device, "%s: qid %d send reply\n", @@ -793,16 +802,19 @@ static void nvme_queue_auth_work(struct work_struct *work) return; } + mutex_lock(&ctrl->dhchap_auth_mutex); if (ctrl->ctrl_key) { dev_dbg(ctrl->device, "%s: qid %d controller response\n", __func__, chap->qid); ret = nvme_auth_dhchap_setup_ctrl_response(ctrl, chap); if (ret) { + mutex_unlock(&ctrl->dhchap_auth_mutex); chap->error = ret; goto fail2; } } + mutex_unlock(&ctrl->dhchap_auth_mutex); ret = nvme_auth_process_dhchap_success1(ctrl, chap); if (ret) { @@ -852,29 +864,8 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) return -ENOKEY; } - mutex_lock(&ctrl->dhchap_auth_mutex); - /* Check if the context is already queued */ - list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { - WARN_ON(!chap->buf); - if (chap->qid == qid) { - dev_dbg(ctrl->device, "qid %d: re-using context\n", qid); - mutex_unlock(&ctrl->dhchap_auth_mutex); - flush_work(&chap->auth_work); - nvme_auth_reset_dhchap(chap); - queue_work(nvme_wq, &chap->auth_work); - return 0; - } - } - chap = kzalloc(sizeof(*chap), GFP_KERNEL); - if (!chap) { - mutex_unlock(&ctrl->dhchap_auth_mutex); - return -ENOMEM; - } - chap->qid = qid; - chap->ctrl = ctrl; - INIT_WORK(&chap->auth_work, nvme_queue_auth_work); - list_add(&chap->entry, &ctrl->dhchap_auth_list); - mutex_unlock(&ctrl->dhchap_auth_mutex); + chap = &ctrl->dhchap_ctxs[qid]; + cancel_work_sync(&chap->auth_work); queue_work(nvme_wq, &chap->auth_work); return 0; } @@ -885,19 +876,12 @@ int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid) struct nvme_dhchap_queue_context *chap; int ret; - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { - if (chap->qid != qid) - continue; - mutex_unlock(&ctrl->dhchap_auth_mutex); - flush_work(&chap->auth_work); - ret = chap->error; - /* clear sensitive info */ - nvme_auth_reset_dhchap(chap); - return ret; - } - mutex_unlock(&ctrl->dhchap_auth_mutex); - return -ENXIO; + chap = &ctrl->dhchap_ctxs[qid]; + flush_work(&chap->auth_work); + ret = chap->error; + /* clear sensitive info */ + nvme_auth_reset_dhchap(chap); + return ret; } EXPORT_SYMBOL_GPL(nvme_auth_wait); @@ -946,11 +930,11 @@ static void nvme_ctrl_auth_work(struct work_struct *work) int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) { - int ret; + struct nvme_dhchap_queue_context *chap; + int i, ret; - INIT_LIST_HEAD(&ctrl->dhchap_auth_list); - INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work); mutex_init(&ctrl->dhchap_auth_mutex); + INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work); if (!ctrl->opts) return 0; ret = nvme_auth_generate_key(ctrl->opts->dhchap_secret, @@ -959,37 +943,63 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) return ret; ret = nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key); - if (ret) { - nvme_auth_free_key(ctrl->host_key); - ctrl->host_key = NULL; + if (ret) + goto err_free_dhchap_secret; + + if (!ctrl->opts->dhchap_secret && !ctrl->opts->dhchap_ctrl_secret) + return ret; + + ctrl->dhchap_ctxs = kvcalloc(ctrl_max_dhchaps(ctrl), + sizeof(*chap), GFP_KERNEL); + if (!ctrl->dhchap_ctxs) { + ret = -ENOMEM; + goto err_free_dhchap_ctrl_secret; } + + for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) { + chap = &ctrl->dhchap_ctxs[i]; + chap->qid = i; + chap->ctrl = ctrl; + INIT_WORK(&chap->auth_work, nvme_queue_auth_work); + } + + return 0; +err_free_dhchap_ctrl_secret: + nvme_auth_free_key(ctrl->ctrl_key); + ctrl->ctrl_key = NULL; +err_free_dhchap_secret: + nvme_auth_free_key(ctrl->host_key); + ctrl->host_key = NULL; return ret; } EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl); void nvme_auth_stop(struct nvme_ctrl *ctrl) { - struct nvme_dhchap_queue_context *chap = NULL, *tmp; + struct nvme_dhchap_queue_context *chap; + int i; cancel_work_sync(&ctrl->dhchap_auth_work); - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) + for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) { + chap = &ctrl->dhchap_ctxs[i]; cancel_work_sync(&chap->auth_work); - mutex_unlock(&ctrl->dhchap_auth_mutex); + } } EXPORT_SYMBOL_GPL(nvme_auth_stop); void nvme_auth_free(struct nvme_ctrl *ctrl) { - struct nvme_dhchap_queue_context *chap = NULL, *tmp; + struct nvme_dhchap_queue_context *chap; + int i; - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) { - list_del_init(&chap->entry); - flush_work(&chap->auth_work); - nvme_auth_free_dhchap(chap); + if (ctrl->dhchap_ctxs) { + for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) { + chap = &ctrl->dhchap_ctxs[i]; + flush_work(&chap->auth_work); + nvme_auth_free_dhchap(chap); + } + kfree(ctrl->dhchap_ctxs); } - mutex_unlock(&ctrl->dhchap_auth_mutex); if (ctrl->host_key) { nvme_auth_free_key(ctrl->host_key); ctrl->host_key = NULL; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2944d9b565a2..bc0eede419cb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3785,7 +3785,9 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, kfree(opts->dhchap_secret); opts->dhchap_secret = dhchap_secret; host_key = ctrl->host_key; + mutex_lock(&ctrl->dhchap_auth_mutex); ctrl->host_key = key; + mutex_unlock(&ctrl->dhchap_auth_mutex); nvme_auth_free_key(host_key); } /* Start re-authentication */ @@ -3837,7 +3839,9 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, kfree(opts->dhchap_ctrl_secret); opts->dhchap_ctrl_secret = dhchap_secret; ctrl_key = ctrl->ctrl_key; + mutex_lock(&ctrl->dhchap_auth_mutex); ctrl->ctrl_key = key; + mutex_unlock(&ctrl->dhchap_auth_mutex); nvme_auth_free_key(ctrl_key); } /* Start re-authentication */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index e431ec348d52..ef23c6c6e2a3 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -337,8 +337,8 @@ struct nvme_ctrl { #ifdef CONFIG_NVME_AUTH struct work_struct dhchap_auth_work; - struct list_head dhchap_auth_list; struct mutex dhchap_auth_mutex; + struct nvme_dhchap_queue_context *dhchap_ctxs; struct nvme_dhchap_key *host_key; struct nvme_dhchap_key *ctrl_key; u16 transaction; -- cgit v1.2.3 From a2a00d2a66e480c8b225012db538dca6e389a92d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:21 +0200 Subject: nvme-auth: remove redundant auth_work flush only ctrl deletion calls nvme_auth_free, which was stopped prior in the teardown stage, so there is no possibility that it should ever run when nvme_auth_free is called. As a result, we can remove a local chap pointer variable. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 24726683d4bc..c9b3f0056afc 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -989,15 +989,11 @@ EXPORT_SYMBOL_GPL(nvme_auth_stop); void nvme_auth_free(struct nvme_ctrl *ctrl) { - struct nvme_dhchap_queue_context *chap; int i; if (ctrl->dhchap_ctxs) { - for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) { - chap = &ctrl->dhchap_ctxs[i]; - flush_work(&chap->auth_work); - nvme_auth_free_dhchap(chap); - } + for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) + nvme_auth_free_dhchap(&ctrl->dhchap_ctxs[i]); kfree(ctrl->dhchap_ctxs); } if (ctrl->host_key) { -- cgit v1.2.3 From d061a1bd1fff5332ee48601947abb414007a9610 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:22 +0200 Subject: nvme-auth: have dhchap_auth_work wait for queues auth to complete It triggered the queue authentication work elements in parallel, but the ctrl authentication work itself completes when all of them completes. Hence wait for queues auth completions. This also makes nvme_auth_stop simply a sync cancel of ctrl dhchap_auth_work. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index c9b3f0056afc..bb0abbe4491c 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -926,6 +926,12 @@ static void nvme_ctrl_auth_work(struct work_struct *work) * Failure is a soft-state; credentials remain valid until * the controller terminates the connection. */ + for (q = 1; q < ctrl->queue_count; q++) { + ret = nvme_auth_wait(ctrl, q); + if (ret) + dev_warn(ctrl->device, + "qid %d: authentication failed\n", q); + } } int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) @@ -976,14 +982,7 @@ EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl); void nvme_auth_stop(struct nvme_ctrl *ctrl) { - struct nvme_dhchap_queue_context *chap; - int i; - cancel_work_sync(&ctrl->dhchap_auth_work); - for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) { - chap = &ctrl->dhchap_ctxs[i]; - cancel_work_sync(&chap->auth_work); - } } EXPORT_SYMBOL_GPL(nvme_auth_stop); -- cgit v1.2.3 From 1f1a4f89562d3b33b6ca4fc8a4f3bd4cd35ab4ea Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:23 +0200 Subject: nvme-tcp: stop auth work after tearing down queues in error recovery when starting error recovery there might be a authentication work running, and it involves I/O commands. Given the controller is tearing down there is no chance for the I/O to complete other than timing out which may unnecessarily take a full io timeout. So first tear down the queues, fail/cancel all inflight I/O (including potentially authentication) and only then stop authentication. This ensures that failover is not stalled due to blocked authentication I/O. Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4f8584657bb7..3fedddf0aedc 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2119,7 +2119,6 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) struct nvme_tcp_ctrl, err_work); struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; - nvme_auth_stop(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); nvme_tcp_teardown_io_queues(ctrl, false); @@ -2127,6 +2126,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) nvme_start_queues(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); nvme_start_admin_queue(ctrl); + nvme_auth_stop(ctrl); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ -- cgit v1.2.3 From 91c11d5f32547a08d462934246488fe72f3d44c3 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:24 +0200 Subject: nvme-rdma: stop auth work after tearing down queues in error recovery when starting error recovery there might be a authentication work running, and it involves I/O commands. Given the controller is tearing down there is no chance for the I/O to complete other than timing out which may unnecessarily take a full io timeout. So first tear down the queues, fail/cancel all inflight I/O (including potentially authentication) and only then stop authentication. This ensures that failover is not stalled due to blocked authentication I/O. Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ccd45e5b3298..3d78278e47c5 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1153,13 +1153,13 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, err_work); - nvme_auth_stop(&ctrl->ctrl); nvme_stop_keep_alive(&ctrl->ctrl); flush_work(&ctrl->ctrl.async_event_work); nvme_rdma_teardown_io_queues(ctrl, false); nvme_start_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); nvme_start_admin_queue(&ctrl->ctrl); + nvme_auth_stop(&ctrl->ctrl); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ -- cgit v1.2.3 From 811f4de0344d48a33a94d5c7d31a0ef0490f5701 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Mon, 14 Nov 2022 17:23:59 -0700 Subject: nvme: avoid fallback to sequential scan due to transient issues Currently, if nvme_scan_ns_list fails, nvme_scan_work will fall back to a sequential scan. nvme_scan_ns_list can fail for a variety of reasons, e.g. a transient transport issue, and the resulting sequential scan can be extremely expensive on controllers reporting an NN value close to the maximum allowed (> 4 billion). Avoid sequential scans wherever possible by only falling back to them in two cases: - When the NVMe version supported (VS) value reported by the device is older than NVME_VS(1, 1, 0), before which support of Identify NS List not required. - When the Identify NS List command fails with the DNR bit set in the status. This is to accommodate (non-compliant) devices which report a VS value which implies support for Identify NS List, but nevertheless do not support the command. Such devices will most likely fail the command with the DNR bit set. The third case is when the device claims support for Identify NS List but the command fails with DNR not set. In such cases, fallback to sequential scan is potentially expensive and likely unnecessary, as a retry of the list scan should succeed. So this change skips the fallback in this third case. Signed-off-by: Uday Shankar Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bc0eede419cb..cba45bad689b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4460,9 +4460,6 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) u32 prev = 0; int ret = 0, i; - if (nvme_ctrl_limited_cns(ctrl)) - return -EOPNOTSUPP; - ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); if (!ns_list) return -ENOMEM; @@ -4570,8 +4567,18 @@ static void nvme_scan_work(struct work_struct *work) } mutex_lock(&ctrl->scan_lock); - if (nvme_scan_ns_list(ctrl) != 0) + if (nvme_ctrl_limited_cns(ctrl)) { nvme_scan_ns_sequential(ctrl); + } else { + /* + * Fall back to sequential scan if DNR is set to handle broken + * devices which should support Identify NS List (as per the VS + * they report) but don't actually support it. + */ + ret = nvme_scan_ns_list(ctrl); + if (ret > 0 && ret & NVME_SC_DNR) + nvme_scan_ns_sequential(ctrl); + } mutex_unlock(&ctrl->scan_lock); } -- cgit v1.2.3 From bcaf434b8f04e1ee82a8b1e1bce0de99fbff67fa Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Wed, 2 Nov 2022 18:17:08 +0100 Subject: nvme: return err on nvme_init_non_mdts_limits fail In nvme_init_non_mdts_limits function we were returning 0 when kzalloc failed; it now returns -ENOMEM. Fixes: 5befc7c26e5a ("nvme: implement non-mdts command limits") Signed-off-by: Joel Granados Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cba45bad689b..ca4d40996ac1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3078,7 +3078,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) id = kzalloc(sizeof(*id), GFP_KERNEL); if (!id) - return 0; + return -ENOMEM; c.identify.opcode = nvme_admin_identify; c.identify.cns = NVME_ID_CNS_CS_CTRL; -- cgit v1.2.3 From c58e28afb11f5cd3c7f8a27b3abb045d848467ac Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 14 Nov 2022 08:45:02 +0200 Subject: nvmet: fix a memory leak in nvmet_auth_set_key When changing dhchap secrets we need to release the old secrets as well. kmemleak complaint: -- unreferenced object 0xffff8c7f44ed8180 (size 64): comm "check", pid 7304, jiffies 4295686133 (age 72034.246s) hex dump (first 32 bytes): 44 48 48 43 2d 31 3a 30 30 3a 4c 64 4c 4f 64 71 DHHC-1:00:LdLOdq 79 56 69 67 77 48 55 32 6d 5a 59 4c 7a 35 59 38 yVigwHU2mZYLz5Y8 backtrace: [<00000000b6fc5071>] kstrdup+0x2e/0x60 [<00000000f0f4633f>] 0xffffffffc0e07ee6 [<0000000053006c05>] 0xffffffffc0dff783 [<00000000419ae922>] configfs_write_iter+0xb1/0x120 [<000000008183c424>] vfs_write+0x2be/0x3c0 [<000000009005a2a5>] ksys_write+0x5f/0xe0 [<00000000cd495c89>] do_syscall_64+0x38/0x90 [<00000000f2a84ac5>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: db1312dd9548 ("nvmet: implement basic In-Band Authentication") Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/auth.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index c4113b43dbfe..4dcddcf95279 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -45,9 +45,11 @@ int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, if (!dhchap_secret) return -ENOMEM; if (set_ctrl) { + kfree(host->dhchap_ctrl_secret); host->dhchap_ctrl_secret = strim(dhchap_secret); host->dhchap_ctrl_key_hash = key_hash; } else { + kfree(host->dhchap_secret); host->dhchap_secret = strim(dhchap_secret); host->dhchap_key_hash = key_hash; } -- cgit v1.2.3 From 9f27bd701d18f012646a06bc6c1b35d81f30359b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2022 11:22:14 +0100 Subject: nvme: rename the queue quiescing helpers Naming the nvme helpers that wrap the block quiesce functionality _start/_stop is rather confusing. Switch to using the quiesce naming used by the block layer instead. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/apple.c | 12 ++++++------ drivers/nvme/host/core.c | 24 ++++++++++++------------ drivers/nvme/host/fc.c | 16 ++++++++-------- drivers/nvme/host/nvme.h | 8 ++++---- drivers/nvme/host/pci.c | 16 ++++++++-------- drivers/nvme/host/rdma.c | 26 +++++++++++++------------- drivers/nvme/host/tcp.c | 28 ++++++++++++++-------------- drivers/nvme/target/loop.c | 6 +++--- 8 files changed, 68 insertions(+), 68 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index a85349a7e938..cab69516af5b 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -821,7 +821,7 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) if (!dead && shutdown && freeze) nvme_wait_freeze_timeout(&anv->ctrl, NVME_IO_TIMEOUT); - nvme_stop_queues(&anv->ctrl); + nvme_quiesce_io_queues(&anv->ctrl); if (!dead) { if (READ_ONCE(anv->ioq.enabled)) { @@ -837,7 +837,7 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) WRITE_ONCE(anv->ioq.enabled, false); WRITE_ONCE(anv->adminq.enabled, false); mb(); /* ensure that nvme_queue_rq() sees that enabled is cleared */ - nvme_stop_admin_queue(&anv->ctrl); + nvme_quiesce_admin_queue(&anv->ctrl); /* last chance to complete any requests before nvme_cancel_request */ spin_lock_irqsave(&anv->lock, flags); @@ -854,8 +854,8 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) * deadlocking blk-mq hot-cpu notifier. */ if (shutdown) { - nvme_start_queues(&anv->ctrl); - nvme_start_admin_queue(&anv->ctrl); + nvme_unquiesce_io_queues(&anv->ctrl); + nvme_unquiesce_admin_queue(&anv->ctrl); } } @@ -1093,7 +1093,7 @@ static void apple_nvme_reset_work(struct work_struct *work) dev_dbg(anv->dev, "Starting admin queue"); apple_nvme_init_queue(&anv->adminq); - nvme_start_admin_queue(&anv->ctrl); + nvme_unquiesce_admin_queue(&anv->ctrl); if (!nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_CONNECTING)) { dev_warn(anv->ctrl.device, @@ -1127,7 +1127,7 @@ static void apple_nvme_reset_work(struct work_struct *work) anv->ctrl.queue_count = nr_io_queues + 1; - nvme_start_queues(&anv->ctrl); + nvme_unquiesce_io_queues(&anv->ctrl); nvme_wait_freeze(&anv->ctrl); blk_mq_update_nr_hw_queues(&anv->tagset, 1); nvme_unfreeze(&anv->ctrl); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ca4d40996ac1..3195ae17df30 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4610,7 +4610,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) */ if (ctrl->state == NVME_CTRL_DEAD) { nvme_mark_namespaces_dead(ctrl); - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); } /* this is a no-op when called from the controller reset handler */ @@ -4737,7 +4737,7 @@ static void nvme_fw_act_work(struct work_struct *work) fw_act_timeout = jiffies + msecs_to_jiffies(admin_timeout * 1000); - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); while (nvme_ctrl_pp_status(ctrl)) { if (time_after(jiffies, fw_act_timeout)) { dev_warn(ctrl->device, @@ -4751,7 +4751,7 @@ static void nvme_fw_act_work(struct work_struct *work) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) return; - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); /* read FW slot information to clear the AER */ nvme_get_fw_slot_info(ctrl); @@ -4996,7 +4996,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); nvme_mpath_update(ctrl); } @@ -5213,37 +5213,37 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_freeze); -void nvme_stop_queues(struct nvme_ctrl *ctrl) +void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl) { if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags)) blk_mq_quiesce_tagset(ctrl->tagset); else blk_mq_wait_quiesce_done(ctrl->tagset); } -EXPORT_SYMBOL_GPL(nvme_stop_queues); +EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues); -void nvme_start_queues(struct nvme_ctrl *ctrl) +void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl) { if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags)) blk_mq_unquiesce_tagset(ctrl->tagset); } -EXPORT_SYMBOL_GPL(nvme_start_queues); +EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues); -void nvme_stop_admin_queue(struct nvme_ctrl *ctrl) +void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl) { if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) blk_mq_quiesce_queue(ctrl->admin_q); else blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set); } -EXPORT_SYMBOL_GPL(nvme_stop_admin_queue); +EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue); -void nvme_start_admin_queue(struct nvme_ctrl *ctrl) +void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl) { if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) blk_mq_unquiesce_queue(ctrl->admin_q); } -EXPORT_SYMBOL_GPL(nvme_start_admin_queue); +EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue); void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 1f9f4075794b..aa5fb56c07d9 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2392,7 +2392,7 @@ nvme_fc_ctrl_free(struct kref *ref) list_del(&ctrl->ctrl_list); spin_unlock_irqrestore(&ctrl->rport->lock, flags); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); nvme_remove_admin_tag_set(&ctrl->ctrl); kfree(ctrl->queues); @@ -2493,13 +2493,13 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) * (but with error status). */ if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); blk_mq_tagset_wait_completed_request(&ctrl->tag_set); if (start_queues) - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); } /* @@ -2517,13 +2517,13 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) /* * clean up the admin queue. Same thing as above. */ - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); if (start_queues) - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); } static void @@ -3105,7 +3105,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments << (ilog2(SZ_4K) - 9); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); ret = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) @@ -3251,10 +3251,10 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) nvme_fc_free_queue(&ctrl->queues[0]); /* re-enable the admin_q so anything new can fast fail */ - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); /* resume the io queues so that things will fast fail */ - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); nvme_fc_ctlr_inactive_on_rport(ctrl); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ef23c6c6e2a3..b3a1c595d144 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -752,10 +752,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl); void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, volatile union nvme_result *res); -void nvme_stop_queues(struct nvme_ctrl *ctrl); -void nvme_start_queues(struct nvme_ctrl *ctrl); -void nvme_stop_admin_queue(struct nvme_ctrl *ctrl); -void nvme_start_admin_queue(struct nvme_ctrl *ctrl); +void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl); +void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl); +void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl); +void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl); void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_sync_io_queues(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d394498d96de..bd5fcdc9211c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1481,7 +1481,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) nvmeq->dev->online_queues--; if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) - nvme_stop_admin_queue(&nvmeq->dev->ctrl); + nvme_quiesce_admin_queue(&nvmeq->dev->ctrl); if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags)) pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); return 0; @@ -1741,7 +1741,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) * user requests may be waiting on a stopped queue. Start the * queue to flush these to completion. */ - nvme_start_admin_queue(&dev->ctrl); + nvme_unquiesce_admin_queue(&dev->ctrl); blk_mq_destroy_queue(dev->ctrl.admin_q); blk_put_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); @@ -2703,7 +2703,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) if (!dead && shutdown && freeze) nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); - nvme_stop_queues(&dev->ctrl); + nvme_quiesce_io_queues(&dev->ctrl); if (!dead && dev->ctrl.queue_count > 0) { nvme_disable_io_queues(dev); @@ -2723,9 +2723,9 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * deadlocking blk-mq hot-cpu notifier. */ if (shutdown) { - nvme_start_queues(&dev->ctrl); + nvme_unquiesce_io_queues(&dev->ctrl); if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) - nvme_start_admin_queue(&dev->ctrl); + nvme_unquiesce_admin_queue(&dev->ctrl); } mutex_unlock(&dev->shutdown_lock); } @@ -2822,7 +2822,7 @@ static void nvme_reset_work(struct work_struct *work) result = nvme_pci_enable(dev); if (result) goto out_unlock; - nvme_start_admin_queue(&dev->ctrl); + nvme_unquiesce_admin_queue(&dev->ctrl); mutex_unlock(&dev->shutdown_lock); /* @@ -2856,7 +2856,7 @@ static void nvme_reset_work(struct work_struct *work) * controller around but remove all namespaces. */ if (dev->online_queues > 1) { - nvme_start_queues(&dev->ctrl); + nvme_unquiesce_io_queues(&dev->ctrl); nvme_wait_freeze(&dev->ctrl); nvme_pci_update_nr_queues(dev); nvme_dbbuf_set(dev); @@ -2864,7 +2864,7 @@ static void nvme_reset_work(struct work_struct *work) } else { dev_warn(dev->ctrl.device, "IO queues lost\n"); nvme_mark_namespaces_dead(&dev->ctrl); - nvme_start_queues(&dev->ctrl); + nvme_unquiesce_io_queues(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); nvme_free_tagset(dev); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 3d78278e47c5..de591cdf78f3 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -869,7 +869,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, else ctrl->ctrl.max_integrity_segments = 0; - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); error = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (error) @@ -878,7 +878,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, return 0; out_quiesce_queue: - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); out_stop_queue: nvme_rdma_stop_queue(&ctrl->queues[0]); @@ -922,7 +922,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) goto out_cleanup_tagset; if (!new) { - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) { /* * If we timed out waiting for freeze we are likely to @@ -949,7 +949,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) return 0; out_wait_freeze_timed_out: - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); out_cleanup_tagset: @@ -964,12 +964,12 @@ out_free_io_queues: static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); if (remove) { - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); nvme_remove_admin_tag_set(&ctrl->ctrl); } nvme_rdma_destroy_admin_queue(ctrl); @@ -980,12 +980,12 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, { if (ctrl->ctrl.queue_count > 1) { nvme_start_freeze(&ctrl->ctrl); - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); nvme_cancel_tagset(&ctrl->ctrl); if (remove) { - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); nvme_remove_io_tag_set(&ctrl->ctrl); } nvme_rdma_free_io_queues(ctrl); @@ -1106,7 +1106,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) destroy_io: if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); nvme_cancel_tagset(&ctrl->ctrl); @@ -1115,7 +1115,7 @@ destroy_io: nvme_rdma_free_io_queues(ctrl); } destroy_admin: - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); @@ -1156,9 +1156,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) nvme_stop_keep_alive(&ctrl->ctrl); flush_work(&ctrl->ctrl.async_event_work); nvme_rdma_teardown_io_queues(ctrl, false); - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); nvme_auth_stop(&ctrl->ctrl); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { @@ -2207,7 +2207,7 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { nvme_rdma_teardown_io_queues(ctrl, shutdown); - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); else diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 3fedddf0aedc..776b8d9dfca7 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1875,7 +1875,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) goto out_cleanup_connect_q; if (!new) { - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) { /* * If we timed out waiting for freeze we are likely to @@ -1902,7 +1902,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) return 0; out_wait_freeze_timed_out: - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); out_cleanup_connect_q: @@ -1947,7 +1947,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (error) goto out_stop_queue; - nvme_start_admin_queue(ctrl); + nvme_unquiesce_admin_queue(ctrl); error = nvme_init_ctrl_finish(ctrl, false); if (error) @@ -1956,7 +1956,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) return 0; out_quiesce_queue: - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); out_stop_queue: nvme_tcp_stop_queue(ctrl, 0); @@ -1972,12 +1972,12 @@ out_free_queue: static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove) { - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); if (remove) - nvme_start_admin_queue(ctrl); + nvme_unquiesce_admin_queue(ctrl); nvme_tcp_destroy_admin_queue(ctrl, remove); } @@ -1986,14 +1986,14 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); nvme_start_freeze(ctrl); - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); if (remove) - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); nvme_tcp_destroy_io_queues(ctrl, remove); } @@ -2074,14 +2074,14 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) destroy_io: if (ctrl->queue_count > 1) { - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); nvme_tcp_destroy_io_queues(ctrl, new); } destroy_admin: - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); @@ -2123,9 +2123,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) flush_work(&ctrl->async_event_work); nvme_tcp_teardown_io_queues(ctrl, false); /* unquiesce to fail fast pending requests */ - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); - nvme_start_admin_queue(ctrl); + nvme_unquiesce_admin_queue(ctrl); nvme_auth_stop(ctrl); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { @@ -2141,7 +2141,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) { nvme_tcp_teardown_io_queues(ctrl, shutdown); - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); if (shutdown) nvme_shutdown_ctrl(ctrl); else diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 893c50f365c4..4173099ef9a4 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -375,7 +375,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); error = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (error) @@ -394,12 +394,12 @@ out_free_sq: static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) { if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_cancel_tagset(&ctrl->ctrl); nvme_loop_destroy_io_queues(ctrl); } - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); if (ctrl->ctrl.state == NVME_CTRL_LIVE) nvme_shutdown_ctrl(&ctrl->ctrl); -- cgit v1.2.3 From 23855abdc4be03e17564f665b5d0029ef27abf7b Mon Sep 17 00:00:00 2001 From: Aleksandr Miloserdov Date: Tue, 15 Nov 2022 14:58:09 +0300 Subject: nvmet: expose IEEE OUI to configfs Allow user to set OUI for the controller vendor. Reviewed-by: Konstantin Shelekhin Reviewed-by: Dmitriy Bogdanov Signed-off-by: Aleksandr Miloserdov Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 7 ++---- drivers/nvme/target/configfs.c | 49 +++++++++++++++++++++++++++++++++++++++++ drivers/nvme/target/core.c | 2 ++ drivers/nvme/target/nvmet.h | 1 + 4 files changed, 54 insertions(+), 5 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index c8a061ce3ee5..48a2f587f38a 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -372,6 +372,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) memcpy_and_pad(id->fr, sizeof(id->fr), UTS_RELEASE, strlen(UTS_RELEASE), ' '); + put_unaligned_le24(subsys->ieee_oui, id->ieee); + id->rab = 6; if (nvmet_is_disc_subsys(ctrl->subsys)) @@ -379,11 +381,6 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) else id->cntrltype = NVME_CTRL_IO; - /* - * XXX: figure out how we can assign a IEEE OUI, but until then - * the safest is to leave it as zeroes. - */ - /* we support multiple ports, multiples hosts and ANA: */ id->cmic = NVME_CTRL_CMIC_MULTI_PORT | NVME_CTRL_CMIC_MULTI_CTRL | NVME_CTRL_CMIC_ANA; diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 051a420d818e..02797170dd91 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1259,6 +1259,54 @@ static ssize_t nvmet_subsys_attr_model_store(struct config_item *item, } CONFIGFS_ATTR(nvmet_subsys_, attr_model); +static ssize_t nvmet_subsys_attr_ieee_oui_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + return sysfs_emit(page, "0x%06x\n", subsys->ieee_oui); +} + +static ssize_t nvmet_subsys_attr_ieee_oui_store_locked(struct nvmet_subsys *subsys, + const char *page, size_t count) +{ + uint32_t val = 0; + int ret; + + if (subsys->subsys_discovered) { + pr_err("Can't set IEEE OUI. 0x%06x is already assigned\n", + subsys->ieee_oui); + return -EINVAL; + } + + ret = kstrtou32(page, 0, &val); + if (ret < 0) + return ret; + + if (val >= 0x1000000) + return -EINVAL; + + subsys->ieee_oui = val; + + return count; +} + +static ssize_t nvmet_subsys_attr_ieee_oui_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + ssize_t ret; + + down_write(&nvmet_config_sem); + mutex_lock(&subsys->lock); + ret = nvmet_subsys_attr_ieee_oui_store_locked(subsys, page, count); + mutex_unlock(&subsys->lock); + up_write(&nvmet_config_sem); + + return ret; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_ieee_oui); + #ifdef CONFIG_BLK_DEV_INTEGRITY static ssize_t nvmet_subsys_attr_pi_enable_show(struct config_item *item, char *page) @@ -1320,6 +1368,7 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_cntlid_max, &nvmet_subsys_attr_attr_model, &nvmet_subsys_attr_attr_qid_max, + &nvmet_subsys_attr_attr_ieee_oui, #ifdef CONFIG_BLK_DEV_INTEGRITY &nvmet_subsys_attr_attr_pi_enable, #endif diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 171493bcf0fb..006d8c94f5e1 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1561,6 +1561,8 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, goto free_subsys; } + subsys->ieee_oui = 0; + switch (type) { case NVME_NQN_NVME: subsys->max_qid = NVMET_NR_QUEUES; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index bda1c1f71f39..976e11cd8c01 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -263,6 +263,7 @@ struct nvmet_subsys { struct config_group allowed_hosts_group; char *model_number; + u32 ieee_oui; #ifdef CONFIG_NVME_TARGET_PASSTHRU struct nvme_ctrl *passthru_ctrl; -- cgit v1.2.3 From 68c5444c317208f5a114f671140373f47f0a2cf6 Mon Sep 17 00:00:00 2001 From: Aleksandr Miloserdov Date: Tue, 15 Nov 2022 14:58:10 +0300 Subject: nvmet: expose firmware revision to configfs Allow user to set currently active firmware revision Reviewed-by: Konstantin Shelekhin Reviewed-by: Dmitriy Bogdanov Signed-off-by: Aleksandr Miloserdov Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/configfs.c | 63 +++++++++++++++++++++++++++++++++++++++++ drivers/nvme/target/core.c | 15 ++++++++-- drivers/nvme/target/nvmet.h | 2 ++ 4 files changed, 79 insertions(+), 3 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 48a2f587f38a..6b46f90a63cf 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -370,7 +370,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) memcpy_and_pad(id->mn, sizeof(id->mn), subsys->model_number, strlen(subsys->model_number), ' '); memcpy_and_pad(id->fr, sizeof(id->fr), - UTS_RELEASE, strlen(UTS_RELEASE), ' '); + subsys->firmware_rev, strlen(subsys->firmware_rev), ' '); put_unaligned_le24(subsys->ieee_oui, id->ieee); diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 02797170dd91..d48deb9bdb27 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1307,6 +1307,68 @@ static ssize_t nvmet_subsys_attr_ieee_oui_store(struct config_item *item, } CONFIGFS_ATTR(nvmet_subsys_, attr_ieee_oui); +static ssize_t nvmet_subsys_attr_firmware_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + return sysfs_emit(page, "%s\n", subsys->firmware_rev); +} + +static ssize_t nvmet_subsys_attr_firmware_store_locked(struct nvmet_subsys *subsys, + const char *page, size_t count) +{ + int pos = 0, len; + char *val; + + if (subsys->subsys_discovered) { + pr_err("Can't set firmware revision. %s is already assigned\n", + subsys->firmware_rev); + return -EINVAL; + } + + len = strcspn(page, "\n"); + if (!len) + return -EINVAL; + + if (len > NVMET_FR_MAX_SIZE) { + pr_err("Firmware revision size can not exceed %d Bytes\n", + NVMET_FR_MAX_SIZE); + return -EINVAL; + } + + for (pos = 0; pos < len; pos++) { + if (!nvmet_is_ascii(page[pos])) + return -EINVAL; + } + + val = kmemdup_nul(page, len, GFP_KERNEL); + if (!val) + return -ENOMEM; + + kfree(subsys->firmware_rev); + + subsys->firmware_rev = val; + + return count; +} + +static ssize_t nvmet_subsys_attr_firmware_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + ssize_t ret; + + down_write(&nvmet_config_sem); + mutex_lock(&subsys->lock); + ret = nvmet_subsys_attr_firmware_store_locked(subsys, page, count); + mutex_unlock(&subsys->lock); + up_write(&nvmet_config_sem); + + return ret; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_firmware); + #ifdef CONFIG_BLK_DEV_INTEGRITY static ssize_t nvmet_subsys_attr_pi_enable_show(struct config_item *item, char *page) @@ -1369,6 +1431,7 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_model, &nvmet_subsys_attr_attr_qid_max, &nvmet_subsys_attr_attr_ieee_oui, + &nvmet_subsys_attr_attr_firmware, #ifdef CONFIG_BLK_DEV_INTEGRITY &nvmet_subsys_attr_attr_pi_enable, #endif diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 006d8c94f5e1..f66ed13d7c11 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -10,6 +10,8 @@ #include #include +#include + #define CREATE_TRACE_POINTS #include "trace.h" @@ -1563,6 +1565,12 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, subsys->ieee_oui = 0; + subsys->firmware_rev = kstrndup(UTS_RELEASE, NVMET_FR_MAX_SIZE, GFP_KERNEL); + if (!subsys->firmware_rev) { + ret = -ENOMEM; + goto free_mn; + } + switch (type) { case NVME_NQN_NVME: subsys->max_qid = NVMET_NR_QUEUES; @@ -1574,14 +1582,14 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, default: pr_err("%s: Unknown Subsystem type - %d\n", __func__, type); ret = -EINVAL; - goto free_mn; + goto free_fr; } subsys->type = type; subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE, GFP_KERNEL); if (!subsys->subsysnqn) { ret = -ENOMEM; - goto free_mn; + goto free_fr; } subsys->cntlid_min = NVME_CNTLID_MIN; subsys->cntlid_max = NVME_CNTLID_MAX; @@ -1594,6 +1602,8 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, return subsys; +free_fr: + kfree(subsys->firmware_rev); free_mn: kfree(subsys->model_number); free_subsys: @@ -1613,6 +1623,7 @@ static void nvmet_subsys_free(struct kref *ref) kfree(subsys->subsysnqn); kfree(subsys->model_number); + kfree(subsys->firmware_rev); kfree(subsys); } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 976e11cd8c01..89bedfcd974c 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -29,6 +29,7 @@ #define NVMET_DEFAULT_CTRL_MODEL "Linux" #define NVMET_MN_MAX_SIZE 40 #define NVMET_SN_MAX_SIZE 20 +#define NVMET_FR_MAX_SIZE 8 /* * Supported optional AENs: @@ -264,6 +265,7 @@ struct nvmet_subsys { char *model_number; u32 ieee_oui; + char *firmware_rev; #ifdef CONFIG_NVME_TARGET_PASSTHRU struct nvme_ctrl *passthru_ctrl; -- cgit v1.2.3 From ba0718a6d67130fd4bff664b86d2aa63fa49e241 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 08:14:46 +0100 Subject: nvme: don't call blk_mq_{,un}quiesce_tagset when ctrl->tagset is NULL The NVMe drivers support a mode where no tagset is allocated for the I/O queues and only the admin queue is usable. In that case ctrl->tagset is NULL and we must not call the block per-tagset quiesce helpers that dereference it. Fixes: 98d81f0df70c ("nvme: use blk_mq_[un]quiesce_tagset") Reported-by: Gerd Bayer Reported-by: Chao Leng Signed-off-by: Christoph Hellwig Reviewed-by: Chao Leng --- drivers/nvme/host/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3195ae17df30..fd2e26cb7a68 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5215,6 +5215,8 @@ EXPORT_SYMBOL_GPL(nvme_start_freeze); void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl) { + if (!ctrl->tagset) + return; if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags)) blk_mq_quiesce_tagset(ctrl->tagset); else @@ -5224,6 +5226,8 @@ EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues); void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl) { + if (!ctrl->tagset) + return; if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags)) blk_mq_unquiesce_tagset(ctrl->tagset); } -- cgit v1.2.3 From 99722c8aa8b994db15ef60965c33f6a8e399b36c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 24 Nov 2022 22:28:12 +0100 Subject: nvme: use kstrtobool() instead of strtobool() strtobool() is the same as kstrtobool(). However, the latter is more used within the kernel. In order to remove strtobool() and slightly simplify kstrtox.h, switch to the other function name. While at it, include the corresponding header file () Signed-off-by: Christophe JAILLET Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 3 ++- drivers/nvme/target/configfs.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bd5fcdc9211c..e0da4a6719a7 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -2184,7 +2185,7 @@ static ssize_t hmb_store(struct device *dev, struct device_attribute *attr, bool new; int ret; - if (strtobool(buf, &new) < 0) + if (kstrtobool(buf, &new) < 0) return -EINVAL; if (new == ndev->hmb) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index d48deb9bdb27..682d88e5e43a 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -4,6 +4,7 @@ * Copyright (c) 2015-2016 HGST, a Western Digital Company. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -267,7 +268,7 @@ static ssize_t nvmet_param_pi_enable_store(struct config_item *item, struct nvmet_port *port = to_nvmet_port(item); bool val; - if (strtobool(page, &val)) + if (kstrtobool(page, &val)) return -EINVAL; if (nvmet_is_port_enabled(port, __func__)) @@ -532,7 +533,7 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item, bool enable; int ret = 0; - if (strtobool(page, &enable)) + if (kstrtobool(page, &enable)) return -EINVAL; if (enable) @@ -556,7 +557,7 @@ static ssize_t nvmet_ns_buffered_io_store(struct config_item *item, struct nvmet_ns *ns = to_nvmet_ns(item); bool val; - if (strtobool(page, &val)) + if (kstrtobool(page, &val)) return -EINVAL; mutex_lock(&ns->subsys->lock); @@ -579,7 +580,7 @@ static ssize_t nvmet_ns_revalidate_size_store(struct config_item *item, struct nvmet_ns *ns = to_nvmet_ns(item); bool val; - if (strtobool(page, &val)) + if (kstrtobool(page, &val)) return -EINVAL; if (!val) @@ -728,7 +729,7 @@ static ssize_t nvmet_passthru_enable_store(struct config_item *item, bool enable; int ret = 0; - if (strtobool(page, &enable)) + if (kstrtobool(page, &enable)) return -EINVAL; if (enable) @@ -995,7 +996,7 @@ static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item, bool allow_any_host; int ret = 0; - if (strtobool(page, &allow_any_host)) + if (kstrtobool(page, &allow_any_host)) return -EINVAL; down_write(&nvmet_config_sem); @@ -1382,7 +1383,7 @@ static ssize_t nvmet_subsys_attr_pi_enable_store(struct config_item *item, struct nvmet_subsys *subsys = to_subsys(item); bool pi_enable; - if (strtobool(page, &pi_enable)) + if (kstrtobool(page, &pi_enable)) return -EINVAL; subsys->pi_support = pi_enable; @@ -1511,7 +1512,7 @@ static ssize_t nvmet_referral_enable_store(struct config_item *item, struct nvmet_port *port = to_nvmet_port(item); bool enable; - if (strtobool(page, &enable)) + if (kstrtobool(page, &enable)) goto inval; if (enable) -- cgit v1.2.3 From 6887fc6495f2dfd55e088c982e983815278ee453 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 3 Oct 2022 12:43:43 +0300 Subject: nvme: introduce nvme_start_request In preparation for nvme-multipath IO stats accounting, we want the accounting to happen in a centralized place. The request completion is already centralized, but we need a common helper to request I/O start. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Hannes Reinecke --- drivers/nvme/host/apple.c | 2 +- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/nvme.h | 5 +++++ drivers/nvme/host/pci.c | 2 +- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/host/tcp.c | 2 +- drivers/nvme/target/loop.c | 2 +- 7 files changed, 11 insertions(+), 6 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index cab69516af5b..94ef797e8b4a 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -763,7 +763,7 @@ static blk_status_t apple_nvme_queue_rq(struct blk_mq_hw_ctx *hctx, goto out_free_cmd; } - blk_mq_start_request(req); + nvme_start_request(req); apple_nvme_submit_cmd(q, cmnd); return BLK_STS_OK; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index aa5fb56c07d9..489f5e797204 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2733,7 +2733,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, atomic_set(&op->state, FCPOP_STATE_ACTIVE); if (!(op->flags & FCOP_FLAGS_AEN)) - blk_mq_start_request(op->rq); + nvme_start_request(op->rq); cmdiu->csn = cpu_to_be32(atomic_inc_return(&queue->csn)); ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport, diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b3a1c595d144..8522d6dc93e8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1012,6 +1012,11 @@ static inline void nvme_hwmon_exit(struct nvme_ctrl *ctrl) } #endif +static inline void nvme_start_request(struct request *rq) +{ + blk_mq_start_request(rq); +} + static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) { return ctrl->sgls & ((1 << 0) | (1 << 1)); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e0da4a6719a7..ac734c8f6640 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -907,7 +907,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) goto out_unmap_data; } - blk_mq_start_request(req); + nvme_start_request(req); return BLK_STS_OK; out_unmap_data: nvme_unmap_data(dev, req); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index de591cdf78f3..448abf8cdf1f 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2040,7 +2040,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, if (ret) goto unmap_qe; - blk_mq_start_request(rq); + nvme_start_request(rq); if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && queue->pi_support && diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 776b8d9dfca7..79789daddeac 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2405,7 +2405,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(ret)) return ret; - blk_mq_start_request(rq); + nvme_start_request(rq); nvme_tcp_queue_request(req, true, bd->last); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 4173099ef9a4..6d176621f46d 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -145,7 +145,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, if (ret) return ret; - blk_mq_start_request(req); + nvme_start_request(req); iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; iod->req.port = queue->ctrl->port; if (!nvmet_req_init(&iod->req, &queue->nvme_cq, -- cgit v1.2.3 From d4d957b53d91eebc8c681c480edfdc697e55231e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 29 Nov 2022 15:43:19 +0100 Subject: nvme-multipath: support io stats on the mpath device Our mpath stack device is just a shim that selects a bottom namespace and submits the bio to it without any fancy splitting. This also means that we don't clone the bio or have any context to the bio beyond submission. However it really sucks that we don't see the mpath device io stats. Given that the mpath device can't do that without adding some context to it, we let the bottom device do it on its behalf (somewhat similar to the approach taken in nvme_trace_bio_complete). When the IO starts, we account the request for multipath IO stats using REQ_NVME_MPATH_IO_STATS nvme_request flag to avoid queue io stats disable in the middle of the request. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/core.c | 2 ++ drivers/nvme/host/multipath.c | 26 ++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 14 ++++++++++++++ 3 files changed, 42 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fd2e26cb7a68..7961e146bbb1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -384,6 +384,8 @@ static inline void nvme_end_req(struct request *req) nvme_log_error(req); nvme_end_req_zoned(req); nvme_trace_bio_complete(req); + if (req->cmd_flags & REQ_NVME_MPATH) + nvme_mpath_end_request(req); blk_mq_end_request(req, status); } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 0ea7e441e080..898921ee6cb6 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -114,6 +114,31 @@ void nvme_failover_req(struct request *req) kblockd_schedule_work(&ns->head->requeue_work); } +void nvme_mpath_start_request(struct request *rq) +{ + struct nvme_ns *ns = rq->q->queuedata; + struct gendisk *disk = ns->head->disk; + + if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq)) + return; + + nvme_req(rq)->flags |= NVME_MPATH_IO_STATS; + nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, + blk_rq_bytes(rq) >> SECTOR_SHIFT, + req_op(rq), jiffies); +} +EXPORT_SYMBOL_GPL(nvme_mpath_start_request); + +void nvme_mpath_end_request(struct request *rq) +{ + struct nvme_ns *ns = rq->q->queuedata; + + if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS)) + return; + bdev_end_io_acct(ns->head->disk->part0, req_op(rq), + nvme_req(rq)->start_time); +} + void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; @@ -503,6 +528,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue); blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue); + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue); /* * This assumes all controllers that refer to a namespace either * support poll queues or not. That is not a strict guarantee, diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 8522d6dc93e8..e01fa9f696fa 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -162,6 +162,9 @@ struct nvme_request { u8 retries; u8 flags; u16 status; +#ifdef CONFIG_NVME_MULTIPATH + unsigned long start_time; +#endif struct nvme_ctrl *ctrl; }; @@ -173,6 +176,7 @@ struct nvme_request { enum { NVME_REQ_CANCELLED = (1 << 0), NVME_REQ_USERCMD = (1 << 1), + NVME_MPATH_IO_STATS = (1 << 2), }; static inline struct nvme_request *nvme_req(struct request *req) @@ -882,6 +886,8 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns); void nvme_mpath_revalidate_paths(struct nvme_ns *ns); void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); void nvme_mpath_shutdown_disk(struct nvme_ns_head *head); +void nvme_mpath_start_request(struct request *rq); +void nvme_mpath_end_request(struct request *rq); static inline void nvme_trace_bio_complete(struct request *req) { @@ -967,6 +973,12 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) static inline void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys) { } +static inline void nvme_mpath_start_request(struct request *rq) +{ +} +static inline void nvme_mpath_end_request(struct request *rq) +{ +} #endif /* CONFIG_NVME_MULTIPATH */ int nvme_revalidate_zones(struct nvme_ns *ns); @@ -1014,6 +1026,8 @@ static inline void nvme_hwmon_exit(struct nvme_ctrl *ctrl) static inline void nvme_start_request(struct request *rq) { + if (rq->cmd_flags & REQ_NVME_MPATH) + nvme_mpath_start_request(rq); blk_mq_start_request(rq); } -- cgit v1.2.3 From ea43fceea4171f29457f8d46543ec320b777c1c7 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 5 Dec 2022 15:27:46 +0100 Subject: nvme: allow unprivileged passthrough of Identify Controller Add unprivileged passthrough of the I/O Command Set Independent and I/O Command Set Specific Identify Controller sub-command. This will allow access to attributes (e.g. MDTS and WZSL) that are needed to effectively form passthrough I/O to the /dev/ng* character devices. Signed-off-by: Joel Granados Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 9550a69029b3..9ddda571f046 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -34,6 +34,8 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, case NVME_ID_CNS_NS: case NVME_ID_CNS_CS_NS: case NVME_ID_CNS_NS_CS_INDEP: + case NVME_ID_CNS_CS_CTRL: + case NVME_ID_CNS_CTRL: return true; } } -- cgit v1.2.3 From 6c90294d72a99e3ed51516124ba0984e28937d2d Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 30 Nov 2022 13:33:40 -0800 Subject: nvme-fc: avoid null pointer dereference Before using dynamically allcoated variable lsop in the nvme_fc_rcv_ls_req(), add a check for NULL and error out early. Signed-off-by: Chaitanya Kulkarni Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 489f5e797204..06bae7150087 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1754,9 +1754,18 @@ nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr, } lsop = kzalloc(sizeof(*lsop), GFP_KERNEL); + if (!lsop) { + dev_info(lport->dev, + "RCV %s LS failed: No memory\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? + nvmefc_ls_names[w0->ls_cmd] : ""); + ret = -ENOMEM; + goto out_put; + } + lsop->rqstbuf = kzalloc(sizeof(*lsop->rqstbuf), GFP_KERNEL); lsop->rspbuf = kzalloc(sizeof(*lsop->rspbuf), GFP_KERNEL); - if (!lsop || !lsop->rqstbuf || !lsop->rspbuf) { + if (!lsop->rqstbuf || !lsop->rspbuf) { dev_info(lport->dev, "RCV %s LS failed: No memory\n", (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? -- cgit v1.2.3 From b2969585572e5ddaa239d53ee68e2824df7a7736 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 30 Nov 2022 13:33:41 -0800 Subject: nvme-fc: move common code into helper Add a helper to move the duplicate code for error message from nvme_fc_rcv_ls_req() to nvme_fc_rcv_ls_req_err_msg(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 06bae7150087..42fa450187f8 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1701,6 +1701,15 @@ restart: spin_unlock_irqrestore(&rport->lock, flags); } +static +void nvme_fc_rcv_ls_req_err_msg(struct nvme_fc_lport *lport, + struct fcnvme_ls_rqst_w0 *w0) +{ + dev_info(lport->dev, "RCV %s LS failed: No memory\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? + nvmefc_ls_names[w0->ls_cmd] : ""); +} + /** * nvme_fc_rcv_ls_req - transport entry point called by an LLDD * upon the reception of a NVME LS request. @@ -1755,10 +1764,7 @@ nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr, lsop = kzalloc(sizeof(*lsop), GFP_KERNEL); if (!lsop) { - dev_info(lport->dev, - "RCV %s LS failed: No memory\n", - (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? - nvmefc_ls_names[w0->ls_cmd] : ""); + nvme_fc_rcv_ls_req_err_msg(lport, w0); ret = -ENOMEM; goto out_put; } @@ -1766,10 +1772,7 @@ nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr, lsop->rqstbuf = kzalloc(sizeof(*lsop->rqstbuf), GFP_KERNEL); lsop->rspbuf = kzalloc(sizeof(*lsop->rspbuf), GFP_KERNEL); if (!lsop->rqstbuf || !lsop->rspbuf) { - dev_info(lport->dev, - "RCV %s LS failed: No memory\n", - (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? - nvmefc_ls_names[w0->ls_cmd] : ""); + nvme_fc_rcv_ls_req_err_msg(lport, w0); ret = -ENOMEM; goto out_free; } -- cgit v1.2.3 From c76b8308e4c9148e44e0c7e086ab6d8b4bb10162 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 11:14:09 +0100 Subject: nvme-apple: fix controller shutdown in apple_nvme_disable nvme_shutdown_ctrl already shuts the controller down, there is no need to also call nvme_disable_ctrl for the shutdown case. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Eric Curtin Reviewed-by: Sagi Grimberg Reviewed-by: Hector Martin --- drivers/nvme/host/apple.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 94ef797e8b4a..56d9e9be945b 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -831,7 +831,8 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) if (shutdown) nvme_shutdown_ctrl(&anv->ctrl); - nvme_disable_ctrl(&anv->ctrl); + else + nvme_disable_ctrl(&anv->ctrl); } WRITE_ONCE(anv->ioq.enabled, false); -- cgit v1.2.3 From e6d275de2e4aaebdc97aa022c031a0dcc475efa4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 08:54:26 +0100 Subject: nvme: use nvme_wait_ready in nvme_shutdown_ctrl Refactor the code to wait for CSTS state changes so that it can be reused by nvme_shutdown_ctrl. This reduces the delay between each iteration that checks CSTS from 100ms in the shutdown code to the 1 to 2ms range done during enable, matching the changes from commit 3e98c2443f5c that were only applied to the enable/disable path. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Pankaj Raghav --- drivers/nvme/host/core.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7961e146bbb1..03b2e34dcf72 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2252,16 +2252,17 @@ static const struct block_device_operations nvme_bdev_ops = { .pr_ops = &nvme_pr_ops, }; -static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled) +static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val, + u32 timeout, const char *op) { - unsigned long timeout_jiffies = ((timeout + 1) * HZ / 2) + jiffies; - u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; + unsigned long timeout_jiffies = jiffies + timeout * HZ; + u32 csts; int ret; while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { if (csts == ~0) return -ENODEV; - if ((csts & NVME_CSTS_RDY) == bit) + if ((csts & mask) == val) break; usleep_range(1000, 2000); @@ -2270,7 +2271,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled) if (time_after(jiffies, timeout_jiffies)) { dev_err(ctrl->device, "Device not ready; aborting %s, CSTS=0x%x\n", - enabled ? "initialisation" : "reset", csts); + op, csts); return -ENODEV; } } @@ -2297,8 +2298,8 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl) if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) msleep(NVME_QUIRK_DELAY_AMOUNT); - - return nvme_wait_ready(ctrl, NVME_CAP_TIMEOUT(ctrl->cap), false); + return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0, + (NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset"); } EXPORT_SYMBOL_GPL(nvme_disable_ctrl); @@ -2363,14 +2364,13 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; - return nvme_wait_ready(ctrl, timeout, true); + return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY, + (timeout + 1) / 2, "initialisation"); } EXPORT_SYMBOL_GPL(nvme_enable_ctrl); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) { - unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ); - u32 csts; int ret; ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; @@ -2379,22 +2379,8 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; - - while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { - if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) - break; - - msleep(100); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(ctrl->device, - "Device shutdown incomplete; abort shutdown\n"); - return -ENODEV; - } - } - - return ret; + return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK, NVME_CSTS_SHST_CMPLT, + ctrl->shutdown_timeout, "shutdown"); } EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); -- cgit v1.2.3 From 285b6e9b571714270fa503d0e32f244d15b9f85f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 11:20:12 +0100 Subject: nvme: merge nvme_shutdown_ctrl into nvme_disable_ctrl Many of the callers decide which one to use based on a bool argument and there is at least some code to be shared, so merge these two. Also move a comment specific to a single callsite to that callsite. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Hector Martin --- drivers/nvme/host/apple.c | 5 +---- drivers/nvme/host/core.c | 33 ++++++++++----------------------- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/nvme.h | 3 +-- drivers/nvme/host/pci.c | 15 +++++++++------ drivers/nvme/host/rdma.c | 5 +---- drivers/nvme/host/tcp.c | 5 +---- drivers/nvme/target/loop.c | 2 +- 8 files changed, 25 insertions(+), 45 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 56d9e9be945b..e36aeb50b4ed 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -829,10 +829,7 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) apple_nvme_remove_cq(anv); } - if (shutdown) - nvme_shutdown_ctrl(&anv->ctrl); - else - nvme_disable_ctrl(&anv->ctrl); + nvme_disable_ctrl(&anv->ctrl, shutdown); } WRITE_ONCE(anv->ioq.enabled, false); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 03b2e34dcf72..30717f7cfc94 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2279,23 +2279,25 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val, return ret; } -/* - * If the device has been passed off to us in an enabled state, just clear - * the enabled bit. The spec says we should set the 'shutdown notification - * bits', but doing so may cause the device to complete commands to the - * admin queue ... and we don't know what memory that might be pointing at! - */ -int nvme_disable_ctrl(struct nvme_ctrl *ctrl) +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown) { int ret; ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; - ctrl->ctrl_config &= ~NVME_CC_ENABLE; + if (shutdown) + ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; + else + ctrl->ctrl_config &= ~NVME_CC_ENABLE; ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; + if (shutdown) { + return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK, + NVME_CSTS_SHST_CMPLT, + ctrl->shutdown_timeout, "shutdown"); + } if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) msleep(NVME_QUIRK_DELAY_AMOUNT); return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0, @@ -2369,21 +2371,6 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_enable_ctrl); -int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) -{ - int ret; - - ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; - ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; - - ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); - if (ret) - return ret; - return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK, NVME_CSTS_SHST_CMPLT, - ctrl->shutdown_timeout, "shutdown"); -} -EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); - static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) { __le64 ts; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 42fa450187f8..bb89c7f7196a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2518,7 +2518,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) * Other transports, which don't have link-level contexts bound * to sqe's, would try to gracefully shutdown the controller by * writing the registers for shutdown and polling (call - * nvme_shutdown_ctrl()). Given a bunch of i/o was potentially + * nvme_disable_ctrl()). Given a bunch of i/o was potentially * just aborted and we will wait on those contexts, and given * there was no indication of how live the controlelr is on the * link, don't send more io to create more contexts for the diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index e01fa9f696fa..2cad9f6f2282 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -733,9 +733,8 @@ void nvme_cancel_tagset(struct nvme_ctrl *ctrl); void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); -int nvme_disable_ctrl(struct nvme_ctrl *ctrl); +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown); int nvme_enable_ctrl(struct nvme_ctrl *ctrl); -int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, const struct nvme_ctrl_ops *ops, unsigned long quirks); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ac734c8f6640..84226dce9b3b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1500,11 +1500,7 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { struct nvme_queue *nvmeq = &dev->queues[0]; - if (shutdown) - nvme_shutdown_ctrl(&dev->ctrl); - else - nvme_disable_ctrl(&dev->ctrl); - + nvme_disable_ctrl(&dev->ctrl, shutdown); nvme_poll_irqdisable(nvmeq); } @@ -1819,7 +1815,14 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); - result = nvme_disable_ctrl(&dev->ctrl); + /* + * If the device has been passed off to us in an enabled state, just + * clear the enabled bit. The spec says we should set the 'shutdown + * notification bits', but doing so may cause the device to complete + * commands to the admin queue ... and we don't know what memory that + * might be pointing at! + */ + result = nvme_disable_ctrl(&dev->ctrl, false); if (result < 0) return result; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 448abf8cdf1f..cc61a1b8311b 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2208,10 +2208,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { nvme_rdma_teardown_io_queues(ctrl, shutdown); nvme_quiesce_admin_queue(&ctrl->ctrl); - if (shutdown) - nvme_shutdown_ctrl(&ctrl->ctrl); - else - nvme_disable_ctrl(&ctrl->ctrl); + nvme_disable_ctrl(&ctrl->ctrl, shutdown); nvme_rdma_teardown_admin_queue(ctrl, shutdown); } diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 79789daddeac..95e54e9c1bb1 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2142,10 +2142,7 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) { nvme_tcp_teardown_io_queues(ctrl, shutdown); nvme_quiesce_admin_queue(ctrl); - if (shutdown) - nvme_shutdown_ctrl(ctrl); - else - nvme_disable_ctrl(ctrl); + nvme_disable_ctrl(ctrl, shutdown); nvme_tcp_teardown_admin_queue(ctrl, shutdown); } diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 6d176621f46d..0015aed5c169 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -401,7 +401,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) nvme_quiesce_admin_queue(&ctrl->ctrl); if (ctrl->ctrl.state == NVME_CTRL_LIVE) - nvme_shutdown_ctrl(&ctrl->ctrl); + nvme_disable_ctrl(&ctrl->ctrl, true); nvme_cancel_admin_tagset(&ctrl->ctrl); nvme_loop_destroy_admin_queue(ctrl); -- cgit v1.2.3 From 47d42d229a181e52b32fa9c80aaa36cf8b6b46fc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 08:59:55 +0100 Subject: nvme-pci: remove nvme_disable_admin_queue nvme_disable_admin_queue has only a single caller, and just calls two other funtions, so remove it to clean up the remove path a little more. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Eric Curtin Reviewed-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 84226dce9b3b..c6a02210d22b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1496,14 +1496,6 @@ static void nvme_suspend_io_queues(struct nvme_dev *dev) nvme_suspend_queue(&dev->queues[i]); } -static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) -{ - struct nvme_queue *nvmeq = &dev->queues[0]; - - nvme_disable_ctrl(&dev->ctrl, shutdown); - nvme_poll_irqdisable(nvmeq); -} - /* * Called only on a device that has been disabled and after all other threads * that can check this device's completion queues have synced, except @@ -2711,7 +2703,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) if (!dead && dev->ctrl.queue_count > 0) { nvme_disable_io_queues(dev); - nvme_disable_admin_queue(dev, shutdown); + nvme_disable_ctrl(&dev->ctrl, shutdown); + nvme_poll_irqdisable(&dev->queues[0]); } nvme_suspend_io_queues(dev); nvme_suspend_queue(&dev->queues[0]); -- cgit v1.2.3 From c80767f770ed722264688f65c89e420971eb897a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 09:01:30 +0100 Subject: nvme-pci: remove nvme_pci_disable nvme_pci_disable has a single caller, fold it into that. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Eric Curtin Reviewed-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c6a02210d22b..c3d9b23699d3 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2657,18 +2657,6 @@ static void nvme_dev_unmap(struct nvme_dev *dev) pci_release_mem_regions(to_pci_dev(dev->dev)); } -static void nvme_pci_disable(struct nvme_dev *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev->dev); - - pci_free_irq_vectors(pdev); - - if (pci_is_enabled(pdev)) { - pci_disable_pcie_error_reporting(pdev); - pci_disable_device(pdev); - } -} - static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { bool dead = true, freeze = false; @@ -2708,7 +2696,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) } nvme_suspend_io_queues(dev); nvme_suspend_queue(&dev->queues[0]); - nvme_pci_disable(dev); + pci_free_irq_vectors(pdev); + if (pci_is_enabled(pdev)) { + pci_disable_pcie_error_reporting(pdev); + pci_disable_device(pdev); + } nvme_reap_pending_cqes(dev); nvme_cancel_tagset(&dev->ctrl); -- cgit v1.2.3 From 10981f23a1b865290a5e21ad43a0f93710271cc6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Nov 2022 10:51:05 +0100 Subject: nvme-pci: cleanup nvme_suspend_queue Remove the unused returne value, pass a dev + qid instead of the queue as that is better for the callers as well as the function itself, and remove the entirely pointless kerneldoc comment. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c3d9b23699d3..2c8dd0f7ef46 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1468,14 +1468,12 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) } } -/** - * nvme_suspend_queue - put queue into suspended state - * @nvmeq: queue to suspend - */ -static int nvme_suspend_queue(struct nvme_queue *nvmeq) +static void nvme_suspend_queue(struct nvme_dev *dev, unsigned int qid) { + struct nvme_queue *nvmeq = &dev->queues[qid]; + if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags)) - return 1; + return; /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */ mb(); @@ -1484,8 +1482,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) nvme_quiesce_admin_queue(&nvmeq->dev->ctrl); if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags)) - pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); - return 0; + pci_free_irq(to_pci_dev(dev->dev), nvmeq->cq_vector, nvmeq); } static void nvme_suspend_io_queues(struct nvme_dev *dev) @@ -1493,7 +1490,7 @@ static void nvme_suspend_io_queues(struct nvme_dev *dev) int i; for (i = dev->ctrl.queue_count - 1; i > 0; i--) - nvme_suspend_queue(&dev->queues[i]); + nvme_suspend_queue(dev, i); } /* @@ -2695,7 +2692,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_poll_irqdisable(&dev->queues[0]); } nvme_suspend_io_queues(dev); - nvme_suspend_queue(&dev->queues[0]); + nvme_suspend_queue(dev, 0); pci_free_irq_vectors(pdev); if (pci_is_enabled(pdev)) { pci_disable_pcie_error_reporting(pdev); -- cgit v1.2.3 From 7d879c90ae6cf58f4a4caad8562d700b95bc1dbe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 09:07:48 +0100 Subject: nvme-pci: rename nvme_disable_io_queues This function really deletes the I/O queues, so rename it to match the functionality. Also move the main wrapper right next to the actual underlying implementation. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2c8dd0f7ef46..1b527377e351 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -109,7 +109,7 @@ struct nvme_dev; struct nvme_queue; static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); -static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode); +static void nvme_delete_io_queues(struct nvme_dev *dev); /* * Represents an NVM Express device. Each nvme_dev is a PCI function. @@ -2310,12 +2310,6 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); } -static void nvme_disable_io_queues(struct nvme_dev *dev) -{ - if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq)) - __nvme_disable_io_queues(dev, nvme_admin_delete_cq); -} - static unsigned int nvme_max_io_queues(struct nvme_dev *dev) { /* @@ -2423,7 +2417,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (dev->online_queues - 1 < dev->max_qid) { nr_io_queues = dev->online_queues - 1; - nvme_disable_io_queues(dev); + nvme_delete_io_queues(dev); result = nvme_setup_io_queues_trylock(dev); if (result) return result; @@ -2487,7 +2481,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) return 0; } -static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) +static bool __nvme_delete_io_queues(struct nvme_dev *dev, u8 opcode) { int nr_queues = dev->online_queues - 1, sent = 0; unsigned long timeout; @@ -2515,6 +2509,12 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) return true; } +static void nvme_delete_io_queues(struct nvme_dev *dev) +{ + if (__nvme_delete_io_queues(dev, nvme_admin_delete_sq)) + __nvme_delete_io_queues(dev, nvme_admin_delete_cq); +} + static void nvme_pci_alloc_tag_set(struct nvme_dev *dev) { struct blk_mq_tag_set * set = &dev->tagset; @@ -2687,7 +2687,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_quiesce_io_queues(&dev->ctrl); if (!dead && dev->ctrl.queue_count > 0) { - nvme_disable_io_queues(dev); + nvme_delete_io_queues(dev); nvme_disable_ctrl(&dev->ctrl, shutdown); nvme_poll_irqdisable(&dev->queues[0]); } -- cgit v1.2.3 From 8cb9f10b7151e308824cdcf3168010d673e7888c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 22 Nov 2022 08:48:13 +0100 Subject: nvme-pci: return early on ctrl state mismatch in nvme_reset_work The only way nvme_reset_work could be called when not in resetting state is if a reset and remove happen near the same time. This should not happen, but if it did we don't want the reset work to disable the controller because the remove is already doing that. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1b527377e351..02940b4f42b1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2792,8 +2792,7 @@ static void nvme_reset_work(struct work_struct *work) if (dev->ctrl.state != NVME_CTRL_RESETTING) { dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n", dev->ctrl.state); - result = -ENODEV; - goto out; + return; } /* -- cgit v1.2.3 From 68e81eba6763cd834aa8ff9ac0cd6174fa69fa39 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 09:09:48 +0100 Subject: nvme-pci: split out a nvme_pci_ctrl_is_dead helper Clean up nvme_dev_disable by splitting the logic to detect if a controller is dead into a separate helper. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 02940b4f42b1..d613b4292c0f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2654,36 +2654,39 @@ static void nvme_dev_unmap(struct nvme_dev *dev) pci_release_mem_regions(to_pci_dev(dev->dev)); } -static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) +static bool nvme_pci_ctrl_is_dead(struct nvme_dev *dev) { - bool dead = true, freeze = false; struct pci_dev *pdev = to_pci_dev(dev->dev); + u32 csts; - mutex_lock(&dev->shutdown_lock); - if (pci_is_enabled(pdev)) { - u32 csts; + if (!pci_is_enabled(pdev) || !pci_device_is_present(pdev)) + return true; + if (pdev->error_state != pci_channel_io_normal) + return true; - if (pci_device_is_present(pdev)) - csts = readl(dev->bar + NVME_REG_CSTS); - else - csts = ~0; + csts = readl(dev->bar + NVME_REG_CSTS); + return (csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY); +} + +static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + bool dead; - if (dev->ctrl.state == NVME_CTRL_LIVE || - dev->ctrl.state == NVME_CTRL_RESETTING) { - freeze = true; + mutex_lock(&dev->shutdown_lock); + dead = nvme_pci_ctrl_is_dead(dev); + if (dev->ctrl.state == NVME_CTRL_LIVE || + dev->ctrl.state == NVME_CTRL_RESETTING) { + if (pci_is_enabled(pdev)) nvme_start_freeze(&dev->ctrl); - } - dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || - pdev->error_state != pci_channel_io_normal); + /* + * Give the controller a chance to complete all entered requests + * if doing a safe shutdown. + */ + if (!dead && shutdown) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); } - /* - * Give the controller a chance to complete all entered requests if - * doing a safe shutdown. - */ - if (!dead && shutdown && freeze) - nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); - nvme_quiesce_io_queues(&dev->ctrl); if (!dead && dev->ctrl.queue_count > 0) { -- cgit v1.2.3 From dcef77274ae52136925287b6b59d5c6e6a4adfb9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Nov 2022 17:16:52 +0100 Subject: nvme: pass nr_maps explicitly to nvme_alloc_io_tag_set Don't look at ctrl->ops as only RDMA and TCP actually support multiple maps. Fixes: 6dfba1c09c10 ("nvme-fc: use the tagset alloc/free helpers") Fixes: ceee1953f923 ("nvme-loop: use the tagset alloc/free helpers") Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 5 ++--- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/rdma.c | 4 +++- drivers/nvme/host/tcp.c | 1 + drivers/nvme/target/loop.c | 2 +- 6 files changed, 9 insertions(+), 7 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 30717f7cfc94..3b369900928f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4891,7 +4891,7 @@ EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set); int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int flags, - unsigned int cmd_size) + unsigned int nr_maps, unsigned int cmd_size) { int ret; @@ -4905,8 +4905,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, set->driver_data = ctrl; set->nr_hw_queues = ctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; - if (ops->map_queues) - set->nr_maps = ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; + set->nr_maps = nr_maps; ret = blk_mq_alloc_tag_set(set); if (ret) return ret; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index bb89c7f7196a..1a4e009ca542 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2916,7 +2916,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) nvme_fc_init_io_queues(ctrl); ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set, - &nvme_fc_mq_ops, BLK_MQ_F_SHOULD_MERGE, + &nvme_fc_mq_ops, BLK_MQ_F_SHOULD_MERGE, 1, struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, ctrl->lport->ops->fcprqst_priv_sz)); if (ret) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2cad9f6f2282..6c4565435fd9 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -747,7 +747,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl); int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int flags, - unsigned int cmd_size); + unsigned int nr_maps, unsigned int cmd_size); void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index cc61a1b8311b..cf8f500405b1 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -798,7 +798,9 @@ static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *ctrl) NVME_RDMA_METADATA_SGL_SIZE; return nvme_alloc_io_tag_set(ctrl, &to_rdma_ctrl(ctrl)->tag_set, - &nvme_rdma_mq_ops, BLK_MQ_F_SHOULD_MERGE, cmd_size); + &nvme_rdma_mq_ops, BLK_MQ_F_SHOULD_MERGE, + ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2, + cmd_size); } static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 95e54e9c1bb1..fa245a50f630 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1859,6 +1859,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set, &nvme_tcp_mq_ops, BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING, + ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2, sizeof(struct nvme_tcp_request)); if (ret) goto out_free_io_queues; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 0015aed5c169..da32727e6232 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -494,7 +494,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) return ret; ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set, - &nvme_loop_mq_ops, BLK_MQ_F_SHOULD_MERGE, + &nvme_loop_mq_ops, BLK_MQ_F_SHOULD_MERGE, 1, sizeof(struct nvme_loop_iod) + NVME_INLINE_SG_CNT * sizeof(struct scatterlist)); if (ret) -- cgit v1.2.3 From db45e1a5ddccc034eb60d62fc5352022d7963ae2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Nov 2022 17:19:50 +0100 Subject: nvme: consolidate setting the tagset flags All nvme transports should be using the same flags for their tagsets, with the exception for the blocking flag that should only be set for transports that can block in ->queue_rq. Add a NVME_F_BLOCKING flag to nvme_ctrl_ops to control the blocking behavior and lift setting the flags into nvme_alloc_{admin,io}_tag_set. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 15 +++++++++------ drivers/nvme/host/fc.c | 4 ++-- drivers/nvme/host/nvme.h | 9 +++++---- drivers/nvme/host/rdma.c | 3 +-- drivers/nvme/host/tcp.c | 5 ++--- drivers/nvme/target/loop.c | 4 ++-- 6 files changed, 21 insertions(+), 19 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3b369900928f..f31586c46893 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4831,8 +4831,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, EXPORT_SYMBOL_GPL(nvme_complete_async_event); int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, unsigned int flags, - unsigned int cmd_size) + const struct blk_mq_ops *ops, unsigned int cmd_size) { int ret; @@ -4842,7 +4841,9 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, if (ctrl->ops->flags & NVME_F_FABRICS) set->reserved_tags = NVMF_RESERVED_TAGS; set->numa_node = ctrl->numa_node; - set->flags = flags; + set->flags = BLK_MQ_F_NO_SCHED; + if (ctrl->ops->flags & NVME_F_BLOCKING) + set->flags |= BLK_MQ_F_BLOCKING; set->cmd_size = cmd_size; set->driver_data = ctrl; set->nr_hw_queues = 1; @@ -4890,8 +4891,8 @@ void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl) EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set); int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, unsigned int flags, - unsigned int nr_maps, unsigned int cmd_size) + const struct blk_mq_ops *ops, unsigned int nr_maps, + unsigned int cmd_size) { int ret; @@ -4900,7 +4901,9 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, set->queue_depth = ctrl->sqsize + 1; set->reserved_tags = NVMF_RESERVED_TAGS; set->numa_node = ctrl->numa_node; - set->flags = flags; + set->flags = BLK_MQ_F_SHOULD_MERGE; + if (ctrl->ops->flags & NVME_F_BLOCKING) + set->flags |= BLK_MQ_F_BLOCKING; set->cmd_size = cmd_size, set->driver_data = ctrl; set->nr_hw_queues = ctrl->queue_count - 1; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 1a4e009ca542..4564f16a0b20 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2916,7 +2916,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) nvme_fc_init_io_queues(ctrl); ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set, - &nvme_fc_mq_ops, BLK_MQ_F_SHOULD_MERGE, 1, + &nvme_fc_mq_ops, 1, struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, ctrl->lport->ops->fcprqst_priv_sz)); if (ret) @@ -3522,7 +3522,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, nvme_fc_init_queue(ctrl, 0); ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, - &nvme_fc_admin_mq_ops, BLK_MQ_F_NO_SCHED, + &nvme_fc_admin_mq_ops, struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, ctrl->lport->ops->fcprqst_priv_sz)); if (ret) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6c4565435fd9..6bbb73ef8b25 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -512,6 +512,8 @@ struct nvme_ctrl_ops { unsigned int flags; #define NVME_F_FABRICS (1 << 0) #define NVME_F_METADATA_SUPPORTED (1 << 1) +#define NVME_F_BLOCKING (1 << 2) + const struct attribute_group **dev_attr_groups; int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); @@ -742,12 +744,11 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl); void nvme_stop_ctrl(struct nvme_ctrl *ctrl); int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended); int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, unsigned int flags, - unsigned int cmd_size); + const struct blk_mq_ops *ops, unsigned int cmd_size); void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl); int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, unsigned int flags, - unsigned int nr_maps, unsigned int cmd_size); + const struct blk_mq_ops *ops, unsigned int nr_maps, + unsigned int cmd_size); void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index cf8f500405b1..bbad26b82b56 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -798,7 +798,7 @@ static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *ctrl) NVME_RDMA_METADATA_SGL_SIZE; return nvme_alloc_io_tag_set(ctrl, &to_rdma_ctrl(ctrl)->tag_set, - &nvme_rdma_mq_ops, BLK_MQ_F_SHOULD_MERGE, + &nvme_rdma_mq_ops, ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2, cmd_size); } @@ -848,7 +848,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, if (new) { error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, &nvme_rdma_admin_mq_ops, - BLK_MQ_F_NO_SCHED, sizeof(struct nvme_rdma_request) + NVME_RDMA_DATA_SGL_SIZE); if (error) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index fa245a50f630..f07b1fccaf19 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1858,7 +1858,6 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) if (new) { ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set, &nvme_tcp_mq_ops, - BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING, ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2, sizeof(struct nvme_tcp_request)); if (ret) @@ -1934,7 +1933,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (new) { error = nvme_alloc_admin_tag_set(ctrl, &to_tcp_ctrl(ctrl)->admin_tag_set, - &nvme_tcp_admin_mq_ops, BLK_MQ_F_BLOCKING, + &nvme_tcp_admin_mq_ops, sizeof(struct nvme_tcp_request)); if (error) goto out_free_queue; @@ -2512,7 +2511,7 @@ static const struct blk_mq_ops nvme_tcp_admin_mq_ops = { static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { .name = "tcp", .module = THIS_MODULE, - .flags = NVME_F_FABRICS, + .flags = NVME_F_FABRICS | NVME_F_BLOCKING, .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index da32727e6232..f2d24b2d992f 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -353,7 +353,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) ctrl->ctrl.queue_count = 1; error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, - &nvme_loop_admin_mq_ops, BLK_MQ_F_NO_SCHED, + &nvme_loop_admin_mq_ops, sizeof(struct nvme_loop_iod) + NVME_INLINE_SG_CNT * sizeof(struct scatterlist)); if (error) @@ -494,7 +494,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) return ret; ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set, - &nvme_loop_mq_ops, BLK_MQ_F_SHOULD_MERGE, 1, + &nvme_loop_mq_ops, 1, sizeof(struct nvme_loop_iod) + NVME_INLINE_SG_CNT * sizeof(struct scatterlist)); if (ret) -- cgit v1.2.3 From b794d1c2ad6d7921f2867ce393815ad31b5b5a83 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Nov 2022 17:27:07 +0100 Subject: nvme: only set reserved_tags in nvme_alloc_io_tag_set for fabrics controllers The reserved_tags are only needed for fabrics controllers. Right now only fabrics drivers call this helper, so this is harmless, but we'll use it in the PCIe driver soon. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f31586c46893..ed163c539767 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4899,7 +4899,8 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, memset(set, 0, sizeof(*set)); set->ops = ops; set->queue_depth = ctrl->sqsize + 1; - set->reserved_tags = NVMF_RESERVED_TAGS; + if (ctrl->ops->flags & NVME_F_FABRICS) + set->reserved_tags = NVMF_RESERVED_TAGS; set->numa_node = ctrl->numa_node; set->flags = BLK_MQ_F_SHOULD_MERGE; if (ctrl->ops->flags & NVME_F_BLOCKING) -- cgit v1.2.3 From 93b24f579c392bac2e491fee79ad5ce5a131992e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Nov 2022 17:28:48 +0100 Subject: nvme: add the Apple shared tag workaround to nvme_alloc_io_tag_set Add the apple shared tag workaround to nvme_alloc_io_tag_set to prepare for using that helper in the PCIe driver. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ed163c539767..9854f5dbcf4a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4899,7 +4899,13 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, memset(set, 0, sizeof(*set)); set->ops = ops; set->queue_depth = ctrl->sqsize + 1; - if (ctrl->ops->flags & NVME_F_FABRICS) + /* + * Some Apple controllers requires tags to be unique across admin and + * the (only) I/O queue, so reserve the first 32 tags of the I/O queue. + */ + if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS) + set->reserved_tags = NVME_AQ_DEPTH; + else if (ctrl->ops->flags & NVME_F_FABRICS) set->reserved_tags = NVMF_RESERVED_TAGS; set->numa_node = ctrl->numa_node; set->flags = BLK_MQ_F_SHOULD_MERGE; -- cgit v1.2.3 From 0da7feaa5913090a2b97177ed3d7fa07b2d1d99c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Nov 2022 17:26:25 +0100 Subject: nvme-pci: use the tagset alloc/free helpers Use the common helpers to allocate and free the tagsets. To make this work the generic nvme_ctrl now needs to be stored in the hctx private data instead of the nvme_dev. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/pci.c | 89 ++++++++++--------------------------------------- 1 file changed, 18 insertions(+), 71 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d613b4292c0f..f005e8569266 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -398,7 +398,7 @@ static int nvme_pci_npages_sgl(void) static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_dev *dev = data; + struct nvme_dev *dev = to_nvme_dev(data); struct nvme_queue *nvmeq = &dev->queues[0]; WARN_ON(hctx_idx != 0); @@ -411,7 +411,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_dev *dev = data; + struct nvme_dev *dev = to_nvme_dev(data); struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); @@ -423,7 +423,7 @@ static int nvme_pci_init_request(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx, unsigned int numa_node) { - struct nvme_dev *dev = set->driver_data; + struct nvme_dev *dev = to_nvme_dev(set->driver_data); struct nvme_iod *iod = blk_mq_rq_to_pdu(req); nvme_req(req)->ctrl = &dev->ctrl; @@ -442,7 +442,7 @@ static int queue_irq_offset(struct nvme_dev *dev) static void nvme_pci_map_queues(struct blk_mq_tag_set *set) { - struct nvme_dev *dev = set->driver_data; + struct nvme_dev *dev = to_nvme_dev(set->driver_data); int i, qoff, offset; offset = queue_irq_offset(dev); @@ -1728,39 +1728,10 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) * queue to flush these to completion. */ nvme_unquiesce_admin_queue(&dev->ctrl); - blk_mq_destroy_queue(dev->ctrl.admin_q); - blk_put_queue(dev->ctrl.admin_q); - blk_mq_free_tag_set(&dev->admin_tagset); + nvme_remove_admin_tag_set(&dev->ctrl); } } -static int nvme_pci_alloc_admin_tag_set(struct nvme_dev *dev) -{ - struct blk_mq_tag_set *set = &dev->admin_tagset; - - set->ops = &nvme_mq_admin_ops; - set->nr_hw_queues = 1; - - set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; - set->timeout = NVME_ADMIN_TIMEOUT; - set->numa_node = dev->ctrl.numa_node; - set->cmd_size = sizeof(struct nvme_iod); - set->flags = BLK_MQ_F_NO_SCHED; - set->driver_data = dev; - - if (blk_mq_alloc_tag_set(set)) - return -ENOMEM; - dev->ctrl.admin_tagset = set; - - dev->ctrl.admin_q = blk_mq_init_queue(set); - if (IS_ERR(dev->ctrl.admin_q)) { - blk_mq_free_tag_set(set); - dev->ctrl.admin_q = NULL; - return -ENOMEM; - } - return 0; -} - static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) { return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride); @@ -2515,40 +2486,13 @@ static void nvme_delete_io_queues(struct nvme_dev *dev) __nvme_delete_io_queues(dev, nvme_admin_delete_cq); } -static void nvme_pci_alloc_tag_set(struct nvme_dev *dev) +static unsigned int nvme_pci_nr_maps(struct nvme_dev *dev) { - struct blk_mq_tag_set * set = &dev->tagset; - int ret; - - set->ops = &nvme_mq_ops; - set->nr_hw_queues = dev->online_queues - 1; - set->nr_maps = 1; - if (dev->io_queues[HCTX_TYPE_READ]) - set->nr_maps = 2; if (dev->io_queues[HCTX_TYPE_POLL]) - set->nr_maps = 3; - set->timeout = NVME_IO_TIMEOUT; - set->numa_node = dev->ctrl.numa_node; - set->queue_depth = min_t(unsigned, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; - set->cmd_size = sizeof(struct nvme_iod); - set->flags = BLK_MQ_F_SHOULD_MERGE; - set->driver_data = dev; - - /* - * Some Apple controllers requires tags to be unique - * across admin and IO queue, so reserve the first 32 - * tags of the IO queue. - */ - if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) - set->reserved_tags = NVME_AQ_DEPTH; - - ret = blk_mq_alloc_tag_set(set); - if (ret) { - dev_warn(dev->ctrl.device, - "IO queues tagset allocation failed %d\n", ret); - return; - } - dev->ctrl.tagset = set; + return 3; + if (dev->io_queues[HCTX_TYPE_READ]) + return 2; + return 1; } static void nvme_pci_update_nr_queues(struct nvme_dev *dev) @@ -2770,7 +2714,7 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) static void nvme_free_tagset(struct nvme_dev *dev) { if (dev->tagset.tags) - blk_mq_free_tag_set(&dev->tagset); + nvme_remove_io_tag_set(&dev->ctrl); dev->ctrl.tagset = NULL; } @@ -3101,7 +3045,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto out_release_iod_mempool; - result = nvme_pci_alloc_admin_tag_set(dev); + result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset, + &nvme_mq_admin_ops, sizeof(struct nvme_iod)); if (result) goto out_disable; @@ -3131,12 +3076,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto out_disable; if (dev->online_queues > 1) { - nvme_pci_alloc_tag_set(dev); + nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops, + nvme_pci_nr_maps(dev), sizeof(struct nvme_iod)); nvme_dbbuf_set(dev); - } else { - dev_warn(dev->ctrl.device, "IO queues not created\n"); } + if (!dev->ctrl.tagset) + dev_warn(dev->ctrl.device, "IO queues not created\n"); + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { dev_warn(dev->ctrl.device, "failed to mark controller live state\n"); -- cgit v1.2.3 From 19b00e0069a3819eac0e0ea685899aba29e545d6 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 7 Dec 2022 13:28:23 +0200 Subject: nvmet: don't open-code NVME_NS_ATTR_RO enumeration It is already there, just go ahead and use it. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 6b46f90a63cf..53a004ea320c 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -561,7 +561,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) } if (req->ns->readonly) - id->nsattr |= (1 << 0); + id->nsattr |= NVME_NS_ATTR_RO; done: if (!status) status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); -- cgit v1.2.3