From a6ce7d7b4adaebc27ee7e78e5ecc378a1cfc221d Mon Sep 17 00:00:00 2001 From: Ziye Yang Date: Sat, 22 Aug 2020 00:48:10 +0800 Subject: nvmet-tcp: Fix NULL dereference when a connect data comes in h2cdata pdu When handling commands without in-capsule data, we assign the ttag assuming we already have the queue commands array allocated (based on the queue size information in the connect data payload). However if the connect itself did not send the connect data in-capsule we have yet to allocate the queue commands,and we will assign a bogus ttag and suffer a NULL dereference when we receive the corresponding h2cdata pdu. Fix this by checking if we already allocated commands before dereferencing it when handling h2cdata, if we didn't, its for sure a connect and we should use the preallocated connect command. Signed-off-by: Ziye Yang Signed-off-by: Sagi Grimberg --- drivers/nvme/target/tcp.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 9eda91162fe4..8e0d766d2722 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -160,6 +160,11 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd); static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, struct nvmet_tcp_cmd *cmd) { + if (unlikely(!queue->nr_cmds)) { + /* We didn't allocate cmds yet, send 0xffff */ + return USHRT_MAX; + } + return cmd - queue->cmds; } @@ -866,7 +871,10 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) struct nvme_tcp_data_pdu *data = &queue->pdu.data; struct nvmet_tcp_cmd *cmd; - cmd = &queue->cmds[data->ttag]; + if (likely(queue->nr_cmds)) + cmd = &queue->cmds[data->ttag]; + else + cmd = &queue->connect; if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) { pr_err("ttag %u unexpected data offset %u (expected %u)\n", -- cgit v1.2.3 From d7144f5c4cf4de95fdc3422943cf51c06aeaf7a7 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 14 Aug 2020 11:46:51 -0700 Subject: nvme-fabrics: don't check state NVME_CTRL_NEW for request acceptance NVME_CTRL_NEW should never see any I/O, because in order to start initialization it has to transition to NVME_CTRL_CONNECTING and from there it will never return to this state. Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fabrics.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 4ec4829d6233..32f61fc5f4c5 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -576,7 +576,6 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, * which is require to set the queue live in the appropinquate states. */ switch (ctrl->state) { - case NVME_CTRL_NEW: case NVME_CTRL_CONNECTING: if (nvme_is_fabrics(req->cmd) && req->cmd->fabrics.fctype == nvme_fabrics_type_connect) -- cgit v1.2.3 From 7cf0d7c0f3c3b0203aaf81c1bc884924d8fdb9bd Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 30 Jul 2020 13:24:45 -0700 Subject: nvme: have nvme_wait_freeze_timeout return if it timed out Users can detect if the wait has completed or not and take appropriate actions based on this information (e.g. weather to continue initialization or rather fail and schedule another initialization attempt). Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/nvme.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c9826ecf80e2..537dcd900cb5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4534,7 +4534,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_unfreeze); -void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) +int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) { struct nvme_ns *ns; @@ -4545,6 +4545,7 @@ void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) break; } up_read(&ctrl->namespaces_rwsem); + return timeout; } EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index e9cf29449dd1..2910f6caab7d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -605,7 +605,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_unfreeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze(struct nvme_ctrl *ctrl); -void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); +int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 -- cgit v1.2.3 From d4d61470ae48838f49e668503e840e1520b97162 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 5 Aug 2020 18:13:48 -0700 Subject: nvme-tcp: serialize controller teardown sequences In the timeout handler we may need to complete a request because the request that timed out may be an I/O that is a part of a serial sequence of controller teardown or initialization. In order to complete the request, we need to fence any other context that may compete with us and complete the request that is timing out. In this case, we could have a potential double completion in case a hard-irq or a different competing context triggered error recovery and is running inflight request cancellation concurrently with the timeout handler. Protect using a ctrl teardown_lock to serialize contexts that may complete a cancelled request due to error recovery or a reset. Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 3be4749dbf11..d58a6e2a4ab1 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -124,6 +124,7 @@ struct nvme_tcp_ctrl { struct sockaddr_storage src_addr; struct nvme_ctrl ctrl; + struct mutex teardown_lock; struct work_struct err_work; struct delayed_work connect_work; struct nvme_tcp_request async_req; @@ -1527,7 +1528,6 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) return; - __nvme_tcp_stop_queue(queue); } @@ -1875,6 +1875,7 @@ out_free_queue: static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove) { + mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); blk_mq_quiesce_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); if (ctrl->admin_tagset) { @@ -1885,13 +1886,16 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, if (remove) blk_mq_unquiesce_queue(ctrl->admin_q); nvme_tcp_destroy_admin_queue(ctrl, remove); + mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); } static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, bool remove) { + mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); if (ctrl->queue_count <= 1) - return; + goto out; + blk_mq_quiesce_queue(ctrl->admin_q); nvme_start_freeze(ctrl); nvme_stop_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); @@ -1903,6 +1907,8 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, if (remove) nvme_start_queues(ctrl); nvme_tcp_destroy_io_queues(ctrl, remove); +out: + mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); } static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) @@ -2423,6 +2429,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, nvme_tcp_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work); + mutex_init(&ctrl->teardown_lock); if (!(opts->mask & NVMF_OPT_TRSVCID)) { opts->trsvcid = -- cgit v1.2.3 From 236187c4ed195161dfa4237c7beffbba0c5ae45b Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 28 Jul 2020 13:16:36 -0700 Subject: nvme-tcp: fix timeout handler When a request times out in a LIVE state, we simply trigger error recovery and let the error recovery handle the request cancellation, however when a request times out in a non LIVE state, we make sure to complete it immediately as it might block controller setup or teardown and prevent forward progress. However tearing down the entire set of I/O and admin queues causes freeze/unfreeze imbalance (q->mq_freeze_depth) because and is really an overkill to what we actually need, which is to just fence controller teardown that may be running, stop the queue, and cancel the request if it is not already completed. Now that we have the controller teardown_lock, we can safely serialize request cancellation. This addresses a hang caused by calling extra queue freeze on controller namespaces, causing unfreeze to not complete correctly. Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 56 +++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 20 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index d58a6e2a4ab1..dbb661b899ef 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -465,6 +465,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) return; + dev_warn(ctrl->device, "starting error recovery\n"); queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work); } @@ -2155,40 +2156,55 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) nvme_tcp_queue_request(&ctrl->async_req, true, true); } +static void nvme_tcp_complete_timed_out(struct request *rq) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; + + /* fence other contexts that may complete the command */ + mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); + nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue)); + if (!blk_mq_request_completed(rq)) { + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; + blk_mq_complete_request(rq); + } + mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); +} + static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq, bool reserved) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_tcp_ctrl *ctrl = req->queue->ctrl; + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; struct nvme_tcp_cmd_pdu *pdu = req->pdu; - /* - * Restart the timer if a controller reset is already scheduled. Any - * timed out commands would be handled before entering the connecting - * state. - */ - if (ctrl->ctrl.state == NVME_CTRL_RESETTING) - return BLK_EH_RESET_TIMER; - - dev_warn(ctrl->ctrl.device, + dev_warn(ctrl->device, "queue %d: timeout request %#x type %d\n", nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type); - if (ctrl->ctrl.state != NVME_CTRL_LIVE) { + if (ctrl->state != NVME_CTRL_LIVE) { /* - * Teardown immediately if controller times out while starting - * or we are already started error recovery. all outstanding - * requests are completed on shutdown, so we return BLK_EH_DONE. + * If we are resetting, connecting or deleting we should + * complete immediately because we may block controller + * teardown or setup sequence + * - ctrl disable/shutdown fabrics requests + * - connect requests + * - initialization admin requests + * - I/O requests that entered after unquiescing and + * the controller stopped responding + * + * All other requests should be cancelled by the error + * recovery work, so it's fine that we fail it here. */ - flush_work(&ctrl->err_work); - nvme_tcp_teardown_io_queues(&ctrl->ctrl, false); - nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false); + nvme_tcp_complete_timed_out(rq); return BLK_EH_DONE; } - dev_warn(ctrl->ctrl.device, "starting error recovery\n"); - nvme_tcp_error_recovery(&ctrl->ctrl); - + /* + * LIVE state should trigger the normal error recovery which will + * handle completing this request. + */ + nvme_tcp_error_recovery(ctrl); return BLK_EH_RESET_TIMER; } -- cgit v1.2.3 From e5c01f4f7f623e768e868bcc08d8e7ceb03b75d0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 30 Jul 2020 13:25:34 -0700 Subject: nvme-tcp: fix reset hang if controller died in the middle of a reset If the controller becomes unresponsive in the middle of a reset, we will hang because we are waiting for the freeze to complete, but that cannot happen since we have commands that are inflight holding the q_usage_counter, and we can't blindly fail requests that times out. So give a timeout and if we cannot wait for queue freeze before unfreezing, fail and have the error handling take care how to proceed (either schedule a reconnect of remove the controller). Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index dbb661b899ef..873d6afcbc9e 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1783,7 +1783,15 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) if (!new) { nvme_start_queues(ctrl); - nvme_wait_freeze(ctrl); + if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) { + /* + * If we timed out waiting for freeze we are likely to + * be stuck. Fail the controller initialization just + * to be safe. + */ + ret = -ENODEV; + goto out_wait_freeze_timed_out; + } blk_mq_update_nr_hw_queues(ctrl->tagset, ctrl->queue_count - 1); nvme_unfreeze(ctrl); @@ -1791,6 +1799,9 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) return 0; +out_wait_freeze_timed_out: + nvme_stop_queues(ctrl); + nvme_tcp_stop_io_queues(ctrl); out_cleanup_connect_q: if (new) blk_cleanup_queue(ctrl->connect_q); -- cgit v1.2.3 From 5110f40241d08334375eb0495f174b1d2c07657e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 5 Aug 2020 18:13:58 -0700 Subject: nvme-rdma: serialize controller teardown sequences In the timeout handler we may need to complete a request because the request that timed out may be an I/O that is a part of a serial sequence of controller teardown or initialization. In order to complete the request, we need to fence any other context that may compete with us and complete the request that is timing out. In this case, we could have a potential double completion in case a hard-irq or a different competing context triggered error recovery and is running inflight request cancellation concurrently with the timeout handler. Protect using a ctrl teardown_lock to serialize contexts that may complete a cancelled request due to error recovery or a reset. Reviewed-by: Christoph Hellwig Reviewed-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 4c5660201009..ed387f61f8f7 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -122,6 +122,7 @@ struct nvme_rdma_ctrl { struct sockaddr_storage src_addr; struct nvme_ctrl ctrl; + struct mutex teardown_lock; bool use_inline_data; u32 io_queues[HCTX_MAX_TYPES]; }; @@ -997,6 +998,7 @@ out_free_io_queues: static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { + mutex_lock(&ctrl->teardown_lock); blk_mq_quiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); if (ctrl->ctrl.admin_tagset) { @@ -1007,11 +1009,13 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, if (remove) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_destroy_admin_queue(ctrl, remove); + mutex_unlock(&ctrl->teardown_lock); } static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, bool remove) { + mutex_lock(&ctrl->teardown_lock); if (ctrl->ctrl.queue_count > 1) { nvme_start_freeze(&ctrl->ctrl); nvme_stop_queues(&ctrl->ctrl); @@ -1025,6 +1029,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, nvme_start_queues(&ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, remove); } + mutex_unlock(&ctrl->teardown_lock); } static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) @@ -2278,6 +2283,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, return ERR_PTR(-ENOMEM); ctrl->ctrl.opts = opts; INIT_LIST_HEAD(&ctrl->list); + mutex_init(&ctrl->teardown_lock); if (!(opts->mask & NVMF_OPT_TRSVCID)) { opts->trsvcid = -- cgit v1.2.3 From 0475a8dcbcee92a5d22e40c9c6353829fc6294b8 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 29 Jul 2020 02:36:03 -0700 Subject: nvme-rdma: fix timeout handler When a request times out in a LIVE state, we simply trigger error recovery and let the error recovery handle the request cancellation, however when a request times out in a non LIVE state, we make sure to complete it immediately as it might block controller setup or teardown and prevent forward progress. However tearing down the entire set of I/O and admin queues causes freeze/unfreeze imbalance (q->mq_freeze_depth) because and is really an overkill to what we actually need, which is to just fence controller teardown that may be running, stop the queue, and cancel the request if it is not already completed. Now that we have the controller teardown_lock, we can safely serialize request cancellation. This addresses a hang caused by calling extra queue freeze on controller namespaces, causing unfreeze to not complete correctly. Reviewed-by: Christoph Hellwig Reviewed-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 49 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ed387f61f8f7..cb8731f4b316 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1185,6 +1185,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) return; + dev_warn(ctrl->ctrl.device, "starting error recovery\n"); queue_work(nvme_reset_wq, &ctrl->err_work); } @@ -1951,6 +1952,22 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, return 0; } +static void nvme_rdma_complete_timed_out(struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + + /* fence other contexts that may complete the command */ + mutex_lock(&ctrl->teardown_lock); + nvme_rdma_stop_queue(queue); + if (!blk_mq_request_completed(rq)) { + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; + blk_mq_complete_request(rq); + } + mutex_unlock(&ctrl->teardown_lock); +} + static enum blk_eh_timer_return nvme_rdma_timeout(struct request *rq, bool reserved) { @@ -1961,29 +1978,29 @@ nvme_rdma_timeout(struct request *rq, bool reserved) dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n", rq->tag, nvme_rdma_queue_idx(queue)); - /* - * Restart the timer if a controller reset is already scheduled. Any - * timed out commands would be handled before entering the connecting - * state. - */ - if (ctrl->ctrl.state == NVME_CTRL_RESETTING) - return BLK_EH_RESET_TIMER; - if (ctrl->ctrl.state != NVME_CTRL_LIVE) { /* - * Teardown immediately if controller times out while starting - * or we are already started error recovery. all outstanding - * requests are completed on shutdown, so we return BLK_EH_DONE. + * If we are resetting, connecting or deleting we should + * complete immediately because we may block controller + * teardown or setup sequence + * - ctrl disable/shutdown fabrics requests + * - connect requests + * - initialization admin requests + * - I/O requests that entered after unquiescing and + * the controller stopped responding + * + * All other requests should be cancelled by the error + * recovery work, so it's fine that we fail it here. */ - flush_work(&ctrl->err_work); - nvme_rdma_teardown_io_queues(ctrl, false); - nvme_rdma_teardown_admin_queue(ctrl, false); + nvme_rdma_complete_timed_out(rq); return BLK_EH_DONE; } - dev_warn(ctrl->ctrl.device, "starting error recovery\n"); + /* + * LIVE state should trigger the normal error recovery which will + * handle completing this request. + */ nvme_rdma_error_recovery(ctrl); - return BLK_EH_RESET_TIMER; } -- cgit v1.2.3 From 2362acb6785611eda795bfc12e1ea6b202ecf62c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 30 Jul 2020 13:42:42 -0700 Subject: nvme-rdma: fix reset hang if controller died in the middle of a reset If the controller becomes unresponsive in the middle of a reset, we will hang because we are waiting for the freeze to complete, but that cannot happen since we have commands that are inflight holding the q_usage_counter, and we can't blindly fail requests that times out. So give a timeout and if we cannot wait for queue freeze before unfreezing, fail and have the error handling take care how to proceed (either schedule a reconnect of remove the controller). Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index cb8731f4b316..1d8ea5305d60 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -976,7 +976,15 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) if (!new) { nvme_start_queues(&ctrl->ctrl); - nvme_wait_freeze(&ctrl->ctrl); + if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) { + /* + * If we timed out waiting for freeze we are likely to + * be stuck. Fail the controller initialization just + * to be safe. + */ + ret = -ENODEV; + goto out_wait_freeze_timed_out; + } blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset, ctrl->ctrl.queue_count - 1); nvme_unfreeze(&ctrl->ctrl); @@ -984,6 +992,9 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) return 0; +out_wait_freeze_timed_out: + nvme_stop_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); out_cleanup_connect_q: if (new) blk_cleanup_queue(ctrl->ctrl.connect_q); -- cgit v1.2.3 From 7cd49f7576b0c61d6c4a2114cda08cc4d5ce0028 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 24 Aug 2020 15:47:25 -0700 Subject: nvme: Fix NULL dereference for pci nvme controllers PCIe controllers do not have fabric opts, verify they exist before showing ctrl_loss_tmo or reconnect_delay attributes. Fixes: 764075fdcb2f ("nvme: expose reconnect_delay and ctrl_loss_tmo via sysfs") Reported-by: Tobias Markus Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 537dcd900cb5..e406c3cf55bc 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3676,6 +3676,10 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return 0; if (a == &dev_attr_hostid.attr && !ctrl->opts) return 0; + if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts) + return 0; + if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts) + return 0; return a->mode; } -- cgit v1.2.3 From 70e37988db94aba607d5491a94f80ba08e399b6b Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 21 Aug 2020 09:58:19 +0200 Subject: nvmet-fc: Fix a missed _irqsave version of spin_lock in 'nvmet_fc_fod_op_done()' The way 'spin_lock()' and 'spin_lock_irqsave()' are used is not consistent in this function. Use 'spin_lock_irqsave()' also here, as there is no guarantee that interruptions are disabled at that point, according to surrounding code. Fixes: a97ec51b37ef ("nvmet_fc: Rework target side abort handling") Signed-off-by: Christophe JAILLET Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/target/fc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 55bafd56166a..e6861cc10e7d 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -2342,9 +2342,9 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) return; if (fcpreq->fcp_error || fcpreq->transferred_length != fcpreq->transfer_length) { - spin_lock(&fod->flock); + spin_lock_irqsave(&fod->flock, flags); fod->abort = true; - spin_unlock(&fod->flock); + spin_unlock_irqrestore(&fod->flock, flags); nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); return; -- cgit v1.2.3 From 192f6c29bb28bfd0a17e6ad331d09f1ec84143d0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 26 Aug 2020 10:53:04 -0700 Subject: nvme: fix controller instance leak If the driver has to unbind from the controller for an early failure before the subsystem has been set up, there won't be a subsystem holding the controller's instance, so the controller needs to free its own instance in this case. Fixes: 733e4b69d508d ("nvme: Assign subsys instance from first ctrl") Signed-off-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e406c3cf55bc..d6186208abf9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4394,7 +4394,7 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_subsystem *subsys = ctrl->subsys; struct nvme_cel *cel, *next; - if (subsys && ctrl->instance != subsys->instance) + if (!subsys || ctrl->instance != subsys->instance) ida_simple_remove(&nvme_instance_ida, ctrl->instance); list_for_each_entry_safe(cel, next, &ctrl->cels, entry) { -- cgit v1.2.3 From e83d776f9f98b4af18d67f05f9d1f3042dbe62c7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Aug 2020 10:38:57 -0700 Subject: nvme: only use power of two io boundaries The kernel requires a power of two for boundaries because that's the only way it can efficiently split commands that cross them. A controller, however, may report a non-power of two boundary. The driver had been rounding the controller's value to one the kernel can use, but splitting on the wrong boundary provides no benefit on the device side, and incurs additional submission overhead from non-optimal splits. Don't provide any boundary hint if the controller's value can't be used and log a warning when first scanning a disk's unreported IO boundary. Since the chunk sector logic has grown, move it to a separate function. Cc: Martin K. Petersen Signed-off-by: Keith Busch Reviewed-by: Martin K. Petersen Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 47 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d6186208abf9..5702a3843746 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2026,13 +2026,49 @@ static void nvme_update_disk_info(struct gendisk *disk, blk_mq_unfreeze_queue(disk->queue); } +static inline bool nvme_first_scan(struct gendisk *disk) +{ + /* nvme_alloc_ns() scans the disk prior to adding it */ + return !(disk->flags & GENHD_FL_UP); +} + +static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + u32 iob; + + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && + is_power_of_2(ctrl->max_hw_sectors)) + iob = ctrl->max_hw_sectors; + else + iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); + + if (!iob) + return; + + if (!is_power_of_2(iob)) { + if (nvme_first_scan(ns->disk)) + pr_warn("%s: ignoring unaligned IO boundary:%u\n", + ns->disk->disk_name, iob); + return; + } + + if (blk_queue_is_zoned(ns->disk->queue)) { + if (nvme_first_scan(ns->disk)) + pr_warn("%s: ignoring zoned namespace IO boundary\n", + ns->disk->disk_name); + return; + } + + blk_queue_chunk_sectors(ns->queue, iob); +} + static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; int ret; - u32 iob; /* * If identify namespace failed, use default 512 byte block size so @@ -2060,12 +2096,6 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) return -ENODEV; } - if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && - is_power_of_2(ctrl->max_hw_sectors)) - iob = ctrl->max_hw_sectors; - else - iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); - ns->features = 0; ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); /* the PI implementation requires metadata equal t10 pi tuple size */ @@ -2097,8 +2127,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) } } - if (iob && !blk_queue_is_zoned(ns->queue)) - blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob)); + nvme_set_chunk_sectors(ns, id); nvme_update_disk_info(disk, ns, id); #ifdef CONFIG_NVME_MULTIPATH if (ns->head->disk) { -- cgit v1.2.3 From 7ad92f656bddff4cf8f641e0e3b1acd4eb9644cb Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Fri, 28 Aug 2020 10:17:08 -0400 Subject: nvme-pci: cancel nvme device request before disabling This patch addresses an irq free warning and null pointer dereference error problem when nvme devices got timeout error during initialization. This problem happens when nvme_timeout() function is called while nvme_reset_work() is still in execution. This patch fixed the problem by setting flag of the problematic request to NVME_REQ_CANCELLED before calling nvme_dev_disable() to make sure __nvme_submit_sync_cmd() returns an error code and let nvme_submit_sync_cmd() fail gracefully. The following is console output. [ 62.472097] nvme nvme0: I/O 13 QID 0 timeout, disable controller [ 62.488796] nvme nvme0: could not set timestamp (881) [ 62.494888] ------------[ cut here ]------------ [ 62.495142] Trying to free already-free IRQ 11 [ 62.495366] WARNING: CPU: 0 PID: 7 at kernel/irq/manage.c:1751 free_irq+0x1f7/0x370 [ 62.495742] Modules linked in: [ 62.495902] CPU: 0 PID: 7 Comm: kworker/u4:0 Not tainted 5.8.0+ #8 [ 62.496206] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-48-gd9c812dda519-p4 [ 62.496772] Workqueue: nvme-reset-wq nvme_reset_work [ 62.497019] RIP: 0010:free_irq+0x1f7/0x370 [ 62.497223] Code: e8 ce 49 11 00 48 83 c4 08 4c 89 e0 5b 5d 41 5c 41 5d 41 5e 41 5f c3 44 89 f6 48 c70 [ 62.498133] RSP: 0000:ffffa96800043d40 EFLAGS: 00010086 [ 62.498391] RAX: 0000000000000000 RBX: ffff9b87fc458400 RCX: 0000000000000000 [ 62.498741] RDX: 0000000000000001 RSI: 0000000000000096 RDI: ffffffff9693d72c [ 62.499091] RBP: ffff9b87fd4c8f60 R08: ffffa96800043bfd R09: 0000000000000163 [ 62.499440] R10: ffffa96800043bf8 R11: ffffa96800043bfd R12: ffff9b87fd4c8e00 [ 62.499790] R13: ffff9b87fd4c8ea4 R14: 000000000000000b R15: ffff9b87fd76b000 [ 62.500140] FS: 0000000000000000(0000) GS:ffff9b87fdc00000(0000) knlGS:0000000000000000 [ 62.500534] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 62.500816] CR2: 0000000000000000 CR3: 000000003aa0a000 CR4: 00000000000006f0 [ 62.501165] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 62.501515] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 62.501864] Call Trace: [ 62.501993] pci_free_irq+0x13/0x20 [ 62.502167] nvme_reset_work+0x5d0/0x12a0 [ 62.502369] ? update_load_avg+0x59/0x580 [ 62.502569] ? ttwu_queue_wakelist+0xa8/0xc0 [ 62.502780] ? try_to_wake_up+0x1a2/0x450 [ 62.502979] process_one_work+0x1d2/0x390 [ 62.503179] worker_thread+0x45/0x3b0 [ 62.503361] ? process_one_work+0x390/0x390 [ 62.503568] kthread+0xf9/0x130 [ 62.503726] ? kthread_park+0x80/0x80 [ 62.503911] ret_from_fork+0x22/0x30 [ 62.504090] ---[ end trace de9ed4a70f8d71e2 ]--- [ 123.912275] nvme nvme0: I/O 12 QID 0 timeout, disable controller [ 123.914670] nvme nvme0: 1/0/0 default/read/poll queues [ 123.916310] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 123.917469] #PF: supervisor write access in kernel mode [ 123.917725] #PF: error_code(0x0002) - not-present page [ 123.917976] PGD 0 P4D 0 [ 123.918109] Oops: 0002 [#1] SMP PTI [ 123.918283] CPU: 0 PID: 7 Comm: kworker/u4:0 Tainted: G W 5.8.0+ #8 [ 123.918650] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-48-gd9c812dda519-p4 [ 123.919219] Workqueue: nvme-reset-wq nvme_reset_work [ 123.919469] RIP: 0010:__blk_mq_alloc_map_and_request+0x21/0x80 [ 123.919757] Code: 66 0f 1f 84 00 00 00 00 00 41 55 41 54 55 48 63 ee 53 48 8b 47 68 89 ee 48 89 fb 8b4 [ 123.920657] RSP: 0000:ffffa96800043d40 EFLAGS: 00010286 [ 123.920912] RAX: ffff9b87fc4fee40 RBX: ffff9b87fc8cb008 RCX: 0000000000000000 [ 123.921258] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff9b87fc618000 [ 123.921602] RBP: 0000000000000000 R08: ffff9b87fdc2c4a0 R09: ffff9b87fc616000 [ 123.921949] R10: 0000000000000000 R11: ffff9b87fffd1500 R12: 0000000000000000 [ 123.922295] R13: 0000000000000000 R14: ffff9b87fc8cb200 R15: ffff9b87fc8cb000 [ 123.922641] FS: 0000000000000000(0000) GS:ffff9b87fdc00000(0000) knlGS:0000000000000000 [ 123.923032] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 123.923312] CR2: 0000000000000000 CR3: 000000003aa0a000 CR4: 00000000000006f0 [ 123.923660] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 123.924007] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 123.924353] Call Trace: [ 123.924479] blk_mq_alloc_tag_set+0x137/0x2a0 [ 123.924694] nvme_reset_work+0xed6/0x12a0 [ 123.924898] process_one_work+0x1d2/0x390 [ 123.925099] worker_thread+0x45/0x3b0 [ 123.925280] ? process_one_work+0x390/0x390 [ 123.925486] kthread+0xf9/0x130 [ 123.925642] ? kthread_park+0x80/0x80 [ 123.925825] ret_from_fork+0x22/0x30 [ 123.926004] Modules linked in: [ 123.926158] CR2: 0000000000000000 [ 123.926322] ---[ end trace de9ed4a70f8d71e3 ]--- [ 123.926549] RIP: 0010:__blk_mq_alloc_map_and_request+0x21/0x80 [ 123.926832] Code: 66 0f 1f 84 00 00 00 00 00 41 55 41 54 55 48 63 ee 53 48 8b 47 68 89 ee 48 89 fb 8b4 [ 123.927734] RSP: 0000:ffffa96800043d40 EFLAGS: 00010286 [ 123.927989] RAX: ffff9b87fc4fee40 RBX: ffff9b87fc8cb008 RCX: 0000000000000000 [ 123.928336] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff9b87fc618000 [ 123.928679] RBP: 0000000000000000 R08: ffff9b87fdc2c4a0 R09: ffff9b87fc616000 [ 123.929025] R10: 0000000000000000 R11: ffff9b87fffd1500 R12: 0000000000000000 [ 123.929370] R13: 0000000000000000 R14: ffff9b87fc8cb200 R15: ffff9b87fc8cb000 [ 123.929715] FS: 0000000000000000(0000) GS:ffff9b87fdc00000(0000) knlGS:0000000000000000 [ 123.930106] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 123.930384] CR2: 0000000000000000 CR3: 000000003aa0a000 CR4: 00000000000006f0 [ 123.930731] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 123.931077] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Co-developed-by: Keith Busch Signed-off-by: Tong Zhang Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b673fa4cf5ea..5e07d5628864 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1249,8 +1249,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) dev_warn_ratelimited(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); - nvme_dev_disable(dev, true); nvme_req(req)->flags |= NVME_REQ_CANCELLED; + nvme_dev_disable(dev, true); return BLK_EH_DONE; case NVME_CTRL_RESETTING: return BLK_EH_RESET_TIMER; @@ -1267,10 +1267,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); + nvme_req(req)->flags |= NVME_REQ_CANCELLED; nvme_dev_disable(dev, false); nvme_reset_ctrl(&dev->ctrl); - nvme_req(req)->flags |= NVME_REQ_CANCELLED; return BLK_EH_DONE; } -- cgit v1.2.3