From ad6a0a52e6de3d1161b7999c7903db906ba4cf79 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 31 Jan 2018 18:31:24 +0200 Subject: nvme: rename NVME_CTRL_RECONNECTING state to NVME_CTRL_CONNECTING In the pci transport, this state is used to mark the initialization process. It should be used by the other transports as well. Signed-off-by: Max Gurtovoy Reviewed-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 10 +++++----- drivers/nvme/host/fabrics.h | 9 +++++---- drivers/nvme/host/fc.c | 14 +++++++------- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/pci.c | 8 ++++---- drivers/nvme/host/rdma.c | 6 +++--- 6 files changed, 25 insertions(+), 24 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f431c32774f3..1033de4136e0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -265,7 +265,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (new_state) { case NVME_CTRL_ADMIN_ONLY: switch (old_state) { - case NVME_CTRL_RECONNECTING: + case NVME_CTRL_CONNECTING: changed = true; /* FALLTHRU */ default: @@ -276,7 +276,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_NEW: case NVME_CTRL_RESETTING: - case NVME_CTRL_RECONNECTING: + case NVME_CTRL_CONNECTING: changed = true; /* FALLTHRU */ default: @@ -294,7 +294,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, break; } break; - case NVME_CTRL_RECONNECTING: + case NVME_CTRL_CONNECTING: switch (old_state) { case NVME_CTRL_LIVE: case NVME_CTRL_RESETTING: @@ -309,7 +309,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_LIVE: case NVME_CTRL_ADMIN_ONLY: case NVME_CTRL_RESETTING: - case NVME_CTRL_RECONNECTING: + case NVME_CTRL_CONNECTING: changed = true; /* FALLTHRU */ default: @@ -2687,7 +2687,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, [NVME_CTRL_LIVE] = "live", [NVME_CTRL_ADMIN_ONLY] = "only-admin", [NVME_CTRL_RESETTING] = "resetting", - [NVME_CTRL_RECONNECTING]= "reconnecting", + [NVME_CTRL_CONNECTING] = "connecting", [NVME_CTRL_DELETING] = "deleting", [NVME_CTRL_DEAD] = "dead", }; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 25b19f722f5b..a3145d90c1d2 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -171,13 +171,14 @@ static inline blk_status_t nvmf_check_init_req(struct nvme_ctrl *ctrl, cmd->common.opcode != nvme_fabrics_command || cmd->fabrics.fctype != nvme_fabrics_type_connect) { /* - * Reconnecting state means transport disruption, which can take - * a long time and even might fail permanently, fail fast to - * give upper layers a chance to failover. + * Connecting state means transport disruption or initial + * establishment, which can take a long time and even might + * fail permanently, fail fast to give upper layers a chance + * to failover. * Deleting state means that the ctrl will never accept commands * again, fail it permanently.
*/ - if (ctrl->state == NVME_CTRL_RECONNECTING || + if (ctrl->state == NVME_CTRL_CONNECTING || ctrl->state == NVME_CTRL_DELETING) { nvme_req(rq)->status = NVME_SC_ABORT_REQ; return BLK_STS_IOERR; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b856d7c919d2..e2df22d56b2a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -532,7 +532,7 @@ nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl) { switch (ctrl->ctrl.state) { case NVME_CTRL_NEW: - case NVME_CTRL_RECONNECTING: + case NVME_CTRL_CONNECTING: /* * As all reconnects were suppressed, schedule a * connect. @@ -777,7 +777,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) } break; - case NVME_CTRL_RECONNECTING: + case NVME_CTRL_CONNECTING: /* * The association has already been terminated and the * controller is attempting reconnects. No need to do anything @@ -1722,7 +1722,7 @@ done: if (status && (blk_queue_dying(rq->q) || ctrl->ctrl.state == NVME_CTRL_NEW || - ctrl->ctrl.state == NVME_CTRL_RECONNECTING)) + ctrl->ctrl.state == NVME_CTRL_CONNECTING)) status |= cpu_to_le16(NVME_SC_DNR << 1); if (__nvme_fc_fcpop_chk_teardowns(ctrl, op)) @@ -2943,7 +2943,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ; bool recon = true; - if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) + if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) return; if (portptr->port_state == FC_OBJSTATE_ONLINE) @@ -2991,10 +2991,10 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) /* will block will waiting for io to terminate */ nvme_fc_delete_association(ctrl); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { dev_err(ctrl->ctrl.device, "NVME-FC{%d}: error_recovery: Couldn't change state " - "to RECONNECTING\n", ctrl->cnum); + "to CONNECTING\n", ctrl->cnum); return; } @@ -3195,7 +3195,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, * transport errors (frame drop, LS failure) inherently must kill * the association. The transport is coded so that any command used * to create the association (prior to a LIVE state transition - * while NEW or RECONNECTING) will fail if it completes in error or + * while NEW or CONNECTING) will fail if it completes in error or * times out. * * As such: as the connect request was mostly likely due to a diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 8e4550fa08f8..27e31c00b306 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -123,7 +123,7 @@ enum nvme_ctrl_state { NVME_CTRL_LIVE, NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */ NVME_CTRL_RESETTING, - NVME_CTRL_RECONNECTING, + NVME_CTRL_CONNECTING, NVME_CTRL_DELETING, NVME_CTRL_DEAD, }; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6fe7af00a1f4..ab9c19525fa8 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1141,7 +1141,7 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) /* If there is a reset/reinit ongoing, we shouldn't reset again. 
*/ switch (dev->ctrl.state) { case NVME_CTRL_RESETTING: - case NVME_CTRL_RECONNECTING: + case NVME_CTRL_CONNECTING: return false; default: break; } @@ -2288,12 +2288,12 @@ static void nvme_reset_work(struct work_struct *work) nvme_dev_disable(dev, false); /* - * Introduce RECONNECTING state from nvme-fc/rdma transports to mark the + * Introduce CONNECTING state from nvme-fc/rdma transports to mark the * initializing procedure here. */ - if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) { + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { dev_warn(dev->ctrl.device, - "failed to mark controller RECONNECTING\n"); + "failed to mark controller CONNECTING\n"); goto out; } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2bc059f7d73c..050eaa24cc7d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -887,7 +887,7 @@ free_ctrl: static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) { /* If we are resetting/deleting then do nothing */ - if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) { + if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) { WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || ctrl->ctrl.state == NVME_CTRL_LIVE); return; @@ -973,7 +973,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_start_queues(&ctrl->ctrl); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure should never happen */ WARN_ON_ONCE(1); return; @@ -1756,7 +1756,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) nvme_stop_ctrl(&ctrl->ctrl); nvme_rdma_shutdown_ctrl(ctrl, false); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure should never happen */ WARN_ON_ONCE(1); return; -- cgit v1.2.3 From b754a32c66772e6510acd92237aadd4cf227ae39 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 31 Jan 2018 18:31:25 +0200 Subject: nvme-rdma: use NVME_CTRL_CONNECTING state to mark init process In order to avoid concurrent error recovery during the initialization process (allowed by the NVME_CTRL_NEW --> NVME_CTRL_RESETTING transition), we must mark the ctrl as CONNECTING before initial connection establishment.
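[Editor's note: a minimal sketch of the ordering this patch establishes. create_ctrl_sketch() is a hypothetical name and the error handling is simplified; the two calls and their order are taken from the rdma.c hunk below.]

	/*
	 * Sketch: mark the controller CONNECTING before the first
	 * connection attempt, so error recovery (which only acts on a
	 * CONNECTING controller) cannot race with a controller that is
	 * still half-initialized in the NEW state.
	 */
	static int create_ctrl_sketch(struct nvme_rdma_ctrl *ctrl)
	{
		bool changed;

		/* NEW -> CONNECTING is made a legal transition by this series */
		changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
		WARN_ON_ONCE(!changed);

		/* initial connection establishment runs in CONNECTING state */
		return nvme_rdma_configure_admin_queue(ctrl, true);
	}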
Signed-off-by: Max Gurtovoy Reviewed-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/rdma.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1033de4136e0..86dca2919e19 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -296,6 +296,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, break; case NVME_CTRL_CONNECTING: switch (old_state) { + case NVME_CTRL_NEW: case NVME_CTRL_LIVE: case NVME_CTRL_RESETTING: changed = true; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 050eaa24cc7d..5e2cc4f0d207 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1942,6 +1942,9 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, if (!ctrl->queues) goto out_uninit_ctrl; + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); + WARN_ON_ONCE(!changed); + ret = nvme_rdma_configure_admin_queue(ctrl, true); if (ret) goto out_kfree_queues; -- cgit v1.2.3 From 3096a739d2ccbfd6a626e388228a16558f76d79d Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 31 Jan 2018 18:31:26 +0200 Subject: nvme: delete NVME_CTRL_LIVE --> NVME_CTRL_CONNECTING transition There is no logical reason to move from live state to connecting state. In case of initial connection establishment, the transition should be NVME_CTRL_NEW --> NVME_CTRL_CONNECTING --> NVME_CTRL_LIVE. In case of error recovery or reset, the transition should be NVME_CTRL_LIVE --> NVME_CTRL_RESETTING --> NVME_CTRL_CONNECTING --> NVME_CTRL_LIVE. Signed-off-by: Max Gurtovoy Reviewed-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 86dca2919e19..1f9278364196 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -297,7 +297,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_CONNECTING: switch (old_state) { case NVME_CTRL_NEW: - case NVME_CTRL_LIVE: case NVME_CTRL_RESETTING: changed = true; /* FALLTHRU */ -- cgit v1.2.3 From 8cb6af7b3a6d47f95ecb461a3f8d39cf6a64e4ae Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 31 Jan 2018 17:01:58 -0700 Subject: nvme: Fix discard buffer overrun This patch checks the discard range array bounds before setting it in case the driver gets a badly formed request. Signed-off-by: Keith Busch Reviewed-by: Jens Axboe Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1f9278364196..2fd8688cfa47 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -518,9 +518,11 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; - range[n].cattr = cpu_to_le32(0); - range[n].nlb = cpu_to_le32(nlb); - range[n].slba = cpu_to_le64(slba); + if (n < segments) { + range[n].cattr = cpu_to_le32(0); + range[n].nlb = cpu_to_le32(nlb); + range[n].slba = cpu_to_le64(slba); + } n++; } -- cgit v1.2.3 From 3efd6e8ebe19f0774c82de582849539b60cc4d97 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 6 Feb 2018 06:48:29 -0800 Subject: nvme_fc: correct abort race condition on resets During reset handling, there is live io completing while the reset is taking place. 
The reset path attempts to abort all outstanding io, counting the number of ios that were reset. It then waits for those ios to be reclaimed from the lldd before continuing. The transport's logic on io state and flag setting was poor, allowing ios to complete simultaneously with the abort request. The completed ios were counted, but as the completion had already occurred, the completion never reduced the count. As the count never reaches zero, the reset/delete never completes. Tighten it up by unconditionally changing the op state to completed when the io done handler is called. The reset/abort path now changes the op state to aborted, but the abort only continues if the op state was previously live. If complete, the abort is backed out. Thus proper counting of io aborts and their completions is working again. Also removed the TERMIO state on the op as it's redundant with the op's aborted state. Reviewed-by: Johannes Thumshirn Signed-off-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fc.c | 98 ++++++++++++++------------------------------------ 1 file changed, 26 insertions(+), 72 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e2df22d56b2a..4673882ce152 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1512,13 +1512,19 @@ nvme_fc_exit_request(struct blk_mq_tag_set *set, struct request *rq, static int __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op) { - int state; + unsigned long flags; + int opstate; + + spin_lock_irqsave(&ctrl->lock, flags); + opstate = atomic_xchg(&op->state, FCPOP_STATE_ABORTED); + if (opstate != FCPOP_STATE_ACTIVE) + atomic_set(&op->state, opstate); + else if (ctrl->flags & FCCTRL_TERMIO) + ctrl->iocnt++; + spin_unlock_irqrestore(&ctrl->lock, flags); - state = atomic_xchg(&op->state, FCPOP_STATE_ABORTED); - if (state != FCPOP_STATE_ACTIVE) { - atomic_set(&op->state, state); + if (opstate != FCPOP_STATE_ACTIVE) return -ECANCELED; - } ctrl->lport->ops->fcp_abort(&ctrl->lport->localport, &ctrl->rport->remoteport, @@ -1532,52 +1538,23 @@ static void nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl) { struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops; - unsigned long flags; - int i, ret; - - for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { - if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE) - continue; - - spin_lock_irqsave(&ctrl->lock, flags); - if (ctrl->flags & FCCTRL_TERMIO) { - ctrl->iocnt++; - aen_op->flags |= FCOP_FLAGS_TERMIO; - } - spin_unlock_irqrestore(&ctrl->lock, flags); - - ret = __nvme_fc_abort_op(ctrl, aen_op); - if (ret) { - /* - * if __nvme_fc_abort_op failed the io wasn't - * active. Thus this call path is running in - * parallel to the io complete. Treat as non-error.
- */ + int i; - /* back out the flags/counters */ - spin_lock_irqsave(&ctrl->lock, flags); - if (ctrl->flags & FCCTRL_TERMIO) - ctrl->iocnt--; - aen_op->flags &= ~FCOP_FLAGS_TERMIO; - spin_unlock_irqrestore(&ctrl->lock, flags); - return; - } - } + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) + __nvme_fc_abort_op(ctrl, aen_op); } static inline int __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl, - struct nvme_fc_fcp_op *op) + struct nvme_fc_fcp_op *op, int opstate) { unsigned long flags; bool complete_rq = false; spin_lock_irqsave(&ctrl->lock, flags); - if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) { - if (ctrl->flags & FCCTRL_TERMIO) { - if (!--ctrl->iocnt) - wake_up(&ctrl->ioabort_wait); - } + if (opstate == FCPOP_STATE_ABORTED && ctrl->flags & FCCTRL_TERMIO) { + if (!--ctrl->iocnt) + wake_up(&ctrl->ioabort_wait); } if (op->flags & FCOP_FLAGS_RELEASED) complete_rq = true; @@ -1601,6 +1578,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1); union nvme_result result; bool terminate_assoc = true; + int opstate; /* * WARNING: @@ -1639,11 +1617,12 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) * association to be terminated. */ + opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE); + fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma, sizeof(op->rsp_iu), DMA_FROM_DEVICE); - if (atomic_read(&op->state) == FCPOP_STATE_ABORTED || - op->flags & FCOP_FLAGS_TERMIO) + if (opstate == FCPOP_STATE_ABORTED) status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); else if (freq->status) status = cpu_to_le16(NVME_SC_INTERNAL << 1); @@ -1708,7 +1687,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) done: if (op->flags & FCOP_FLAGS_AEN) { nvme_complete_async_event(&queue->ctrl->ctrl, status, &result); - __nvme_fc_fcpop_chk_teardowns(ctrl, op); + __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); atomic_set(&op->state, FCPOP_STATE_IDLE); op->flags = FCOP_FLAGS_AEN; /* clear other flags */ nvme_fc_ctrl_put(ctrl); @@ -1725,7 +1704,7 @@ done: ctrl->ctrl.state == NVME_CTRL_CONNECTING)) status |= cpu_to_le16(NVME_SC_DNR << 1); - if (__nvme_fc_fcpop_chk_teardowns(ctrl, op)) + if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate)) __nvme_fc_final_op_cleanup(rq); else nvme_end_request(rq, status, result); @@ -2421,8 +2400,7 @@ __nvme_fc_final_op_cleanup(struct request *rq) struct nvme_fc_ctrl *ctrl = op->ctrl; atomic_set(&op->state, FCPOP_STATE_IDLE); - op->flags &= ~(FCOP_FLAGS_TERMIO | FCOP_FLAGS_RELEASED | - FCOP_FLAGS_COMPLETE); + op->flags &= ~(FCOP_FLAGS_RELEASED | FCOP_FLAGS_COMPLETE); nvme_fc_unmap_data(ctrl, rq, op); nvme_complete_rq(rq); @@ -2476,35 +2454,11 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) struct nvme_ctrl *nctrl = data; struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req); - unsigned long flags; - int status; if (!blk_mq_request_started(req)) return; - spin_lock_irqsave(&ctrl->lock, flags); - if (ctrl->flags & FCCTRL_TERMIO) { - ctrl->iocnt++; - op->flags |= FCOP_FLAGS_TERMIO; - } - spin_unlock_irqrestore(&ctrl->lock, flags); - - status = __nvme_fc_abort_op(ctrl, op); - if (status) { - /* - * if __nvme_fc_abort_op failed the io wasn't - * active. Thus this call path is running in - * parallel to the io complete. Treat as non-error. 
- */ - - /* back out the flags/counters */ - spin_lock_irqsave(&ctrl->lock, flags); - if (ctrl->flags & FCCTRL_TERMIO) - ctrl->iocnt--; - op->flags &= ~FCOP_FLAGS_TERMIO; - spin_unlock_irqrestore(&ctrl->lock, flags); - return; - } + __nvme_fc_abort_op(ctrl, op); } -- cgit v1.2.3 From c3aedd225f8bcc3b3e61df074bc045b80542b38a Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 6 Feb 2018 06:48:30 -0800 Subject: nvme_fc: cleanup io completion There was some old code that dealt with complete_rq being called prior to the lldd returning the io completion. This is garbage code. The complete_rq routine was being called after eh_timeouts were called, and that was due to eh_timeouts not being handled properly. The timeouts were fixed in prior patches so that, in general, a timeout will initiate an abort and restart the reset timer, as the abort operation will take care of completing things. Given that the reset timer is restarted, the erroneous complete_rq calls were eliminated. So remove the work that was synchronizing complete_rq with io completion. Reviewed-by: Johannes Thumshirn Signed-off-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fc.c | 63 ++++++++++---------------------------------- 1 file changed, 12 insertions(+), 51 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 4673882ce152..7f51f8414b97 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -55,9 +55,7 @@ struct nvme_fc_queue { enum nvme_fcop_flags { FCOP_FLAGS_TERMIO = (1 << 0), - FCOP_FLAGS_RELEASED = (1 << 1), - FCOP_FLAGS_COMPLETE = (1 << 2), - FCOP_FLAGS_AEN = (1 << 3), + FCOP_FLAGS_AEN = (1 << 1), }; struct nvmefc_ls_req_op { @@ -1470,7 +1468,6 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) /* *********************** NVME Ctrl Routines **************************** */ -static void __nvme_fc_final_op_cleanup(struct request *rq); static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg); static int @@ -1544,25 +1541,20 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl) __nvme_fc_abort_op(ctrl, aen_op); } -static inline int +static inline void __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op, int opstate) { unsigned long flags; - bool complete_rq = false; - spin_lock_irqsave(&ctrl->lock, flags); - if (opstate == FCPOP_STATE_ABORTED && ctrl->flags & FCCTRL_TERMIO) { - if (!--ctrl->iocnt) - wake_up(&ctrl->ioabort_wait); + if (opstate == FCPOP_STATE_ABORTED) { + spin_lock_irqsave(&ctrl->lock, flags); + if (ctrl->flags & FCCTRL_TERMIO) { + if (!--ctrl->iocnt) + wake_up(&ctrl->ioabort_wait); + } + spin_unlock_irqrestore(&ctrl->lock, flags); } - if (op->flags & FCOP_FLAGS_RELEASED) - complete_rq = true; - else - op->flags |= FCOP_FLAGS_COMPLETE; - spin_unlock_irqrestore(&ctrl->lock, flags); - - return complete_rq; } static void @@ -1704,10 +1696,8 @@ done: ctrl->ctrl.state == NVME_CTRL_CONNECTING)) status |= cpu_to_le16(NVME_SC_DNR << 1); - if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate)) - __nvme_fc_final_op_cleanup(rq); - else - nvme_end_request(rq, status, result); + __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); + nvme_end_request(rq, status, result); check_error: if (terminate_assoc) @@ -2394,45 +2384,16 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg) } static void -__nvme_fc_final_op_cleanup(struct request *rq) +nvme_fc_complete_rq(struct request *rq) { struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); struct nvme_fc_ctrl *ctrl = op->ctrl; atomic_set(&op->state, FCPOP_STATE_IDLE); - op->flags
&= ~(FCOP_FLAGS_RELEASED | FCOP_FLAGS_COMPLETE); nvme_fc_unmap_data(ctrl, rq, op); nvme_complete_rq(rq); nvme_fc_ctrl_put(ctrl); - -} - -static void -nvme_fc_complete_rq(struct request *rq) -{ - struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - struct nvme_fc_ctrl *ctrl = op->ctrl; - unsigned long flags; - bool completed = false; - - /* - * the core layer, on controller resets after calling - * nvme_shutdown_ctrl(), calls complete_rq without our - * calling blk_mq_complete_request(), thus there may still - * be live i/o outstanding with the LLDD. Means transport has - * to track complete calls vs fcpio_done calls to know what - * path to take on completes and dones. - */ - spin_lock_irqsave(&ctrl->lock, flags); - if (op->flags & FCOP_FLAGS_COMPLETE) - completed = true; - else - op->flags |= FCOP_FLAGS_RELEASED; - spin_unlock_irqrestore(&ctrl->lock, flags); - - if (completed) - __nvme_fc_final_op_cleanup(rq); } /* -- cgit v1.2.3 From 0a34e4668c508cbbc2d5ef2d9710b145e4c0b27d Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 11 Jan 2018 13:38:15 -0800 Subject: nvme: Don't use a stack buffer for keep-alive command In nvme_keep_alive() we pass a request with a pointer to an NVMe command on the stack into blk_execute_rq_nowait(). However, the block layer doesn't guarantee that the request is fully queued before blk_execute_rq_nowait() returns. If not, and the request is queued after nvme_keep_alive() returns, then we'll end up using stack memory that might have been overwritten to form the NVMe command we pass to hardware. Fix this by keeping a special command struct in the nvme_ctrl struct right next to the delayed work struct used for keep-alives. Signed-off-by: Roland Dreier Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 8 +++----- drivers/nvme/host/nvme.h | 1 + 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2fd8688cfa47..6d0490b477c9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -796,13 +796,9 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) static int nvme_keep_alive(struct nvme_ctrl *ctrl) { - struct nvme_command c; struct request *rq; - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_keep_alive; - - rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED, + rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED, NVME_QID_ANY); if (IS_ERR(rq)) return PTR_ERR(rq); @@ -834,6 +830,8 @@ void nvme_start_keep_alive(struct nvme_ctrl *ctrl) return; INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); + memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); + ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); } EXPORT_SYMBOL_GPL(nvme_start_keep_alive); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 27e31c00b306..0521e4707d1c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -183,6 +183,7 @@ struct nvme_ctrl { struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; + struct nvme_command ka_cmd; struct work_struct fw_act_work; /* Power saving configuration */ -- cgit v1.2.3 From 67b4110f8c8d16e588d7730db8e8b01b32c1bd8b Mon Sep 17 00:00:00 2001 From: Nitesh Shetty Date: Tue, 13 Feb 2018 21:18:12 +0530 Subject: blk: optimization for classic polling This removes the dependency on interrupts to wake up task. 
Set the task state to TASK_RUNNING if need_resched() returns true while polling for IO completion. Earlier, the polling task used to sleep, relying on an interrupt to wake it up. This made some IO take very long when interrupt coalescing is enabled in NVMe. Reference: http://lists.infradead.org/pipermail/linux-nvme/2018-February/015435.html Changes since v2->v3: -using __set_current_state() instead of set_current_state() Changes since v1->v2: -setting task state once in blk_poll, instead of multiple callers. Signed-off-by: Nitesh Shetty Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index df93102e2149..357492712b0e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3164,6 +3164,7 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) cpu_relax(); } + __set_current_state(TASK_RUNNING); return false; } -- cgit v1.2.3 From 3fd176b754e992e1cdf1693ea8184626d1ed7671 Mon Sep 17 00:00:00 2001 From: Jianchao Wang Date: Mon, 12 Feb 2018 20:54:45 +0800 Subject: nvme: fix the deadlock in nvme_update_formats nvme_update_formats will invoke nvme_ns_remove under the namespaces_mutex. This will cause a deadlock because nvme_ns_remove will also require the namespaces_mutex. Fix it by collecting the ns entries that should be removed under the namespaces_mutex and invoking nvme_ns_remove outside the namespaces_mutex. Signed-off-by: Jianchao Wang Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6d0490b477c9..52b3626fb64e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1117,14 +1117,19 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, static void nvme_update_formats(struct nvme_ctrl *ctrl) { - struct nvme_ns *ns; + struct nvme_ns *ns, *next; + LIST_HEAD(rm_list); mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { - if (ns->disk && nvme_revalidate_disk(ns->disk)) - nvme_ns_remove(ns); + if (ns->disk && nvme_revalidate_disk(ns->disk)) { + list_move_tail(&ns->list, &rm_list); + } } mutex_unlock(&ctrl->namespaces_mutex); + + list_for_each_entry_safe(ns, next, &rm_list, list) + nvme_ns_remove(ns); } static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) -- cgit v1.2.3 From 815c6704bf9f1c59f3a6be380a4032b9c57b12f1 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 13 Feb 2018 05:44:44 -0700 Subject: nvme-pci: Remap CMB SQ entries on every controller reset The controller memory buffer is remapped into a kernel address on each reset, but the driver was setting the submission queue base address only on the very first queue creation. The remapped address is likely to change after a reset, so accessing the old address will hit a kernel bug. This patch fixes that by setting the queue's CMB base address each time the queue is created.
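[Editor's note: a minimal sketch of the fix; map_cmb_sq_sketch() is a hypothetical helper, with the computation lifted from the nvme_create_queue() hunk below.]

	/*
	 * Recompute the CMB submission-queue mapping from the current
	 * dev->cmb each time the queue is created; dev->cmb is remapped
	 * on every controller reset, so a mapping cached at allocation
	 * time goes stale.
	 */
	static void map_cmb_sq_sketch(struct nvme_dev *dev,
				      struct nvme_queue *nvmeq, int qid)
	{
		unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth),
						      dev->ctrl.page_size);

		nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
		nvmeq->sq_cmds_io = dev->cmb + offset;
	}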
Fixes: f63572dff1421 ("nvme: unmap CMB and remove sysfs file in reset path") Reported-by: Christian Black Cc: Jon Derrick Cc: # 4.9+ Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ab9c19525fa8..b427157af74e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1364,18 +1364,14 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, int qid, int depth) { - if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { - unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), - dev->ctrl.page_size); - nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; - nvmeq->sq_cmds_io = dev->cmb + offset; - } else { - nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), - &nvmeq->sq_dma_addr, GFP_KERNEL); - if (!nvmeq->sq_cmds) - return -ENOMEM; - } + /* CMB SQEs will be mapped before creation */ + if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) + return 0; + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), + &nvmeq->sq_dma_addr, GFP_KERNEL); + if (!nvmeq->sq_cmds) + return -ENOMEM; return 0; } @@ -1449,6 +1445,13 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) struct nvme_dev *dev = nvmeq->dev; int result; + if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { + unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth), + dev->ctrl.page_size); + nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; + nvmeq->sq_cmds_io = dev->cmb + offset; + } + nvmeq->cq_vector = qid - 1; result = adapter_alloc_cq(dev, qid, nvmeq); if (result < 0) -- cgit v1.2.3 From 4244140d7b8f406b7edfd01c050dea783aa1efc5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 8 Feb 2018 08:55:34 -0700 Subject: nvme-pci: Fix timeouts in connecting state We need to halt the controller immediately if we haven't completed initialization, as indicated by the new "connecting" state. Fixes: ad70062cdb ("nvme-pci: introduce RECONNECTING state to mark initializing procedure") Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b427157af74e..73036d2fbbd5 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1215,13 +1215,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * cancellation error. All outstanding requests are completed on * shutdown, so we return BLK_EH_HANDLED. */ - if (dev->ctrl.state == NVME_CTRL_RESETTING) { + switch (dev->ctrl.state) { + case NVME_CTRL_CONNECTING: + case NVME_CTRL_RESETTING: dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); nvme_dev_disable(dev, false); nvme_req(req)->flags |= NVME_REQ_CANCELLED; return BLK_EH_HANDLED; + default: + break; } /* -- cgit v1.2.3 From 7756f72ccd4359c6df61fc431cd3b5b0a8639837 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 30 Jan 2018 10:07:01 +0000 Subject: nvmet: Change return code of discard command if not supported Executing a discard command on a block device that doesn't support it should return success. Returning an internal error while using multi-path fails the path.
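[Editor's note: a minimal sketch of the intended status mapping; discard_status_sketch() is a hypothetical helper mirroring the nvmet_discard_range() change below.]

	/*
	 * -EOPNOTSUPP from __blkdev_issue_discard() is reported as success
	 * so a multipath host does not fail the path; any other error is
	 * an internal, do-not-retry failure.
	 */
	static u16 discard_status_sketch(int ret)
	{
		if (ret && ret != -EOPNOTSUPP)
			return NVME_SC_INTERNAL | NVME_SC_DNR;
		return 0; /* NVME_SC_SUCCESS */
	}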
Reviewed-by: Max Gurtovoy Signed-off-by: Israel Rukshin Signed-off-by: Sagi Grimberg --- drivers/nvme/target/io-cmd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 0a4372a016f2..28bbdff4a88b 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -105,10 +105,13 @@ static void nvmet_execute_flush(struct nvmet_req *req) static u16 nvmet_discard_range(struct nvmet_ns *ns, struct nvme_dsm_range *range, struct bio **bio) { - if (__blkdev_issue_discard(ns->bdev, + int ret; + + ret = __blkdev_issue_discard(ns->bdev, le64_to_cpu(range->slba) << (ns->blksize_shift - 9), le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), - GFP_KERNEL, 0, bio)) + GFP_KERNEL, 0, bio); + if (ret && ret != -EOPNOTSUPP) return NVME_SC_INTERNAL | NVME_SC_DNR; return 0; } -- cgit v1.2.3 From 8000d1fdb07e365e6565c2415aefdfed15413794 Mon Sep 17 00:00:00 2001 From: Nitzan Carmi Date: Wed, 17 Jan 2018 11:01:14 +0000 Subject: nvme-rdma: fix sysfs invoked reset_ctrl error flow When reset_controller, invoked via sysfs, fails, it enters an error flow that practically removes the nvme ctrl entirely (similar to the delete_ctrl flow). It causes the system to hang, since a sysfs attribute cannot be unregistered by one of its own methods. This can be fixed by calling delete_ctrl as a work item rather than as sequential code. In addition, it should give the ctrl a chance to recover using the reconnection mechanism (consistent with the FC reset_ctrl error flow). Also, while we're here, return a suitable errno in case the reset ended with a non-live ctrl. Signed-off-by: Nitzan Carmi Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 6 +++++- drivers/nvme/host/rdma.c | 7 ++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 52b3626fb64e..0fe7ea35c221 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -120,8 +120,12 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) int ret; ret = nvme_reset_ctrl(ctrl); - if (!ret) + if (!ret) { flush_work(&ctrl->reset_work); + if (ctrl->state != NVME_CTRL_LIVE) + ret = -ENETRESET; + } + return ret; } EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 5e2cc4f0d207..3a51ed50eff2 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1784,11 +1784,8 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) return; out_fail: - dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); - nvme_remove_namespaces(&ctrl->ctrl); - nvme_rdma_shutdown_ctrl(ctrl, true); - nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); + ++ctrl->ctrl.nr_reconnects; + nvme_rdma_reconnect_or_remove(ctrl); } static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { -- cgit v1.2.3 From 096392e0714d3a520366ba467e215edf7280acff Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Thu, 15 Feb 2018 23:53:17 +0900 Subject: block: fix a typo in comment of BLK_MQ_POLL_STATS_BKTS Update comment typo _consisitent_ to _consistent_ from the following commit.
commit 0206319fdfee ("blk-mq: Fix poll_stat for new size-based bucketing.") Cc: Jens Axboe Signed-off-by: Minwoo Im Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4f3df807cf8f..ed63f3b69c12 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -49,7 +49,7 @@ struct blk_stat_callback; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ -/* Must be consisitent with blk_mq_poll_stats_bkt() */ +/* Must be consistent with blk_mq_poll_stats_bkt() */ #define BLK_MQ_POLL_STATS_BKTS 16 /* -- cgit v1.2.3