summaryrefslogtreecommitdiff
path: root/drivers/nvme/host/rdma.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host/rdma.c')
-rw-r--r-- drivers/nvme/host/rdma.c 196
1 files changed, 108 insertions, 88 deletions
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 11a5ecae78c8..97f668a39ae1 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -213,6 +213,11 @@ static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
if (!ring)
return NULL;
+ /*
+ * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
+ * lifetime. It's safe, since any change in the underlying RDMA device
+ * will issue error recovery and queue re-creation.
+ */
for (i = 0; i < ib_queue_size; i++) {
if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
goto out_free_ring;
@@ -274,14 +279,9 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
struct request *rq, unsigned int hctx_idx)
{
- struct nvme_rdma_ctrl *ctrl = set->driver_data;
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
- int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
- struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
- struct nvme_rdma_device *dev = queue->device;
- nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
- DMA_TO_DEVICE);
+ kfree(req->sqe.data);
}
static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
@@ -292,15 +292,11 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
- struct nvme_rdma_device *dev = queue->device;
- struct ib_device *ibdev = dev->dev;
- int ret;
nvme_req(rq)->ctrl = &ctrl->ctrl;
- ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
- DMA_TO_DEVICE);
- if (ret)
- return ret;
+ req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL);
+ if (!req->sqe.data)
+ return -ENOMEM;
req->queue = queue;
@@ -641,34 +637,16 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
{
struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
struct ib_device *ibdev = ctrl->device->dev;
- unsigned int nr_io_queues;
+ unsigned int nr_io_queues, nr_default_queues;
+ unsigned int nr_read_queues, nr_poll_queues;
int i, ret;
- nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
-
- /*
- * we map queues according to the device irq vectors for
- * optimal locality so we don't need more queues than
- * completion vectors.
- */
- nr_io_queues = min_t(unsigned int, nr_io_queues,
- ibdev->num_comp_vectors);
-
- if (opts->nr_write_queues) {
- ctrl->io_queues[HCTX_TYPE_DEFAULT] =
- min(opts->nr_write_queues, nr_io_queues);
- nr_io_queues += ctrl->io_queues[HCTX_TYPE_DEFAULT];
- } else {
- ctrl->io_queues[HCTX_TYPE_DEFAULT] = nr_io_queues;
- }
-
- ctrl->io_queues[HCTX_TYPE_READ] = nr_io_queues;
-
- if (opts->nr_poll_queues) {
- ctrl->io_queues[HCTX_TYPE_POLL] =
- min(opts->nr_poll_queues, num_online_cpus());
- nr_io_queues += ctrl->io_queues[HCTX_TYPE_POLL];
- }
+ nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
+ min(opts->nr_io_queues, num_online_cpus()));
+ nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors,
+ min(opts->nr_write_queues, num_online_cpus()));
+ nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
+ nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;
ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
if (ret)
@@ -681,6 +659,34 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
dev_info(ctrl->ctrl.device,
"creating %d I/O queues.\n", nr_io_queues);
+ if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
+ /*
+ * separate read/write queues
+ * hand out dedicated default queues only after we have
+ * sufficient read queues.
+ */
+ ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues;
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+ min(nr_default_queues, nr_io_queues);
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ } else {
+ /*
+ * shared read/write queues
+ * either no write queues were requested, or we don't have
+ * sufficient queue count to have dedicated default queues.
+ */
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+ min(nr_read_queues, nr_io_queues);
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ }
+
+ if (opts->nr_poll_queues && nr_io_queues) {
+ /* map dedicated poll queues only if we have queues left */
+ ctrl->io_queues[HCTX_TYPE_POLL] =
+ min(nr_poll_queues, nr_io_queues);
+ }
+
for (i = 1; i < ctrl->ctrl.queue_count; i++) {
ret = nvme_rdma_alloc_queue(ctrl, i,
ctrl->ctrl.sqsize + 1);
@@ -697,15 +703,6 @@ out_free_queues:
return ret;
}
-static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl,
- struct blk_mq_tag_set *set)
-{
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-
- blk_mq_free_tag_set(set);
- nvme_rdma_dev_put(ctrl->device);
-}
-
static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
bool admin)
{
@@ -744,24 +741,9 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
ret = blk_mq_alloc_tag_set(set);
if (ret)
- goto out;
-
- /*
- * We need a reference on the device as long as the tag_set is alive,
- * as the MRs in the request structures need a valid ib_device.
- */
- ret = nvme_rdma_dev_get(ctrl->device);
- if (!ret) {
- ret = -EINVAL;
- goto out_free_tagset;
- }
+ return ERR_PTR(ret);
return set;
-
-out_free_tagset:
- blk_mq_free_tag_set(set);
-out:
- return ERR_PTR(ret);
}
static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
@@ -769,7 +751,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
{
if (remove) {
blk_cleanup_queue(ctrl->ctrl.admin_q);
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
}
if (ctrl->async_event_sqe.data) {
nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
@@ -793,6 +775,11 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
+ /*
+ * Bind the async event SQE DMA mapping to the admin queue lifetime.
+ * It's safe, since any change in the underlying RDMA device will issue
+ * error recovery and queue re-creation.
+ */
error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
sizeof(struct nvme_command), DMA_TO_DEVICE);
if (error)
@@ -847,7 +834,7 @@ out_cleanup_queue:
blk_cleanup_queue(ctrl->ctrl.admin_q);
out_free_tagset:
if (new)
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
out_free_async_qe:
nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
sizeof(struct nvme_command), DMA_TO_DEVICE);
@@ -862,7 +849,7 @@ static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl,
{
if (remove) {
blk_cleanup_queue(ctrl->ctrl.connect_q);
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.tagset);
}
nvme_rdma_free_io_queues(ctrl);
}
@@ -903,7 +890,7 @@ out_cleanup_connect_q:
blk_cleanup_queue(ctrl->ctrl.connect_q);
out_free_tag_set:
if (new)
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.tagset);
out_free_io_queues:
nvme_rdma_free_io_queues(ctrl);
return ret;
@@ -914,8 +901,9 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
{
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
- blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request,
- &ctrl->ctrl);
+ if (ctrl->ctrl.admin_tagset)
+ blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
+ nvme_cancel_request, &ctrl->ctrl);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_destroy_admin_queue(ctrl, remove);
}
@@ -926,8 +914,9 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
- blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request,
- &ctrl->ctrl);
+ if (ctrl->ctrl.tagset)
+ blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
+ nvme_cancel_request, &ctrl->ctrl);
if (remove)
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_destroy_io_queues(ctrl, remove);
@@ -1731,12 +1720,20 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
dev = queue->device->dev;
+
+ req->sqe.dma = ib_dma_map_single(dev, req->sqe.data,
+ sizeof(struct nvme_command),
+ DMA_TO_DEVICE);
+ err = ib_dma_mapping_error(dev, req->sqe.dma);
+ if (unlikely(err))
+ return BLK_STS_RESOURCE;
+
ib_dma_sync_single_for_cpu(dev, sqe->dma,
sizeof(struct nvme_command), DMA_TO_DEVICE);
ret = nvme_setup_cmd(ns, rq, c);
if (ret)
- return ret;
+ goto unmap_qe;
blk_mq_start_request(rq);
@@ -1761,10 +1758,16 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
}
return BLK_STS_OK;
+
err:
if (err == -ENOMEM || err == -EAGAIN)
- return BLK_STS_RESOURCE;
- return BLK_STS_IOERR;
+ ret = BLK_STS_RESOURCE;
+ else
+ ret = BLK_STS_IOERR;
+unmap_qe:
+ ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command),
+ DMA_TO_DEVICE);
+ return ret;
}
static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
@@ -1777,25 +1780,36 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
static void nvme_rdma_complete_rq(struct request *rq)
{
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_rdma_queue *queue = req->queue;
+ struct ib_device *ibdev = queue->device->dev;
- nvme_rdma_unmap_data(req->queue, rq);
+ nvme_rdma_unmap_data(queue, rq);
+ ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command),
+ DMA_TO_DEVICE);
nvme_complete_rq(rq);
}
static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
{
struct nvme_rdma_ctrl *ctrl = set->driver_data;
+ struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
- set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
- set->map[HCTX_TYPE_DEFAULT].nr_queues =
- ctrl->io_queues[HCTX_TYPE_DEFAULT];
- set->map[HCTX_TYPE_READ].nr_queues = ctrl->io_queues[HCTX_TYPE_READ];
- if (ctrl->ctrl.opts->nr_write_queues) {
+ if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
/* separate read/write queues */
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+ set->map[HCTX_TYPE_READ].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_READ];
set->map[HCTX_TYPE_READ].queue_offset =
- ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
} else {
- /* mixed read/write queues */
+ /* shared read/write queues */
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+ set->map[HCTX_TYPE_READ].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
set->map[HCTX_TYPE_READ].queue_offset = 0;
}
blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
@@ -1803,16 +1817,22 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
ctrl->device->dev, 0);
- if (ctrl->ctrl.opts->nr_poll_queues) {
+ if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
+ /* map dedicated poll queues only if we have queues left */
set->map[HCTX_TYPE_POLL].nr_queues =
ctrl->io_queues[HCTX_TYPE_POLL];
set->map[HCTX_TYPE_POLL].queue_offset =
- ctrl->io_queues[HCTX_TYPE_DEFAULT];
- if (ctrl->ctrl.opts->nr_write_queues)
- set->map[HCTX_TYPE_POLL].queue_offset +=
- ctrl->io_queues[HCTX_TYPE_READ];
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+ ctrl->io_queues[HCTX_TYPE_READ];
blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
}
+
+ dev_info(ctrl->ctrl.device,
+ "mapped %d/%d/%d default/read/poll queues.\n",
+ ctrl->io_queues[HCTX_TYPE_DEFAULT],
+ ctrl->io_queues[HCTX_TYPE_READ],
+ ctrl->io_queues[HCTX_TYPE_POLL]);
+
return 0;
}