From 4e68a011428af3211facd932b4003b3fa3ef4faa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Sep 2016 16:18:52 +0200 Subject: blk-mq: don't redistribute hardware queues on a CPU hotplug event Currently blk-mq will totally remap hardware context when a CPU hotplug event happens, which causes major havoc for drivers, as they are never told about this remapping. E.g. any carefully sorted out CPU affinity will just be completely messed up. The rebuild also doesn't really help for the common case of cpu hotplug, which is soft onlining / offlining of cpus - in this case we should just leave the queue and irq mapping as is. If it actually worked it would have helped in the case of physical cpu hotplug, although for that we'd need a way to actually notify the driver. Note that drivers may already be able to accommodate such a topology change on their own, e.g. using the reset_controller sysfs file in NVMe will cause the driver to get things right for this case. With the rebuild removed we will simply retain the queue mapping for a soft offlined CPU that will work when it comes back online, and will map any newly onlined CPU to queue 0 until the driver initiates a rebuild of the queue map. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 7ddc7969fba4..ffc9d4a3dbbd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2157,8 +2157,6 @@ static void blk_mq_queue_reinit(struct request_queue *q, blk_mq_sysfs_unregister(q); - blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask); - /* * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. 
FIXME: maybe * we should change hctx numa_node according to new topology (this -- cgit v1.2.3 From bdd17e75cd97c5c39feee409890a91d0396640fe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Sep 2016 16:18:53 +0200 Subject: blk-mq: only allocate a single mq_map per tag_set The mapping is identical for all queues in a tag_set, so stop wasting memory for building multiple. Note that for now I've kept the mq_map pointer in the request_queue, but we'll need to investigate if we can remove it without suffering too much from the additional pointer chasing. The same would apply to the mq_ops pointer as well. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 22 ++++++++++++++-------- include/linux/blk-mq.h | 1 + 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ffc9d4a3dbbd..c9499f118ef6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1973,7 +1973,6 @@ void blk_mq_release(struct request_queue *q) kfree(hctx); } - kfree(q->mq_map); q->mq_map = NULL; kfree(q->queue_hw_ctx); @@ -2072,9 +2071,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, if (!q->queue_hw_ctx) goto err_percpu; - q->mq_map = blk_mq_make_queue_map(set); - if (!q->mq_map) - goto err_map; + q->mq_map = set->mq_map; blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) @@ -2124,8 +2121,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return q; err_hctxs: - kfree(q->mq_map); -err_map: kfree(q->queue_hw_ctx); err_percpu: free_percpu(q->queue_ctx); @@ -2347,14 +2342,22 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->tags) return -ENOMEM; + set->mq_map = blk_mq_make_queue_map(set); + if (!set->mq_map) + goto out_free_tags; + if (blk_mq_alloc_rq_maps(set)) - goto enomem; + goto out_free_mq_map; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; -enomem: + 
+out_free_mq_map: + kfree(set->mq_map); + set->mq_map = NULL; +out_free_tags: kfree(set->tags); set->tags = NULL; return -ENOMEM; @@ -2370,6 +2373,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) blk_mq_free_rq_map(set, set->tags[i], i); } + kfree(set->mq_map); + set->mq_map = NULL; + kfree(set->tags); set->tags = NULL; } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 60ef14cbcd2d..deda16a9bde4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -67,6 +67,7 @@ struct blk_mq_hw_ctx { }; struct blk_mq_tag_set { + unsigned int *mq_map; struct blk_mq_ops *ops; unsigned int nr_hw_queues; unsigned int queue_depth; /* max hw supported */ -- cgit v1.2.3 From 7d7e0f90b70f6c5367c2d1c9a7e87dd228bd0816 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Sep 2016 16:18:54 +0200 Subject: blk-mq: remove ->map_queue All drivers use the default, so provide an inline version of it. If we ever need other queue mapping we can add an optional method back, although supporting will also require major changes to the queue setup code. This provides better code generation, and better debugability as well. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-flush.c | 6 +++--- block/blk-mq-tag.c | 5 ++--- block/blk-mq.c | 40 +++++++++++---------------------------- block/blk-mq.h | 6 ++++++ block/blk.h | 11 +++-------- drivers/block/loop.c | 1 - drivers/block/mtip32xx/mtip32xx.c | 1 - drivers/block/null_blk.c | 1 - drivers/block/rbd.c | 1 - drivers/block/virtio_blk.c | 1 - drivers/block/xen-blkfront.c | 1 - drivers/md/dm-rq.c | 1 - drivers/mtd/ubi/block.c | 1 - drivers/nvme/host/pci.c | 2 -- drivers/nvme/host/rdma.c | 2 -- drivers/nvme/target/loop.c | 2 -- drivers/scsi/scsi_lib.c | 1 - include/linux/blk-mq.h | 7 ------- 18 files changed, 25 insertions(+), 65 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index d308def812db..6a14b68b9135 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -232,7 +232,7 @@ static void flush_end_io(struct request *flush_rq, int error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); - hctx = q->mq_ops->map_queue(q, flush_rq->mq_ctx->cpu); + hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); flush_rq->tag = -1; } @@ -325,7 +325,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) flush_rq->tag = first_rq->tag; fq->orig_rq = first_rq; - hctx = q->mq_ops->map_queue(q, first_rq->mq_ctx->cpu); + hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); } @@ -358,7 +358,7 @@ static void mq_flush_data_end_io(struct request *rq, int error) unsigned long flags; struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); - hctx = q->mq_ops->map_queue(q, ctx->cpu); + hctx = blk_mq_map_queue(q, ctx->cpu); /* * After populating an empty queue, kick it to avoid stall. 
Read diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 729bac3a673b..16028130289f 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -301,8 +301,7 @@ static int bt_get(struct blk_mq_alloc_data *data, io_schedule(); data->ctx = blk_mq_get_ctx(data->q); - data->hctx = data->q->mq_ops->map_queue(data->q, - data->ctx->cpu); + data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu); if (data->flags & BLK_MQ_REQ_RESERVED) { bt = &data->hctx->tags->breserved_tags; } else { @@ -726,7 +725,7 @@ u32 blk_mq_unique_tag(struct request *rq) int hwq = 0; if (q->mq_ops) { - hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); + hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); hwq = hctx->queue_num; } diff --git a/block/blk-mq.c b/block/blk-mq.c index c9499f118ef6..6e077a9d61a8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -245,7 +245,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, return ERR_PTR(ret); ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); + hctx = blk_mq_map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw, 0); @@ -254,7 +254,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, blk_mq_put_ctx(ctx); ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); + hctx = blk_mq_map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw, 0); ctx = alloc_data.ctx; @@ -338,11 +338,7 @@ EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); void blk_mq_free_request(struct request *rq) { - struct blk_mq_hw_ctx *hctx; - struct request_queue *q = rq->q; - - hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); - blk_mq_free_hctx_request(hctx, rq); + blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); @@ -1074,9 +1070,7 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, { struct 
blk_mq_ctx *ctx = rq->mq_ctx; struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx; - - hctx = q->mq_ops->map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); spin_lock(&ctx->lock); __blk_mq_insert_request(hctx, rq, at_head); @@ -1093,12 +1087,10 @@ static void blk_mq_insert_requests(struct request_queue *q, bool from_schedule) { - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); trace_block_unplug(q, depth, !from_schedule); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now @@ -1232,7 +1224,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, blk_queue_enter_live(q); ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); + hctx = blk_mq_map_queue(q, ctx->cpu); if (rw_is_sync(bio_op(bio), bio->bi_opf)) op_flags |= REQ_SYNC; @@ -1246,7 +1238,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, trace_block_sleeprq(q, bio, op); ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); + hctx = blk_mq_map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); ctx = alloc_data.ctx; @@ -1263,8 +1255,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) { int ret; struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, - rq->mq_ctx->cpu); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); struct blk_mq_queue_data bd = { .rq = rq, .list = NULL, @@ -1468,15 +1459,6 @@ run_queue: return cookie; } -/* - * Default mapping to a software queue, since we use one per CPU. 
- */ -struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) -{ - return q->queue_hw_ctx[q->mq_map[cpu]]; -} -EXPORT_SYMBOL(blk_mq_map_queue); - static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { @@ -1810,7 +1792,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, if (!cpu_online(i)) continue; - hctx = q->mq_ops->map_queue(q, i); + hctx = blk_mq_map_queue(q, i); /* * Set local node, IFF we have more than one hw queue. If @@ -1848,7 +1830,7 @@ static void blk_mq_map_swqueue(struct request_queue *q, continue; ctx = per_cpu_ptr(q->queue_ctx, i); - hctx = q->mq_ops->map_queue(q, i); + hctx = blk_mq_map_queue(q, i); cpumask_set_cpu(i, hctx->cpumask); ctx->index_hw = hctx->nr_ctx; @@ -2313,7 +2295,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; - if (!set->ops->queue_rq || !set->ops->map_queue) + if (!set->ops->queue_rq) return -EINVAL; if (set->queue_depth > BLK_MQ_MAX_DEPTH) { diff --git a/block/blk-mq.h b/block/blk-mq.h index 9087b11037b7..ec774bf4aea2 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -52,6 +52,12 @@ extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, const struct cpumask *online_mask); extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); +static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, + int cpu) +{ + return q->queue_hw_ctx[q->mq_map[cpu]]; +} + /* * sysfs helpers */ diff --git a/block/blk.h b/block/blk.h index c37492f5edaa..74444c49078f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -39,14 +39,9 @@ extern struct ida blk_queue_ida; static inline struct blk_flush_queue *blk_get_flush_queue( struct request_queue *q, struct blk_mq_ctx *ctx) { - struct blk_mq_hw_ctx *hctx; - - if (!q->mq_ops) - return q->fq; - - hctx = q->mq_ops->map_queue(q, ctx->cpu); - - return hctx->fq; + if (q->mq_ops) + return 
blk_mq_map_queue(q, ctx->cpu)->fq; + return q->fq; } static inline void __blk_get_queue(struct request_queue *q) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c9f2107f7095..cbdb3b162718 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1703,7 +1703,6 @@ static int loop_init_request(void *data, struct request *rq, static struct blk_mq_ops loop_mq_ops = { .queue_rq = loop_queue_rq, - .map_queue = blk_mq_map_queue, .init_request = loop_init_request, }; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 88c46853dbb5..3cfd879267b2 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3895,7 +3895,6 @@ exit_handler: static struct blk_mq_ops mtip_mq_ops = { .queue_rq = mtip_queue_rq, - .map_queue = blk_mq_map_queue, .init_request = mtip_init_cmd, .exit_request = mtip_free_cmd, .complete = mtip_softirq_done_fn, diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 75a7f88d6717..7d3b7d6e5149 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -393,7 +393,6 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, static struct blk_mq_ops null_mq_ops = { .queue_rq = null_queue_rq, - .map_queue = blk_mq_map_queue, .init_hctx = null_init_hctx, .complete = null_softirq_done_fn, }; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6c6519f6492a..c1f84df7838b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3621,7 +3621,6 @@ static int rbd_init_request(void *data, struct request *rq, static struct blk_mq_ops rbd_mq_ops = { .queue_rq = rbd_queue_rq, - .map_queue = blk_mq_map_queue, .init_request = rbd_init_request, }; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 93b1aaa5ba3b..2dc5c96c186a 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -542,7 +542,6 @@ static int virtblk_init_request(void *data, struct request *rq, static struct blk_mq_ops 
virtio_mq_ops = { .queue_rq = virtio_queue_rq, - .map_queue = blk_mq_map_queue, .complete = virtblk_request_done, .init_request = virtblk_init_request, }; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 88ef6d4729b4..9908597c5209 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -909,7 +909,6 @@ out_busy: static struct blk_mq_ops blkfront_mq_ops = { .queue_rq = blkif_queue_rq, - .map_queue = blk_mq_map_queue, }; static void blkif_set_queue_limits(struct blkfront_info *info) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 1ca7463e8bb2..d1c3645d5ce1 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -908,7 +908,6 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, static struct blk_mq_ops dm_mq_ops = { .queue_rq = dm_mq_queue_rq, - .map_queue = blk_mq_map_queue, .complete = dm_softirq_done, .init_request = dm_mq_init_request, }; diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index ebf46ad2d513..d1e6931c132f 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -351,7 +351,6 @@ static int ubiblock_init_request(void *data, struct request *req, static struct blk_mq_ops ubiblock_mq_ops = { .queue_rq = ubiblock_queue_rq, .init_request = ubiblock_init_request, - .map_queue = blk_mq_map_queue, }; static DEFINE_IDR(ubiblock_minor_idr); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8dcf5a960951..086fd7e45119 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1131,7 +1131,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) static struct blk_mq_ops nvme_mq_admin_ops = { .queue_rq = nvme_queue_rq, .complete = nvme_complete_rq, - .map_queue = blk_mq_map_queue, .init_hctx = nvme_admin_init_hctx, .exit_hctx = nvme_admin_exit_hctx, .init_request = nvme_admin_init_request, @@ -1141,7 +1140,6 @@ static struct blk_mq_ops nvme_mq_admin_ops = { static struct blk_mq_ops nvme_mq_ops = { .queue_rq = nvme_queue_rq, 
.complete = nvme_complete_rq, - .map_queue = blk_mq_map_queue, .init_hctx = nvme_init_hctx, .init_request = nvme_init_request, .timeout = nvme_timeout, diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ab545fb347a0..9bbd8866363b 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1531,7 +1531,6 @@ static void nvme_rdma_complete_rq(struct request *rq) static struct blk_mq_ops nvme_rdma_mq_ops = { .queue_rq = nvme_rdma_queue_rq, .complete = nvme_rdma_complete_rq, - .map_queue = blk_mq_map_queue, .init_request = nvme_rdma_init_request, .exit_request = nvme_rdma_exit_request, .reinit_request = nvme_rdma_reinit_request, @@ -1543,7 +1542,6 @@ static struct blk_mq_ops nvme_rdma_mq_ops = { static struct blk_mq_ops nvme_rdma_admin_mq_ops = { .queue_rq = nvme_rdma_queue_rq, .complete = nvme_rdma_complete_rq, - .map_queue = blk_mq_map_queue, .init_request = nvme_rdma_init_admin_request, .exit_request = nvme_rdma_exit_admin_request, .reinit_request = nvme_rdma_reinit_request, diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 395e60dad835..d5df77d686b2 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -273,7 +273,6 @@ static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, static struct blk_mq_ops nvme_loop_mq_ops = { .queue_rq = nvme_loop_queue_rq, .complete = nvme_loop_complete_rq, - .map_queue = blk_mq_map_queue, .init_request = nvme_loop_init_request, .init_hctx = nvme_loop_init_hctx, .timeout = nvme_loop_timeout, @@ -282,7 +281,6 @@ static struct blk_mq_ops nvme_loop_mq_ops = { static struct blk_mq_ops nvme_loop_admin_mq_ops = { .queue_rq = nvme_loop_queue_rq, .complete = nvme_loop_complete_rq, - .map_queue = blk_mq_map_queue, .init_request = nvme_loop_init_admin_request, .init_hctx = nvme_loop_init_admin_hctx, .timeout = nvme_loop_timeout, diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index c71344aebdbb..2cca9cffc63f 100644 --- 
a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2077,7 +2077,6 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev) } static struct blk_mq_ops scsi_mq_ops = { - .map_queue = blk_mq_map_queue, .queue_rq = scsi_queue_rq, .complete = scsi_softirq_done, .timeout = scsi_timeout, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index deda16a9bde4..f01379f2b0ac 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -91,7 +91,6 @@ struct blk_mq_queue_data { }; typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); -typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); @@ -113,11 +112,6 @@ struct blk_mq_ops { */ queue_rq_fn *queue_rq; - /* - * Map to specific hardware queue - */ - map_queue_fn *map_queue; - /* * Called on request timeout */ @@ -223,7 +217,6 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) return unique_tag & BLK_MQ_UNIQUE_TAG_MASK; } -struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); int blk_mq_request_started(struct request *rq); void blk_mq_start_request(struct request *rq); -- cgit v1.2.3 From da695ba236b993f07a540d35c17f271ef08c89f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Sep 2016 16:18:55 +0200 Subject: blk-mq: allow the driver to pass in a queue mapping This allows drivers to specify their own queue mapping by overriding the setup-time function that builds the mq_map. This can be used for example to build the map based on the MSI-X vector mapping provided by the core interrupt layer for PCI devices. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq-cpumap.c | 25 +++++-------------------- block/blk-mq.c | 18 +++++++++++++++--- block/blk-mq.h | 4 +--- include/linux/blk-mq.h | 3 +++ 4 files changed, 24 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index d0634bcf322f..19b1d9c5f07e 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -31,14 +31,16 @@ static int get_first_sibling(unsigned int cpu) return cpu; } -int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, - const struct cpumask *online_mask) +int blk_mq_map_queues(struct blk_mq_tag_set *set) { + unsigned int *map = set->mq_map; + unsigned int nr_queues = set->nr_hw_queues; + const struct cpumask *online_mask = cpu_online_mask; unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; cpumask_var_t cpus; if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) - return 1; + return -ENOMEM; cpumask_clear(cpus); nr_cpus = nr_uniq_cpus = 0; @@ -86,23 +88,6 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, return 0; } -unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) -{ - unsigned int *map; - - /* If cpus are offline, map them to first hctx */ - map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL, - set->numa_node); - if (!map) - return NULL; - - if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask)) - return map; - - kfree(map); - return NULL; -} - /* * We have no quick way of doing reverse lookups. This is only used at * queue init time, so runtime isn't important. 
diff --git a/block/blk-mq.c b/block/blk-mq.c index 6e077a9d61a8..a3060078a8da 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2286,6 +2286,8 @@ EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { + int ret; + BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); if (!set->nr_hw_queues) @@ -2324,11 +2326,21 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->tags) return -ENOMEM; - set->mq_map = blk_mq_make_queue_map(set); + ret = -ENOMEM; + set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids, + GFP_KERNEL, set->numa_node); if (!set->mq_map) goto out_free_tags; - if (blk_mq_alloc_rq_maps(set)) + if (set->ops->map_queues) + ret = set->ops->map_queues(set); + else + ret = blk_mq_map_queues(set); + if (ret) + goto out_free_mq_map; + + ret = blk_mq_alloc_rq_maps(set); + if (ret) goto out_free_mq_map; mutex_init(&set->tag_list_lock); @@ -2342,7 +2354,7 @@ out_free_mq_map: out_free_tags: kfree(set->tags); set->tags = NULL; - return -ENOMEM; + return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); diff --git a/block/blk-mq.h b/block/blk-mq.h index ec774bf4aea2..c92bb7debf85 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -47,9 +47,7 @@ void blk_mq_disable_hotplug(void); /* * CPU -> queue mappings */ -extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); -extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, - const struct cpumask *online_mask); +int blk_mq_map_queues(struct blk_mq_tag_set *set); extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f01379f2b0ac..6737fd7946f4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -104,6 +104,7 @@ typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); typedef void (busy_tag_iter_fn)(struct request *, 
void *, bool); typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int); +typedef int (map_queues_fn)(struct blk_mq_tag_set *set); struct blk_mq_ops { @@ -144,6 +145,8 @@ struct blk_mq_ops { init_request_fn *init_request; exit_request_fn *exit_request; reinit_request_fn *reinit_request; + + map_queues_fn *map_queues; }; enum { -- cgit v1.2.3 From 973c4e372c8f71a15ac39765e657ded70fc87d41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Sep 2016 16:18:56 +0200 Subject: blk-mq: provide a default queue mapping for PCI device Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/blk-mq-pci.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/blk-mq-pci.h | 9 +++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 block/blk-mq-pci.c create mode 100644 include/linux/blk-mq-pci.h (limited to 'block') diff --git a/block/Makefile b/block/Makefile index 9eda2322b2d4..2447a0b1ef9c 100644 --- a/block/Makefile +++ b/block/Makefile @@ -22,4 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o - +obj-$(CONFIG_PCI) += blk-mq-pci.o diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c new file mode 100644 index 000000000000..33c7bd743c63 --- /dev/null +++ b/block/blk-mq-pci.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016 Christoph Hellwig. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ +#include +#include +#include +#include + +/** + * blk_mq_pci_map_queues - provide a default queue mapping for PCI device + * @set: tagset to provide the mapping for + * @pdev: PCI device associated with @set. + * + * This function assumes the PCI device @pdev has at least as many available + * interrupt vectors as @set has queues. It will then query the vector + * corresponding to each queue for its affinity mask and build a queue mapping + * that maps a queue to the CPUs that have irq affinity for the corresponding + * vector. + */ +int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < set->nr_hw_queues; queue++) { + mask = pci_irq_get_affinity(pdev, queue); + if (!mask) + return -EINVAL; + + for_each_cpu(cpu, mask) + set->mq_map[cpu] = queue; + } + + return 0; +} +EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h new file mode 100644 index 000000000000..6ab595259112 --- /dev/null +++ b/include/linux/blk-mq-pci.h @@ -0,0 +1,9 @@ +#ifndef _LINUX_BLK_MQ_PCI_H +#define _LINUX_BLK_MQ_PCI_H + +struct blk_mq_tag_set; +struct pci_dev; + +int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev); + +#endif /* _LINUX_BLK_MQ_PCI_H */ -- cgit v1.2.3 From 1b157939f92ae22d10b9d52baaa14f826927f5ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Sep 2016 16:18:59 +0200 Subject: blk-mq: get rid of the cpumask in struct blk_mq_tags Unused now that NVMe sets up irq affinity before calling into blk-mq. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 6 ------ block/blk-mq-tag.h | 1 - block/blk-mq.c | 25 +++++++++++++++++++++---- include/linux/blk-mq.h | 1 - 4 files changed, 21 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 16028130289f..2eae3d5f7145 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -665,11 +665,6 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, if (!tags) return NULL; - if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) { - kfree(tags); - return NULL; - } - tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; @@ -680,7 +675,6 @@ void blk_mq_free_tags(struct blk_mq_tags *tags) { bt_free(&tags->bitmap_tags); bt_free(&tags->breserved_tags); - free_cpumask_var(tags->cpumask); kfree(tags); } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index d468a79f2c4a..556964134d1c 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -44,7 +44,6 @@ struct blk_mq_tags { struct list_head page_list; int alloc_policy; - cpumask_var_t cpumask; }; diff --git a/block/blk-mq.c b/block/blk-mq.c index a3060078a8da..060b350d3f0c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1861,7 +1861,6 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); - cpumask_copy(hctx->tags->cpumask, hctx->cpumask); /* * Set the map size to the number of mapped software queues. 
* This is more accurate and more efficient than looping @@ -2272,11 +2271,29 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) return 0; } -struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags) +static int blk_mq_create_mq_map(struct blk_mq_tag_set *set, + const struct cpumask *affinity_mask) { - return tags->cpumask; + int queue = -1, cpu = 0; + + set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids, + GFP_KERNEL, set->numa_node); + if (!set->mq_map) + return -ENOMEM; + + if (!affinity_mask) + return 0; /* map all cpus to queue 0 */ + + /* If cpus are offline, map them to first hctx */ + for_each_online_cpu(cpu) { + if (cpumask_test_cpu(cpu, affinity_mask)) + queue++; + if (queue >= 0) + set->mq_map[cpu] = queue; + } + + return 0; } -EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); /* * Alloc a tag set to be associated with one or more request queues. diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6737fd7946f4..c5a97d7cef93 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -201,7 +201,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int op, unsigned int flags, unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); -struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags); enum { BLK_MQ_UNIQUE_TAG_BITS = 16, -- cgit v1.2.3 From 9151bcb4fb38aab04cdc67cc3b3e11396db1b8b4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Sep 2016 08:45:45 -0600 Subject: blk-mq: kill unused blk_mq_create_mq_map() Fixes 1b157939f92a ("blk-mq: get rid of the cpumask in struct blk_mq_tags") Signed-off-by: Jens Axboe --- block/blk-mq.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 060b350d3f0c..f1c5263c44e8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2271,30 +2271,6 @@ static int 
blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) return 0; } -static int blk_mq_create_mq_map(struct blk_mq_tag_set *set, - const struct cpumask *affinity_mask) -{ - int queue = -1, cpu = 0; - - set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids, - GFP_KERNEL, set->numa_node); - if (!set->mq_map) - return -ENOMEM; - - if (!affinity_mask) - return 0; /* map all cpus to queue 0 */ - - /* If cpus are offline, map them to first hctx */ - for_each_online_cpu(cpu) { - if (cpumask_test_cpu(cpu, affinity_mask)) - queue++; - if (queue >= 0) - set->mq_map[cpu] = queue; - } - - return 0; -} - /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the -- cgit v1.2.3 From 8ec2ef2b66ea2fd00acc28aca8edaad441dbb424 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 19 Sep 2016 15:50:16 +1000 Subject: blk_mq: linux/blk-mq.h does not include all the headers it depends on and building block/blk-mq-pci.o should depend on CONFIG_BLOCK Fixes: 973c4e372c8f ("blk-mq: provide a default queue mapping for PCI device") Signed-off-by: Stephen Rothwell Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/Kconfig | 5 +++++ block/Makefile | 2 +- block/blk-mq-pci.c | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 161491d0a879..515533802af1 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -124,4 +124,9 @@ config BLOCK_COMPAT depends on BLOCK && COMPAT default y +config BLK_MQ_PCI + bool + depends on BLOCK && PCI + default y + source block/Kconfig.iosched diff --git a/block/Makefile b/block/Makefile index 2447a0b1ef9c..37a0d93f97bb 100644 --- a/block/Makefile +++ b/block/Makefile @@ -22,4 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o 
-obj-$(CONFIG_PCI) += blk-mq-pci.o +obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index 33c7bd743c63..966c2169762e 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -10,6 +10,8 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. */ +#include +#include #include #include #include -- cgit v1.2.3