From dd840087086f3b93ac20f7472b4fca59aff7b79f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Aug 2014 23:16:32 +0800 Subject: blk-mq: fix WARNING "percpu_ref_kill() called more than once!" Before doing queue release, the queue has already been frozen by blk_cleanup_queue(), so there is no need to freeze the queue again when deleting the tag set. This patch fixes the WARNING "percpu_ref_kill() called more than once!", which is triggered while unloading a block driver. Cc: Tejun Heo Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5189cb1e478a..ac8a0413664e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1713,14 +1713,10 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; - blk_mq_freeze_queue(q); - mutex_lock(&set->tag_list_lock); list_del_init(&q->tag_set_list); blk_mq_update_tag_set_depth(set); mutex_unlock(&set->tag_list_lock); - - blk_mq_unfreeze_queue(q); } static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, -- cgit v1.2.3 From 274a5843ff2f08a89464589d90c64eb65f2c0847 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Aug 2014 12:44:08 -0600 Subject: blk-mq: don't allow merges if turned off for the queue blk-mq uses BLK_MQ_F_SHOULD_MERGE, as set by the driver at init time, to determine whether it should merge IO or not. However, this could also be disabled by the admin, if merging is switched off through sysfs. So check the general queue state as well before attempting to merge IO. 
Reported-by: Rob Elliott Tested-by: Rob Elliott Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index ac8a0413664e..c9e89a8792e3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1068,13 +1068,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) blk_account_io_start(rq, 1); } +static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) +{ + return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && + !blk_queue_nomerges(hctx->queue); +} + static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq, struct bio *bio) { - struct request_queue *q = hctx->queue; - - if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { + if (!hctx_allow_merges(hctx)) { blk_mq_bio_to_request(rq, bio); spin_lock(&ctx->lock); insert_rq: @@ -1082,6 +1086,8 @@ insert_rq: spin_unlock(&ctx->lock); return false; } else { + struct request_queue *q = hctx->queue; + spin_lock(&ctx->lock); if (!blk_mq_attempt_merge(q, ctx, bio)) { blk_mq_bio_to_request(rq, bio); -- cgit v1.2.3 From a68aafa5b297d99c2d0c38689089a752126e9e79 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Aug 2014 13:19:15 -0600 Subject: blk-mq: correct a few wrong/bad comments Just grammar or spelling errors, nothing major. 
Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index c9e89a8792e3..a0565bb20fd5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1580,7 +1580,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, hctx->tags = set->tags[i]; /* - * Allocate space for all possible cpus to avoid allocation in + * Allocate space for all possible cpus to avoid allocation at * runtime */ hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), @@ -1668,8 +1668,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) { /* - * If not software queues are mapped to this hardware queue, - * disable it and free the request entries + * If no software queues are mapped to this hardware queue, + * disable it and free the request entries. */ if (!hctx->nr_ctx) { struct blk_mq_tag_set *set = q->tag_set; -- cgit v1.2.3 From cddd5d17642cc6881352732693c2ae6930e9ce65 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 16 Aug 2014 08:02:24 -0400 Subject: blk-mq: blk_mq_freeze_queue() should allow nesting While converting to percpu_ref for freezing, add703fda981 ("blk-mq: use percpu_ref for mq usage count") incorrectly made blk_mq_freeze_queue() misbehave when freezing is nested due to percpu_ref_kill() being invoked on an already killed ref. Fix it by making blk_mq_freeze_queue() kill and kick the queue only for the outermost freeze attempt. All the nested ones can simply wait for the ref to reach zero. While at it, remove unnecessary @wake initialization from blk_mq_unfreeze_queue(). 
Signed-off-by: Tejun Heo Reported-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index a0565bb20fd5..7950f8d7c1bb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -112,18 +112,22 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref) */ void blk_mq_freeze_queue(struct request_queue *q) { + bool freeze; + spin_lock_irq(q->queue_lock); - q->mq_freeze_depth++; + freeze = !q->mq_freeze_depth++; spin_unlock_irq(q->queue_lock); - percpu_ref_kill(&q->mq_usage_counter); - blk_mq_run_queues(q, false); + if (freeze) { + percpu_ref_kill(&q->mq_usage_counter); + blk_mq_run_queues(q, false); + } wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); } static void blk_mq_unfreeze_queue(struct request_queue *q) { - bool wake = false; + bool wake; spin_lock_irq(q->queue_lock); wake = !--q->mq_freeze_depth; -- cgit v1.2.3 From 6f4a16266fb3e58cd3e200eab51d2220ef92d604 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Fri, 22 Aug 2014 15:53:39 -0400 Subject: scsi-mq: fix requests that use a separate CDB buffer This patch fixes code such as the following with scsi-mq enabled: rq = blk_get_request(...); blk_rq_set_block_pc(rq); rq->cmd = my_cmd_buffer; /* separate CDB buffer */ blk_execute_rq_nowait(...); Code like this appears in e.g. sg_start_req() in drivers/scsi/sg.c (for large CDBs only). Without this patch, scsi_mq_prep_fn() will set rq->cmd back to rq->__cmd, causing the wrong CDB to be sent to the device. 
Signed-off-by: Tony Battersby Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - block/blk-mq.c | 2 ++ drivers/scsi/scsi_lib.c | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-core.c b/block/blk-core.c index c359d72e9d76..bf930f481d43 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1252,7 +1252,6 @@ void blk_rq_set_block_pc(struct request *rq) rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; memset(rq->__cmd, 0, sizeof(rq->__cmd)); - rq->cmd = rq->__cmd; } EXPORT_SYMBOL(blk_rq_set_block_pc); diff --git a/block/blk-mq.c b/block/blk-mq.c index 7950f8d7c1bb..4aac82615a46 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -176,6 +176,8 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, /* tag was already set */ rq->errors = 0; + rq->cmd = rq->__cmd; + rq->extra_len = 0; rq->sense_len = 0; rq->resid_len = 0; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9c44392b748f..d86808f051e0 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1808,7 +1808,6 @@ static int scsi_mq_prep_fn(struct request *req) cmd->tag = req->tag; - req->cmd = req->__cmd; cmd->cmnd = req->cmd; cmd->prot_op = SCSI_PROT_NORMAL; -- cgit v1.2.3 From 5676e7b6db02b80eafc2e3ad316d5f2fee817ecb Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 2 Sep 2014 11:38:44 -0500 Subject: blk-mq: cleanup after blk_mq_init_rq_map failures In blk-mq.c blk_mq_alloc_tag_set, if: set->tags = kmalloc_node() succeeds, but one of the blk_mq_init_rq_map() calls fails, goto out_unwind; needs to free set->tags so the caller is not obligated to do so. None of the current callers (null_blk, virtio_blk, or the forthcoming scsi-mq) do so. set->tags needs to be set to NULL after doing so, so other tag cleanup logic doesn't try to free a stale pointer later. Also set it to NULL in blk_mq_free_tag_set. 
Tested with error injection on the forthcoming scsi-mq + hpsa combination. Signed-off-by: Robert Elliott Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4aac82615a46..f9b85e83d9ba 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1982,6 +1982,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) out_unwind: while (--i >= 0) blk_mq_free_rq_map(set, set->tags[i], i); + kfree(set->tags); + set->tags = NULL; out: return -ENOMEM; } @@ -1997,6 +1999,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) } kfree(set->tags); + set->tags = NULL; } EXPORT_SYMBOL(blk_mq_free_tag_set); -- cgit v1.2.3 From a516440542afcb9647f88d12c35640baf02d07ea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Sep 2014 09:02:03 -0600 Subject: blk-mq: scale depth and rq map appropriate if low on memory If we are running in a kdump environment, resources are scarce. For some SCSI setups with a huge set of shared tags, we run out of memory allocating what the driver is asking for. So implement scale-back logic to reduce the tag depth for those cases, allowing the driver to successfully load. We should extend this to detect low memory situations, and implement a sane fallback for those (1 queue, 64 tags, or something like that). 
Tested-by: Robert Elliott Signed-off-by: Jens Axboe --- block/blk-mq.c | 88 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 19 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index f9b85e83d9ba..383ea0cb1f0a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1321,6 +1321,7 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, continue; set->ops->exit_request(set->driver_data, tags->rqs[i], hctx_idx, i); + tags->rqs[i] = NULL; } } @@ -1354,8 +1355,9 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&tags->page_list); - tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *), - GFP_KERNEL, set->numa_node); + tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), + GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, + set->numa_node); if (!tags->rqs) { blk_mq_free_tags(tags); return NULL; @@ -1379,8 +1381,9 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, this_order--; do { - page = alloc_pages_node(set->numa_node, GFP_KERNEL, - this_order); + page = alloc_pages_node(set->numa_node, + GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, + this_order); if (page) break; if (!this_order--) @@ -1404,8 +1407,10 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, if (set->ops->init_request) { if (set->ops->init_request(set->driver_data, tags->rqs[i], hctx_idx, i, - set->numa_node)) + set->numa_node)) { + tags->rqs[i] = NULL; goto fail; + } } p += rq_size; @@ -1416,7 +1421,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, return tags; fail: - pr_warn("%s: failed to allocate requests\n", __func__); blk_mq_free_rq_map(set, tags, hctx_idx); return NULL; } @@ -1936,6 +1940,61 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, return NOTIFY_OK; } +static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) +{ + int i; + + for (i = 0; i < 
set->nr_hw_queues; i++) { + set->tags[i] = blk_mq_init_rq_map(set, i); + if (!set->tags[i]) + goto out_unwind; + } + + return 0; + +out_unwind: + while (--i >= 0) + blk_mq_free_rq_map(set, set->tags[i], i); + + set->tags = NULL; + return -ENOMEM; +} + +/* + * Allocate the request maps associated with this tag_set. Note that this + * may reduce the depth asked for, if memory is tight. set->queue_depth + * will be updated to reflect the allocated depth. + */ +static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) +{ + unsigned int depth; + int err; + + depth = set->queue_depth; + do { + err = __blk_mq_alloc_rq_maps(set); + if (!err) + break; + + set->queue_depth >>= 1; + if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { + err = -ENOMEM; + break; + } + } while (set->queue_depth); + + if (!set->queue_depth || err) { + pr_err("blk-mq: failed to allocate request map\n"); + return -ENOMEM; + } + + if (depth != set->queue_depth) + pr_info("blk-mq: reduced tag depth (%u -> %u)\n", + depth, set->queue_depth); + + return 0; +} + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. 
May adjust the @@ -1944,8 +2003,6 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { - int i; - if (!set->nr_hw_queues) return -EINVAL; if (!set->queue_depth) @@ -1966,25 +2023,18 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) - goto out; + return -ENOMEM; - for (i = 0; i < set->nr_hw_queues; i++) { - set->tags[i] = blk_mq_init_rq_map(set, i); - if (!set->tags[i]) - goto out_unwind; - } + if (blk_mq_alloc_rq_maps(set)) + goto enomem; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; - -out_unwind: - while (--i >= 0) - blk_mq_free_rq_map(set, set->tags[i], i); +enomem: kfree(set->tags); set->tags = NULL; -out: return -ENOMEM; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); -- cgit v1.2.3