From dd840087086f3b93ac20f7472b4fca59aff7b79f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Aug 2014 23:16:32 +0800 Subject: blk-mq: fix WARNING "percpu_ref_kill() called more than once!" Before doing queue release, the queue has been freezed already by blk_cleanup_queue(), so needn't to freeze queue for deleting tag set. This patch fixes the WARNING of "percpu_ref_kill() called more than once!" which is triggered during unloading block driver. Cc: Tejun Heo Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5189cb1e478a..ac8a0413664e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1713,14 +1713,10 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; - blk_mq_freeze_queue(q); - mutex_lock(&set->tag_list_lock); list_del_init(&q->tag_set_list); blk_mq_update_tag_set_depth(set); mutex_unlock(&set->tag_list_lock); - - blk_mq_unfreeze_queue(q); } static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, -- cgit v1.2.3 From 274a5843ff2f08a89464589d90c64eb65f2c0847 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Aug 2014 12:44:08 -0600 Subject: blk-mq: don't allow merges if turned off for the queue blk-mq uses BLK_MQ_F_SHOULD_MERGE, as set by the driver at init time, to determine whether it should merge IO or not. However, this could also be disabled by the admin, if merging is switched off through sysfs. So check the general queue state as well before attempting to merge IO. Reported-by: Rob Elliott Tested-by: Rob Elliott Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index ac8a0413664e..c9e89a8792e3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1068,13 +1068,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) blk_account_io_start(rq, 1); } +static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) +{ + return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && + !blk_queue_nomerges(hctx->queue); +} + static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq, struct bio *bio) { - struct request_queue *q = hctx->queue; - - if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { + if (!hctx_allow_merges(hctx)) { blk_mq_bio_to_request(rq, bio); spin_lock(&ctx->lock); insert_rq: @@ -1082,6 +1086,8 @@ insert_rq: spin_unlock(&ctx->lock); return false; } else { + struct request_queue *q = hctx->queue; + spin_lock(&ctx->lock); if (!blk_mq_attempt_merge(q, ctx, bio)) { blk_mq_bio_to_request(rq, bio); -- cgit v1.2.3 From a68aafa5b297d99c2d0c38689089a752126e9e79 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Aug 2014 13:19:15 -0600 Subject: blk-mq: correct a few wrong/bad comments Just grammar or spelling errors, nothing major. 
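For context on the merge patch above ("blk-mq: don't allow merges if turned off for the queue"): BLK_MQ_F_SHOULD_MERGE is the per-tag-set opt-in a driver sets at init time, while the admin-side switch is the queue's "nomerges" sysfs attribute, which (when set to 2) raises the QUEUE_FLAG_NOMERGES flag that blk_queue_nomerges() tests. A minimal sketch of the driver side using the 3.17-era structures; my_queue_rq and the numbers are hypothetical, only the field and flag names are real:

static struct blk_mq_ops my_mq_ops = {
	.queue_rq	= my_queue_rq,		/* hypothetical dispatch handler */
	.map_queue	= blk_mq_map_queue,	/* stock ctx -> hctx mapping helper */
};

static struct blk_mq_tag_set my_tag_set = {
	.ops		= &my_mq_ops,
	.nr_hw_queues	= 1,
	.queue_depth	= 64,
	.numa_node	= NUMA_NO_NODE,
	.flags		= BLK_MQ_F_SHOULD_MERGE,	/* driver opts in to bio merging */
};

With the fix above, hctx_allow_merges() honors both knobs: the flag must be set and the queue must not have merging disabled through sysfs.
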
Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index c9e89a8792e3..a0565bb20fd5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1580,7 +1580,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, hctx->tags = set->tags[i]; /* - * Allocate space for all possible cpus to avoid allocation in + * Allocate space for all possible cpus to avoid allocation at * runtime */ hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), @@ -1668,8 +1668,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) { /* - * If not software queues are mapped to this hardware queue, - * disable it and free the request entries + * If no software queues are mapped to this hardware queue, + * disable it and free the request entries. */ if (!hctx->nr_ctx) { struct blk_mq_tag_set *set = q->tag_set; -- cgit v1.2.3 From cddd5d17642cc6881352732693c2ae6930e9ce65 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 16 Aug 2014 08:02:24 -0400 Subject: blk-mq: blk_mq_freeze_queue() should allow nesting While converting to percpu_ref for freezing, add703fda981 ("blk-mq: use percpu_ref for mq usage count") incorrectly made blk_mq_freeze_queue() misbehave when freezing is nested due to percpu_ref_kill() being invoked on an already killed ref. Fix it by making blk_mq_freeze_queue() kill and kick the queue only for the outermost freeze attempt. All the nested ones can simply wait for the ref to reach zero. While at it, remove unnecessary @wake initialization from blk_mq_unfreeze_queue(). Signed-off-by: Tejun Heo Reported-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index a0565bb20fd5..7950f8d7c1bb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -112,18 +112,22 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref) */ void blk_mq_freeze_queue(struct request_queue *q) { + bool freeze; + spin_lock_irq(q->queue_lock); - q->mq_freeze_depth++; + freeze = !q->mq_freeze_depth++; spin_unlock_irq(q->queue_lock); - percpu_ref_kill(&q->mq_usage_counter); - blk_mq_run_queues(q, false); + if (freeze) { + percpu_ref_kill(&q->mq_usage_counter); + blk_mq_run_queues(q, false); + } wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); } static void blk_mq_unfreeze_queue(struct request_queue *q) { - bool wake = false; + bool wake; spin_lock_irq(q->queue_lock); wake = !--q->mq_freeze_depth; -- cgit v1.2.3 From 6f4a16266fb3e58cd3e200eab51d2220ef92d604 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Fri, 22 Aug 2014 15:53:39 -0400 Subject: scsi-mq: fix requests that use a separate CDB buffer This patch fixes code such as the following with scsi-mq enabled: rq = blk_get_request(...); blk_rq_set_block_pc(rq); rq->cmd = my_cmd_buffer; /* separate CDB buffer */ blk_execute_rq_nowait(...); Code like this appears in e.g. sg_start_req() in drivers/scsi/sg.c (for large CDBs only). Without this patch, scsi_mq_prep_fn() will set rq->cmd back to rq->__cmd, causing the wrong CDB to be sent to the device. 
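A slightly fuller version of the calling pattern this fix addresses, loosely modeled on what sg does for CDBs longer than BLK_MAX_CDB. The block-layer calls are the 3.17-era ones used in the patches; my_send_long_cdb, the disk argument, and the 32-byte CDB contents are hypothetical, and the IS_ERR() return convention of blk_get_request() is an assumption for this kernel version:

static int my_send_long_cdb(struct request_queue *q, struct gendisk *disk)
{
	static unsigned char my_cdb[32];	/* longer than rq->__cmd can hold */
	struct request *rq;
	int ret;

	rq = blk_get_request(q, WRITE, GFP_KERNEL);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	blk_rq_set_block_pc(rq);
	rq->cmd = my_cdb;			/* separate CDB buffer */
	rq->cmd_len = sizeof(my_cdb);
	rq->timeout = 60 * HZ;

	/* Without the fix above, scsi_mq_prep_fn() would have pointed
	 * rq->cmd back at rq->__cmd and sent the wrong CDB. */
	ret = blk_execute_rq(q, disk, rq, 0);	/* synchronous sibling of _nowait */
	blk_put_request(rq);
	return ret;
}
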
Signed-off-by: Tony Battersby Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - block/blk-mq.c | 2 ++ drivers/scsi/scsi_lib.c | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-core.c b/block/blk-core.c index c359d72e9d76..bf930f481d43 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1252,7 +1252,6 @@ void blk_rq_set_block_pc(struct request *rq) rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; memset(rq->__cmd, 0, sizeof(rq->__cmd)); - rq->cmd = rq->__cmd; } EXPORT_SYMBOL(blk_rq_set_block_pc); diff --git a/block/blk-mq.c b/block/blk-mq.c index 7950f8d7c1bb..4aac82615a46 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -176,6 +176,8 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, /* tag was already set */ rq->errors = 0; + rq->cmd = rq->__cmd; + rq->extra_len = 0; rq->sense_len = 0; rq->resid_len = 0; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9c44392b748f..d86808f051e0 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1808,7 +1808,6 @@ static int scsi_mq_prep_fn(struct request *req) cmd->tag = req->tag; - req->cmd = req->__cmd; cmd->cmnd = req->cmd; cmd->prot_op = SCSI_PROT_NORMAL; -- cgit v1.2.3 From 5676e7b6db02b80eafc2e3ad316d5f2fee817ecb Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 2 Sep 2014 11:38:44 -0500 Subject: blk-mq: cleanup after blk_mq_init_rq_map failures In blk-mq.c blk_mq_alloc_tag_set, if: set->tags = kmalloc_node() succeeds, but one of the blk_mq_init_rq_map() calls fails, goto out_unwind; needs to free set->tags so the caller is not obligated to do so. None of the current callers (null_blk, virtio_blk, virtio_blk, or the forthcoming scsi-mq) do so. set->tags needs to be set to NULL after doing so, so other tag cleanup logic doesn't try to free a stale pointer later. Also set it to NULL in blk_mq_free_tag_set. Tested with error injection on the forthcoming scsi-mq + hpsa combination. Signed-off-by: Robert Elliott Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4aac82615a46..f9b85e83d9ba 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1982,6 +1982,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) out_unwind: while (--i >= 0) blk_mq_free_rq_map(set, set->tags[i], i); + kfree(set->tags); + set->tags = NULL; out: return -ENOMEM; } @@ -1997,6 +1999,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) } kfree(set->tags); + set->tags = NULL; } EXPORT_SYMBOL(blk_mq_free_tag_set); -- cgit v1.2.3 From a516440542afcb9647f88d12c35640baf02d07ea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Sep 2014 09:02:03 -0600 Subject: blk-mq: scale depth and rq map appropriate if low on memory If we are running in a kdump environment, resources are scarce. For some SCSI setups with a huge set of shared tags, we run out of memory allocating what the drivers is asking for. So implement a scale back logic to reduce the tag depth for those cases, allowing the driver to successfully load. We should extend this to detect low memory situations, and implement a sane fallback for those (1 queue, 64 tags, or something like that). 
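Taken together with the tag-map cleanup fix above, the caller-side contract looks like the following hedged sketch (my_dev, my_probe and my_remove are hypothetical; the block-layer calls are the 3.17-era API): on blk_mq_alloc_tag_set() failure the driver frees nothing extra, and on teardown the queue is cleaned up before the tag set is released.

struct my_dev {				/* hypothetical driver state */
	struct blk_mq_tag_set	tag_set;
	struct request_queue	*queue;
};

static int my_probe(struct my_dev *dev)
{
	int ret;

	/* dev->tag_set is assumed to have been filled in (ops, queue_depth,
	 * numa_node, ...) as in the earlier sketch */
	ret = blk_mq_alloc_tag_set(&dev->tag_set);
	if (ret)
		return ret;	/* set->tags already freed and NULLed by the core */

	dev->queue = blk_mq_init_queue(&dev->tag_set);
	if (IS_ERR(dev->queue)) {
		blk_mq_free_tag_set(&dev->tag_set);
		return PTR_ERR(dev->queue);
	}
	return 0;
}

static void my_remove(struct my_dev *dev)
{
	/* blk_cleanup_queue() freezes and drains the queue, which is why
	 * the first patch in this series could drop the extra freeze from
	 * blk_mq_del_queue_tag_set(). */
	blk_cleanup_queue(dev->queue);
	blk_mq_free_tag_set(&dev->tag_set);
}
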
Tested-by: Robert Elliott Signed-off-by: Jens Axboe --- block/blk-mq.c | 88 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 19 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index f9b85e83d9ba..383ea0cb1f0a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1321,6 +1321,7 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, continue; set->ops->exit_request(set->driver_data, tags->rqs[i], hctx_idx, i); + tags->rqs[i] = NULL; } } @@ -1354,8 +1355,9 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&tags->page_list); - tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *), - GFP_KERNEL, set->numa_node); + tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), + GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, + set->numa_node); if (!tags->rqs) { blk_mq_free_tags(tags); return NULL; @@ -1379,8 +1381,9 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, this_order--; do { - page = alloc_pages_node(set->numa_node, GFP_KERNEL, - this_order); + page = alloc_pages_node(set->numa_node, + GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, + this_order); if (page) break; if (!this_order--) @@ -1404,8 +1407,10 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, if (set->ops->init_request) { if (set->ops->init_request(set->driver_data, tags->rqs[i], hctx_idx, i, - set->numa_node)) + set->numa_node)) { + tags->rqs[i] = NULL; goto fail; + } } p += rq_size; @@ -1416,7 +1421,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, return tags; fail: - pr_warn("%s: failed to allocate requests\n", __func__); blk_mq_free_rq_map(set, tags, hctx_idx); return NULL; } @@ -1936,6 +1940,61 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, return NOTIFY_OK; } +static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) +{ + int i; + + for (i = 0; i < set->nr_hw_queues; i++) { + set->tags[i] = blk_mq_init_rq_map(set, i); + if (!set->tags[i]) + goto out_unwind; + } + + return 0; + +out_unwind: + while (--i >= 0) + blk_mq_free_rq_map(set, set->tags[i], i); + + set->tags = NULL; + return -ENOMEM; +} + +/* + * Allocate the request maps associated with this tag_set. Note that this + * may reduce the depth asked for, if memory is tight. set->queue_depth + * will be updated to reflect the allocated depth. + */ +static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) +{ + unsigned int depth; + int err; + + depth = set->queue_depth; + do { + err = __blk_mq_alloc_rq_maps(set); + if (!err) + break; + + set->queue_depth >>= 1; + if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { + err = -ENOMEM; + break; + } + } while (set->queue_depth); + + if (!set->queue_depth || err) { + pr_err("blk-mq: failed to allocate request map\n"); + return -ENOMEM; + } + + if (depth != set->queue_depth) + pr_info("blk-mq: reduced tag depth (%u -> %u)\n", + depth, set->queue_depth); + + return 0; +} + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. 
May adjust the @@ -1944,8 +2003,6 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { - int i; - if (!set->nr_hw_queues) return -EINVAL; if (!set->queue_depth) @@ -1966,25 +2023,18 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) - goto out; + return -ENOMEM; - for (i = 0; i < set->nr_hw_queues; i++) { - set->tags[i] = blk_mq_init_rq_map(set, i); - if (!set->tags[i]) - goto out_unwind; - } + if (blk_mq_alloc_rq_maps(set)) + goto enomem; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; - -out_unwind: - while (--i >= 0) - blk_mq_free_rq_map(set, set->tags[i], i); +enomem: kfree(set->tags); set->tags = NULL; -out: return -ENOMEM; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); -- cgit v1.2.3 From 538b75341835e3c2041ff066408de10d24fdc830 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Sep 2014 10:37:37 -0600 Subject: blk-mq: request deadline must be visible before marking rq as started When we start the request, we set the deadline and flip the bits marking the request as started and non-complete. However, it's important that the deadline store is ordered before flipping the bits, otherwise we could have a small window where the request is marked started but with an invalid deadline. This can confuse the timeout handling. Suggested-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 383ea0cb1f0a..a13c40ca8230 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -392,6 +392,12 @@ static void blk_mq_start_request(struct request *rq, bool last) blk_add_timer(rq); + /* + * Ensure that ->deadline is visible before set the started + * flag and clear the completed flag. + */ + smp_mb__before_atomic(); + /* * Mark us as started and clear complete. Complete might have been * set if requeue raced with timeout, which then marked it as -- cgit v1.2.3 From 683d0e126232d898a481daa3a4ca032c2b1a9660 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Sep 2014 11:04:31 +0200 Subject: blk-mq: Avoid race condition with uninitialized requests This patch should fix the bug reported in https://lkml.org/lkml/2014/9/11/249. We have to initialize at least the atomic_flags and the cmd_flags when allocating storage for the requests. Otherwise blk_mq_timeout_check() might dereference uninitialized pointers when racing with the creation of a request. Also move the reset of cmd_flags for the initializing code to the point where a request is freed. So we will never end up with pending flush request indicators that might trigger dereferences of invalid pointers in blk_mq_timeout_check(). 
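To make the ordering argument of the deadline-visibility patch above concrete, here is a simplified sketch of the two sides that race. REQ_ATOM_STARTED, REQ_ATOM_COMPLETE, rq->deadline and rq->atomic_flags are the real fields; the function bodies are reduced to the relevant steps and are not the verbatim kernel code:

/* writer side, roughly blk_mq_start_request() */
static void my_start_request(struct request *rq)
{
	rq->deadline = jiffies + rq->q->rq_timeout;	/* done via blk_add_timer() */

	/* publish ->deadline before flipping the atomic bits below */
	smp_mb__before_atomic();

	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
	clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
}

/* reader side, roughly the per-request timeout check */
static void my_check_expired(struct request *rq, unsigned long *next)
{
	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
		return;

	/* ->deadline is only trusted once STARTED has been observed */
	if (time_after_eq(jiffies, rq->deadline)) {
		/* expired: hand off to the driver's timeout handler */
		return;
	}
	if (!*next || time_before(rq->deadline, *next))
		*next = rq->deadline;
}

Without the barrier, the reader could observe STARTED while still seeing a stale ->deadline and flag a just-started request as timed out.
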
Cc: stable@vger.kernel.org Signed-off-by: David Hildenbrand Reported-by: Paulo De Rezende Pinatti Tested-by: Paulo De Rezende Pinatti Acked-by: Christian Borntraeger Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index a13c40ca8230..1583ed28ea03 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -203,7 +203,6 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw) if (tag != BLK_MQ_TAG_FAIL) { rq = data->hctx->tags->rqs[tag]; - rq->cmd_flags = 0; if (blk_mq_tag_busy(data->hctx)) { rq->cmd_flags = REQ_MQ_INFLIGHT; atomic_inc(&data->hctx->nr_active); @@ -258,6 +257,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, if (rq->cmd_flags & REQ_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); + rq->cmd_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); blk_mq_put_tag(hctx, tag, &ctx->last_tag); @@ -1410,6 +1410,8 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, left -= to_do * rq_size; for (j = 0; j < to_do; j++) { tags->rqs[i] = p; + tags->rqs[i]->atomic_flags = 0; + tags->rqs[i]->cmd_flags = 0; if (set->ops->init_request) { if (set->ops->init_request(set->driver_data, tags->rqs[i], hctx_idx, i, -- cgit v1.2.3 From a57a178a490345c7236b0077b3de005754389ed6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Sep 2014 14:44:07 -0700 Subject: blk-mq: avoid infinite recursion with the FUA flag We should not insert requests into the flush state machine from blk_mq_insert_request. All incoming flush requests come through blk_{m,s}q_make_request and are handled there, while blk_execute_rq_nowait should only be called for BLOCK_PC requests. All other callers deal with requests that already went through the flush statemchine and shouldn't be reinserted into it. Reported-by: Robert Elliott Debugged-by: Ming Lei Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-exec.c | 1 + block/blk-mq.c | 11 +++-------- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-exec.c b/block/blk-exec.c index f4d27b12c90b..9924725fa50d 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -56,6 +56,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, bool is_pm_resume; WARN_ON(irqs_disabled()); + WARN_ON(rq->cmd_type == REQ_TYPE_FS); rq->rq_disk = bd_disk; rq->end_io = done; diff --git a/block/blk-mq.c b/block/blk-mq.c index 1583ed28ea03..a7d70a1bbf36 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -963,14 +963,9 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, hctx = q->mq_ops->map_queue(q, ctx->cpu); - if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA) && - !(rq->cmd_flags & (REQ_FLUSH_SEQ))) { - blk_insert_flush(rq); - } else { - spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq, at_head); - spin_unlock(&ctx->lock); - } + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq, at_head); + spin_unlock(&ctx->lock); if (run_queue) blk_mq_run_hw_queue(hctx, async); -- cgit v1.2.3 From 6b55e1f2d0a5e462e52678278ab749468f1db81c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 19 Sep 2014 08:04:53 -0600 Subject: blk-mq: fix potential oops on out-of-memory in __blk_mq_alloc_rq_maps() __blk_mq_alloc_rq_maps() can be invoked multiple times, if we scale back the queue depth if we are low on memory. 
So don't clear set->tags when we fail, this is handled directly in the parent function, blk_mq_alloc_tag_set(). Reported-by: Robert Elliott Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index a7d70a1bbf36..e83d306907da 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1959,7 +1959,6 @@ out_unwind: while (--i >= 0) blk_mq_free_rq_map(set, set->tags[i], i); - set->tags = NULL; return -ENOMEM; } -- cgit v1.2.3 From 8b95741569eabc5eb17da71d1d3668cdb0bef86c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 19 Sep 2014 13:10:29 -0600 Subject: blk-mq: use blk_mq_start_hw_queues() when running requeue work When requests are retried due to hw or sw resource shortages, we often stop the associated hardware queue. So ensure that we restart the queues when running the requeue work, otherwise the queue run will be a no-op. Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index e83d306907da..c88e6089746d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -479,7 +479,11 @@ static void blk_mq_requeue_work(struct work_struct *work) blk_mq_insert_request(rq, false, false, false); } - blk_mq_run_queues(q, false); + /* + * Use the start variant of queue running here, so that running + * the requeue work will kick stopped queues. + */ + blk_mq_start_hw_queues(q); } void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) -- cgit v1.2.3 From 0a30288da1aec914e158c2d7a3482a85f632750f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2014 15:24:32 -0400 Subject: blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe blk-mq uses percpu_ref for its usage counter which tracks the number of in-flight commands and used to synchronously drain the queue on freeze. percpu_ref shutdown takes measureable wallclock time as it involves a sched RCU grace period. This means that draining a blk-mq takes measureable wallclock time. One would think that this shouldn't matter as queue shutdown should be a rare event which takes place asynchronously w.r.t. userland. Unfortunately, SCSI probing involves synchronously setting up and then tearing down a lot of request_queues back-to-back for non-existent LUNs. This means that SCSI probing may take more than ten seconds when scsi-mq is used. This will be properly fixed by implementing a mechanism to keep q->mq_usage_counter in atomic mode till genhd registration; however, that involves rather big updates to percpu_ref which is difficult to apply late in the devel cycle (v3.17-rc6 at the moment). As a stop-gap measure till the proper fix can be implemented in the next cycle, this patch introduces __percpu_ref_kill_expedited() and makes blk_mq_freeze_queue() use it. This is heavy-handed but should work for testing the experimental SCSI blk-mq implementation. 
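The driver pattern that motivates the requeue-work change above, as a hedged sketch (my_retry_later is hypothetical, loosely modeled on how scsi-mq parks a command when the device is temporarily busy): on a resource shortage the hardware queues are stopped and the request is parked on the requeue list, so the requeue work must restart stopped queues or nothing would ever dispatch the retried request.

static void my_retry_later(struct request *rq)
{
	struct request_queue *q = rq->q;

	blk_mq_stop_hw_queues(q);	/* hold off further dispatch for now */
	blk_mq_requeue_request(rq);	/* park rq on q->requeue_list */
	blk_mq_kick_requeue_list(q);	/* schedule blk_mq_requeue_work() */

	/* With the fix above, blk_mq_requeue_work() calls
	 * blk_mq_start_hw_queues() before re-inserting rq, so the queue
	 * stopped here is restarted when the retry is queued. */
}
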
Signed-off-by: Tejun Heo Reported-by: Christoph Hellwig Link: http://lkml.kernel.org/g/20140919113815.GA10791@lst.de Fixes: add703fda981 ("blk-mq: use percpu_ref for mq usage count") Cc: Kent Overstreet Cc: Jens Axboe Tested-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 11 ++++++++++- include/linux/percpu-refcount.h | 1 + lib/percpu-refcount.c | 16 ++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index c88e6089746d..df8e1e09dd17 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -119,7 +119,16 @@ void blk_mq_freeze_queue(struct request_queue *q) spin_unlock_irq(q->queue_lock); if (freeze) { - percpu_ref_kill(&q->mq_usage_counter); + /* + * XXX: Temporary kludge to work around SCSI blk-mq stall. + * SCSI synchronously creates and destroys many queues + * back-to-back during probe leading to lengthy stalls. + * This will be fixed by keeping ->mq_usage_counter in + * atomic mode until genhd registration, but, for now, + * let's work around using expedited synchronization. + */ + __percpu_ref_kill_expedited(&q->mq_usage_counter); + blk_mq_run_queues(q, false); } wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 3dfbf237cd8f..ef5894ca8e50 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -71,6 +71,7 @@ void percpu_ref_reinit(struct percpu_ref *ref); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); +void __percpu_ref_kill_expedited(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index fe5a3342e960..a89cf09a8268 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -184,3 +184,19 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); + +/* + * XXX: Temporary kludge to work around SCSI blk-mq stall. Used only by + * block/blk-mq.c::blk_mq_freeze_queue(). Will be removed during v3.18 + * devel cycle. Do not use anywhere else. + */ +void __percpu_ref_kill_expedited(struct percpu_ref *ref) +{ + WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, + "percpu_ref_kill() called more than once on %pf!", + ref->release); + + ref->pcpu_count_ptr |= PCPU_REF_DEAD; + synchronize_sched_expedited(); + percpu_ref_kill_rcu(&ref->rcu); +} -- cgit v1.2.3
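
For readers less familiar with the counter being killed here, this is roughly how blk-mq uses percpu_ref for q->mq_usage_counter in this era, condensed into a hedged sketch: the field and API names match the kernel's, but the bodies are simplified and the two-argument (no-gfp) percpu_ref_init() prototype is assumed to be the 3.17-era one.

/* release callback, cf. blk_mq_usage_counter_release(): runs once the
 * last reference is dropped after a kill */
static void my_usage_release(struct percpu_ref *ref)
{
	struct request_queue *q =
		container_of(ref, struct request_queue, mq_usage_counter);

	wake_up_all(&q->mq_freeze_wq);
}

/* queue setup */
static int my_init_usage_counter(struct request_queue *q)
{
	return percpu_ref_init(&q->mq_usage_counter, my_usage_release);
}

/* submission side, cf. blk_mq_queue_enter(): fails once the ref is killed */
static int my_queue_enter(struct request_queue *q)
{
	if (percpu_ref_tryget_live(&q->mq_usage_counter))
		return 0;
	return -EBUSY;		/* frozen or dying: wait for unfreeze or bail */
}

/* completion side */
static void my_queue_exit(struct request_queue *q)
{
	percpu_ref_put(&q->mq_usage_counter);
}

/* freeze side, cf. blk_mq_freeze_queue(): kill the ref (expedited in the
 * patch above to shorten the sched-RCU grace period), then wait for all
 * in-flight users to drop their references */
static void my_freeze(struct request_queue *q)
{
	__percpu_ref_kill_expedited(&q->mq_usage_counter);
	wait_event(q->mq_freeze_wq,
		   percpu_ref_is_zero(&q->mq_usage_counter));
}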