From f75782e4e067fd68249635699cb20dfe0489d743 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 14 Sep 2015 18:16:02 +0100 Subject: block: kmemleak: Track the page allocations for struct request The pages allocated for struct request contain pointers to other slab allocations (via ops->init_request). Since kmemleak does not track/scan page allocations, the slab objects will be reported as leaks (false positives). This patch adds kmemleak callbacks to allow tracking of such pages. Signed-off-by: Catalin Marinas Reported-by: Bart Van Assche Tested-by: Bart Van Assche Cc: Christoph Hellwig Cc: Jens Axboe Signed-off-by: Jens Axboe --- block/blk-mq.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index f2d67b4047a0..2077f0d2f95f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1438,6 +1439,11 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); + /* + * Remove kmemleak object previously allocated in + * blk_mq_init_rq_map(). + */ + kmemleak_free(page_address(page)); __free_pages(page, page->private); } @@ -1510,6 +1516,11 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, list_add_tail(&page->lru, &tags->page_list); p = page_address(page); + /* + * Allow kmemleak to scan these pages as they contain pointers + * to additional allocations like via ops->init_request(). + */ + kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL); entries_per_page = order_to_size(this_order) / rq_size; to_do = min(entries_per_page, set->queue_depth - i); left -= to_do * rq_size; -- cgit v1.2.3 From 8ee1b7b9d8761a7b1a0f37a6e596b5315d27d2f2 Mon Sep 17 00:00:00 2001 From: Kosuke Tatsukawa Date: Fri, 9 Oct 2015 00:35:38 +0000 Subject: blk-mq: fix waitqueue_active without memory barrier in block/blk-mq-tag.c blk_mq_tag_update_depth() seems to be missing a memory barrier which might cause the waker to not notice the waiter and fail to send a wake_up as in the following figure. blk_mq_tag_update_depth bt_get ------------------------------------------------------------------------ if (waitqueue_active(&bs->wait)) /* The CPU might reorder the test for the waitqueue up here, before prior writes complete */ prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); tag = __bt_get(hctx, bt, last_tag, tags); /* Value set in bt_update_count not visible yet */ bt_update_count(&tags->bitmap_tags, tdepth); /* blk_mq_tag_wakeup_all(tags, false); */ bt = &tags->bitmap_tags; wake_index = atomic_read(&bt->wake_index); ... io_schedule(); ------------------------------------------------------------------------ This patch adds the missing memory barrier. I found this issue when I was looking through the linux source code for places calling waitqueue_active() before wake_up*(), but without preceding memory barriers, after sending a patch to fix a similar issue in drivers/tty/n_tty.c (Details about the original issue can be found here: https://lkml.org/lkml/2015/9/28/849). Signed-off-by: Kosuke Tatsukawa Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index ed96474d75cb..7a6b6e27fc26 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -75,6 +75,10 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) struct blk_mq_bitmap_tags *bt; int i, wake_index; + /* + * Make sure all changes prior to this are visible from other CPUs. + */ + smp_mb(); bt = &tags->bitmap_tags; wake_index = atomic_read(&bt->wake_index); for (i = 0; i < BT_WAIT_QUEUES; i++) { -- cgit v1.2.3 From 3380f4589f6d9725e275525fd0580c8ee2b5cbbc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Oct 2015 18:57:06 +0200 Subject: blk-mq: remove unused blk_mq_clone_flush_request prototype Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.h b/block/blk-mq.h index f4fea7964910..b44dce165761 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -29,8 +29,6 @@ void __blk_mq_complete_request(struct request *rq); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); -void blk_mq_clone_flush_request(struct request *flush_rq, - struct request *orig_rq); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); -- cgit v1.2.3 From 0809e3ac62319dc7534b64f95ac37e230d740e8a Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Tue, 20 Oct 2015 23:13:51 +0800 Subject: block: fix plug list flushing for nomerge queues Request queues with merging disabled will not flush the plug list after BLK_MAX_REQUEST_COUNT requests have been queued, since the code relies on blk_attempt_plug_merge to compute the request_count. Fix this by computing the number of queued requests even for nomerge queues. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-core.c | 32 +++++++++++++++++++++++++++++--- block/blk-mq.c | 9 ++++++--- block/blk.h | 1 + 3 files changed, 36 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 2eb722d48773..f0ae087ead06 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1594,6 +1594,30 @@ out: return ret; } +unsigned int blk_plug_queued_count(struct request_queue *q) +{ + struct blk_plug *plug; + struct request *rq; + struct list_head *plug_list; + unsigned int ret = 0; + + plug = current->plug; + if (!plug) + goto out; + + if (q->mq_ops) + plug_list = &plug->mq_list; + else + plug_list = &plug->list; + + list_for_each_entry(rq, plug_list, queuelist) { + if (rq->q == q) + ret++; + } +out: + return ret; +} + void init_request_from_bio(struct request *req, struct bio *bio) { req->cmd_type = REQ_TYPE_FS; @@ -1641,9 +1665,11 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (!blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count, NULL)) - return; + if (!blk_queue_nomerges(q)) { + if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) + return; + } else + request_count = blk_plug_queued_count(q); spin_lock_irq(q->queue_lock); diff --git a/block/blk-mq.c b/block/blk-mq.c index d921cd5177f5..9683a561efcd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1268,9 +1268,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_queue_split(q, &bio, q->bio_split); - if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) - return; + if (!is_flush_fua && !blk_queue_nomerges(q)) { + if (blk_attempt_plug_merge(q, bio, &request_count, + &same_queue_rq)) + return; + } else + request_count = blk_plug_queued_count(q); rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) diff --git a/block/blk.h b/block/blk.h index 98614ad37c81..aa27d0292af1 100644 --- a/block/blk.h +++ b/block/blk.h @@ -86,6 +86,7 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int *request_count, struct request **same_queue_rq); +unsigned int blk_plug_queued_count(struct request_queue *q); void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); -- cgit v1.2.3 From bdced438acd83ad83a6c6fc7f50099b820245ddb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 20 Oct 2015 23:13:52 +0800 Subject: block: setup bi_phys_segments after splitting The number of bio->bi_phys_segments is always obtained during bio splitting, so it is natural to setup it just after bio splitting, then we can avoid to compute nr_segment again during merge. Reviewed-by: Jeff Moyer Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index c4e9c37f3e38..22293fdf6514 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -11,13 +11,16 @@ static struct bio *blk_bio_discard_split(struct request_queue *q, struct bio *bio, - struct bio_set *bs) + struct bio_set *bs, + unsigned *nsegs) { unsigned int max_discard_sectors, granularity; int alignment; sector_t tmp; unsigned split_sectors; + *nsegs = 1; + /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(q->limits.discard_granularity >> 9, 1U); @@ -51,8 +54,11 @@ static struct bio *blk_bio_discard_split(struct request_queue *q, static struct bio *blk_bio_write_same_split(struct request_queue *q, struct bio *bio, - struct bio_set *bs) + struct bio_set *bs, + unsigned *nsegs) { + *nsegs = 1; + if (!q->limits.max_write_same_sectors) return NULL; @@ -64,7 +70,8 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q, static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio *bio, - struct bio_set *bs) + struct bio_set *bs, + unsigned *segs) { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; @@ -106,22 +113,30 @@ new_segment: sectors += bv.bv_len >> 9; } + *segs = nsegs; return NULL; split: + *segs = nsegs; return bio_split(bio, sectors, GFP_NOIO, bs); } void blk_queue_split(struct request_queue *q, struct bio **bio, struct bio_set *bs) { - struct bio *split; + struct bio *split, *res; + unsigned nsegs; if ((*bio)->bi_rw & REQ_DISCARD) - split = blk_bio_discard_split(q, *bio, bs); + split = blk_bio_discard_split(q, *bio, bs, &nsegs); else if ((*bio)->bi_rw & REQ_WRITE_SAME) - split = blk_bio_write_same_split(q, *bio, bs); + split = blk_bio_write_same_split(q, *bio, bs, &nsegs); else - split = blk_bio_segment_split(q, *bio, q->bio_split); + split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs); + + /* physical segments can be figured out during splitting */ + res = split ? split : *bio; + res->bi_phys_segments = nsegs; + bio_set_flag(res, BIO_SEG_VALID); if (split) { bio_chain(split, *bio); -- cgit v1.2.3 From 6ac45aeb6bcad38a2783a7d6e5da4c469497eeb0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 20 Oct 2015 23:13:53 +0800 Subject: block: avoid to merge splitted bio The splitted bio has been already too fat to merge, so mark it as NOMERGE. Reviewed-by: Jeff Moyer Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 22293fdf6514..de5716d8e525 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -139,6 +139,9 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, bio_set_flag(res, BIO_SEG_VALID); if (split) { + /* there isn't chance to merge the splitted bio */ + split->bi_rw |= REQ_NOMERGE; + bio_chain(split, *bio); generic_make_request(*bio); *bio = split; -- cgit v1.2.3 From e18378a60e27ad7b3e11ecc4e2c92159585dee68 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 20 Oct 2015 23:13:54 +0800 Subject: blk-mq: check bio_mergeable() early before merging It isn't necessary to try to merge the bio which is marked as NOMERGE. Reviewed-by: Jeff Moyer Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 9683a561efcd..d38371160019 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1140,7 +1140,7 @@ static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq, struct bio *bio) { - if (!hctx_allow_merges(hctx)) { + if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) { blk_mq_bio_to_request(rq, bio); spin_lock(&ctx->lock); insert_rq: -- cgit v1.2.3 From 7460d389c01741f0dfff733af93d3b3abd9b974e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 20 Oct 2015 23:13:55 +0800 Subject: block: check bio_mergeable() early before merging After bio splitting is introduced, one bio can be splitted and it is marked as NOMERGE because it is too fat to be merged, so check bio_mergeable() earlier to avoid to try to merge it unnecessarily. Signed-off-by: Ming Lei Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 84d63943f2de..c3555c9c672f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -420,7 +420,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) * noxmerges: Only simple one-hit cache try * merges: All merge tries attempted */ - if (blk_queue_nomerges(q)) + if (blk_queue_nomerges(q) || !bio_mergeable(bio)) return ELEVATOR_NO_MERGE; /* -- cgit v1.2.3 From 676d06077f964f06af52c19e59f0409a8880612f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 20 Oct 2015 23:13:56 +0800 Subject: blk-mq: fix for trace_block_plug() The trace point is for tracing plug event of each request queue instead of each task, so we should check the request count in the plug list from current queue instead of current task. Signed-off-by: Ming Lei Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d38371160019..24c528f182ea 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1380,7 +1380,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) plug = current->plug; if (plug) { blk_mq_bio_to_request(rq, bio); - if (list_empty(&plug->mq_list)) + if (!request_count) trace_block_plug(q); else if (request_count >= BLK_MAX_REQUEST_COUNT) { blk_flush_plug_list(plug, false); -- cgit v1.2.3 From cfd0c552a8272d691691f40073654d775836e23a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 20 Oct 2015 23:13:57 +0800 Subject: blk-mq: mark ctx as pending at batch in flush plug path Most of times, flush plug should be the hottest I/O path, so mark ctx as pending after all requests in the list are inserted. Reviewed-by: Jeff Moyer Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 24c528f182ea..159e69bd2c3c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -990,18 +990,25 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_queue); -static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool at_head) +static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct request *rq, + bool at_head) { - struct blk_mq_ctx *ctx = rq->mq_ctx; - trace_block_rq_insert(hctx->queue, rq); if (at_head) list_add(&rq->queuelist, &ctx->rq_list); else list_add_tail(&rq->queuelist, &ctx->rq_list); +} + +static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, + struct request *rq, bool at_head) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + __blk_mq_insert_req_list(hctx, ctx, rq, at_head); blk_mq_hctx_mark_pending(hctx, ctx); } @@ -1057,8 +1064,9 @@ static void blk_mq_insert_requests(struct request_queue *q, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); rq->mq_ctx = ctx; - __blk_mq_insert_request(hctx, rq, false); + __blk_mq_insert_req_list(hctx, ctx, rq, false); } + blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); blk_mq_run_hw_queue(hctx, from_schedule); -- cgit v1.2.3 From 2404e607a9ee36db361bebe32787dafa1f7d6c00 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Tue, 3 Nov 2015 10:40:06 -0500 Subject: blk-mq: avoid excessive boot delays with large lun counts Hi, Zhangqing Luo reported long boot times on a system with thousands of LUNs when scsi-mq was enabled. He narrowed the problem down to blk_mq_add_queue_tag_set, where every queue is frozen in order to set the BLK_MQ_F_TAG_SHARED flag. Each added device will freeze all queues added before it in sequence, which involves waiting for an RCU grace period for each one. We don't need to do this. After the second queue is added, only new queues need to be initialized with the shared tag. We can do that by percolating the flag up to the blk_mq_tag_set, and updating the newly added queue's hctxs if the flag is set. This problem was introduced by commit 0d2602ca30e41 (blk-mq: improve support for shared tags maps). Reported-and-tested-by: Jason Luo Reviewed-by: Ming Lei Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-mq.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 159e69bd2c3c..22db728dbe24 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1695,7 +1695,7 @@ static int blk_mq_init_hctx(struct request_queue *q, INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; hctx->queue_num = hctx_idx; - hctx->flags = set->flags; + hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_hctx_notify, hctx); @@ -1882,27 +1882,26 @@ static void blk_mq_map_swqueue(struct request_queue *q, } } -static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) +static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; - struct request_queue *q; - bool shared; int i; - if (set->tag_list.next == set->tag_list.prev) - shared = false; - else - shared = true; + queue_for_each_hw_ctx(q, hctx, i) { + if (shared) + hctx->flags |= BLK_MQ_F_TAG_SHARED; + else + hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + } +} + +static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared) +{ + struct request_queue *q; list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_freeze_queue(q); - - queue_for_each_hw_ctx(q, hctx, i) { - if (shared) - hctx->flags |= BLK_MQ_F_TAG_SHARED; - else - hctx->flags &= ~BLK_MQ_F_TAG_SHARED; - } + queue_set_hctx_shared(q, shared); blk_mq_unfreeze_queue(q); } } @@ -1913,7 +1912,12 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) mutex_lock(&set->tag_list_lock); list_del_init(&q->tag_set_list); - blk_mq_update_tag_set_depth(set); + if (list_is_singular(&set->tag_list)) { + /* just transitioned to unshared */ + set->flags &= ~BLK_MQ_F_TAG_SHARED; + /* update existing queue */ + blk_mq_update_tag_set_depth(set, false); + } mutex_unlock(&set->tag_list_lock); } @@ -1923,8 +1927,17 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, q->tag_set = set; mutex_lock(&set->tag_list_lock); + + /* Check to see if we're transitioning to shared (from 1 to 2 queues). */ + if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) { + set->flags |= BLK_MQ_F_TAG_SHARED; + /* update existing queue */ + blk_mq_update_tag_set_depth(set, true); + } + if (set->flags & BLK_MQ_F_TAG_SHARED) + queue_set_hctx_shared(q, true); list_add_tail(&q->tag_set_list, &set->tag_list); - blk_mq_update_tag_set_depth(set); + mutex_unlock(&set->tag_list_lock); } -- cgit v1.2.3