From 180dccb0dba4f5e84a4a70c1be1d34cbb6528b32 Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Thu, 13 Jan 2022 10:55:36 +0800 Subject: blk-mq: fix tag_get wait task can't be awakened In case of shared tags, there might be more than one hctx which allocates from the same tags, and each hctx is limited to allocate at most: hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U); tag idle detection is lazy, and may be delayed for 30sec, so there could be just one real active hctx(queue) but all others are actually idle and still accounted as active because of the lazy idle detection. Then if wake_batch is > hctx_max_depth, driver tag allocation may wait forever on this real active hctx. Fix this by recalculating wake_batch when inc or dec active_queues. Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps") Suggested-by: Ming Lei Suggested-by: John Garry Signed-off-by: Laibin Qiu Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220113025536.1479653-1-qiulaibin@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index e55a6834c9a6..845f74e8dd7b 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -16,6 +16,21 @@ #include "blk-mq-sched.h" #include "blk-mq-tag.h" +/* + * Recalculate wakeup batch when tag is shared by hctx. + */ +static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, + unsigned int users) +{ + if (!users) + return; + + sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags, + users); + sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags, + users); +} + /* * If a previously inactive queue goes active, bump the active user count. * We need to do this before try to allocate driver tag, then even if fail @@ -24,18 +39,26 @@ */ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { + unsigned int users; + if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; - if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && - !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) - atomic_inc(&hctx->tags->active_queues); + if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || + test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) { + return true; + } } else { - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && - !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) - atomic_inc(&hctx->tags->active_queues); + if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || + test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) { + return true; + } } + users = atomic_inc_return(&hctx->tags->active_queues); + + blk_mq_update_wake_batch(hctx->tags, users); + return true; } @@ -56,6 +79,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; + unsigned int users; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; @@ -68,7 +92,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) return; } - atomic_dec(&tags->active_queues); + users = atomic_dec_return(&tags->active_queues); + + blk_mq_update_wake_batch(tags, users); blk_mq_tag_wakeup_all(tags, false); } -- cgit v1.2.3 From e6a2e5116e07ce5acc8698785c29e9e47f010fd5 Mon Sep 17 00:00:00 2001 From: GuoYong Zheng Date: Mon, 17 Jan 2022 18:22:37 +0800 Subject: block: Remove unnecessary variable assignment The parameter "ret" should be zero when running to this line, no need to set to zero again, remove it. Signed-off-by: GuoYong Zheng Link: https://lore.kernel.org/r/1642414957-6785-1-git-send-email-zhenggy@chinatelecom.cn Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e20eadfcf5c8..bed4a2facd65 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -887,7 +887,6 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); - ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); -- cgit v1.2.3 From 850fd2abbe02eb2b52cbb1550adbcc89b36d65de Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 Jan 2022 20:34:01 +0800 Subject: block: cleanup q->srcu srcu structure has to be cleanup via cleanup_srcu_struct(), so fix it. Reported-by: syzbot+4f789823c1abc5accf13@syzkaller.appspotmail.com Fixes: 704b914f15fb ("blk-mq: move srcu from blk_mq_hw_ctx to request_queue") Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220111123401.520192-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index bed4a2facd65..9f32882ceb2f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -811,6 +811,9 @@ static void blk_release_queue(struct kobject *kobj) bioset_exit(&q->bio_split); + if (blk_queue_has_srcu(q)) + cleanup_srcu_struct(q->srcu); + ida_simple_remove(&blk_queue_ida, q->id); call_rcu(&q->rcu_head, blk_free_queue_rcu); } -- cgit v1.2.3 From fd9f4e62a39f09a7c014d7415c2b9d1390aa0504 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jan 2022 08:04:44 +0100 Subject: block: assign bi_bdev for cloned bios in blk_rq_prep_clone bio_clone_fast() sets the cloned bio to have the same ->bi_bdev as the source bio. This means that when request-based dm called setup_clone(), the cloned bio had its ->bi_bdev pointing to the dm device. After Commit 0b6e522cdc4a ("blk-mq: use ->bi_bdev for I/O accounting") __blk_account_io_start() started using the request's ->bio->bi_bdev for I/O accounting, if it was set. This caused IO going to the underlying devices to use the dm device for their I/O accounting. Set up the proper ->bi_bdev in blk_rq_prep_clone based on the whole device bdev for the queue the request is cloned onto. Fixes: 0b6e522cdc4a ("blk-mq: use ->bi_bdev for I/O accounting") Reported-by: Benjamin Marzinski Signed-off-by: Christoph Hellwig [hch: the commit message is mostly from a different patch from Benjamin] Reviewed-by: Ming Lei Reviewed-by: Benjamin Marzinski Link: https://lore.kernel.org/r/20220118070444.1241739-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a6d4780580fc..b5e35e63adad 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2976,6 +2976,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, bio = bio_clone_fast(bio_src, gfp_mask, bs); if (!bio) goto free_and_out; + bio->bi_bdev = rq->q->disk->part0; if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; -- cgit v1.2.3 From 3ee859e384d453d6ac68bfd5971f630d9fa46ad3 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 9 Jan 2022 18:36:43 +0900 Subject: block: Fix wrong offset in bio_truncate() bio_truncate() clears the buffer outside of last block of bdev, however current bio_truncate() is using the wrong offset of page. So it can return the uninitialized data. This happened when both of truncated/corrupted FS and userspace (via bdev) are trying to read the last of bdev. Reported-by: syzbot+ac94ae5f68b84197f41c@syzkaller.appspotmail.com Signed-off-by: OGAWA Hirofumi Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/875yqt1c9g.fsf@mail.parknet.co.jp Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 0d400ba2dbd1..4312a8085396 100644 --- a/block/bio.c +++ b/block/bio.c @@ -569,7 +569,8 @@ static void bio_truncate(struct bio *bio, unsigned new_size) offset = new_size - done; else offset = 0; - zero_user(bv.bv_page, offset, bv.bv_len - offset); + zero_user(bv.bv_page, bv.bv_offset + offset, + bv.bv_len - offset); truncated = true; } done += bv.bv_len; -- cgit v1.2.3 From 46cdc45acb089c811d9a54fd50af33b96e5fae9d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 Jan 2022 10:28:13 -0700 Subject: block: fix async_depth sysfs interface for mq-deadline A previous commit added this feature, but it inadvertently used the wrong variable to show/store the setting from/to, victimized by copy/paste. Fix it up so that the async_depth sysfs interface reads and writes from the right setting. Fixes: 07757588e507 ("block/mq-deadline: Reserve 25% of scheduler tags for synchronous requests") Link: https://bugzilla.kernel.org/show_bug.cgi?id=215485 Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/mq-deadline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 85d919bf60c7..3ed5eaf3446a 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -865,7 +865,7 @@ SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); -SHOW_INT(deadline_async_depth_show, dd->front_merges); +SHOW_INT(deadline_async_depth_show, dd->async_depth); SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); #undef SHOW_INT #undef SHOW_JIFFIES @@ -895,7 +895,7 @@ STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MA STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); -STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX); +STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX); STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION #undef STORE_INT -- cgit v1.2.3