From d308ae0d299a6bb15be4efb91849582d19c23213 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 26 Oct 2021 18:12:04 +0800 Subject: block: drain queue after disk is removed from sysfs Before removing disk from sysfs, userspace still may change queue via sysfs, such as switching elevator or setting wbt latency, both may reinitialize wbt, then the warning in blk_free_queue_stats() will be triggered since rq_qos_exit() is moved to del_gendisk(). Fixes the issue by moving draining queue & tearing down after disk is removed from sysfs, at that time no one can come into queue's store()/show(). Reported-by: Yi Zhang Tested-by: Yi Zhang Fixes: 8e141f9eb803 ("block: drain file system I/O on del_gendisk") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20211026101204.2897166-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/genhd.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index b49858550fa6..ab12ae6e636e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -588,16 +588,6 @@ void del_gendisk(struct gendisk *disk) * Prevent new I/O from crossing bio_queue_enter(). */ blk_queue_start_drain(q); - blk_mq_freeze_queue_wait(q); - - rq_qos_exit(q); - blk_sync_queue(q); - blk_flush_integrity(); - /* - * Allow using passthrough request again after the queue is torn down. - */ - blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); - __blk_mq_unfreeze_queue(q, true); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); @@ -620,6 +610,18 @@ void del_gendisk(struct gendisk *disk) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); + + blk_mq_freeze_queue_wait(q); + + rq_qos_exit(q); + blk_sync_queue(q); + blk_flush_integrity(); + /* + * Allow using passthrough request again after the queue is torn down. + */ + blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); + __blk_mq_unfreeze_queue(q, true); + } EXPORT_SYMBOL(del_gendisk); -- cgit v1.2.3 From 9586e67b911c95ba158fcc247b230e9c2d718623 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 27 Oct 2021 01:51:27 +0900 Subject: block: schedule queue restart after BLK_STS_ZONE_RESOURCE When dispatching a zone append write request to a SCSI zoned block device, if the target zone of the request is already locked, the device driver will return BLK_STS_ZONE_RESOURCE and the request will be pushed back to the hctx dipatch queue. The queue will be marked as RESTART in dd_finish_request() and restarted in __blk_mq_free_request(). However, this restart applies to the hctx of the completed request. If the requeued request is on a different hctx, dispatch will no be retried until another request is submitted or the next periodic queue run triggers, leading to up to 30 seconds latency for the requeued request. Fix this problem by scheduling a queue restart similarly to the BLK_STS_RESOURCE case or when we cannot get the budget. Also, consolidate the checks into the "need_resource" variable to simplify the condition. Signed-off-by: Naohiro Aota Reviewed-by: Christoph Hellwig Cc: Niklas Cassel Link: https://lore.kernel.org/r/20211026165127.4151055-1-naohiro.aota@wdc.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index bc026372de43..652a31fc3bb3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1325,6 +1325,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, int errors, queued; blk_status_t ret = BLK_STS_OK; LIST_HEAD(zone_list); + bool needs_resource = false; if (list_empty(list)) return false; @@ -1370,6 +1371,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, queued++; break; case BLK_STS_RESOURCE: + needs_resource = true; + fallthrough; case BLK_STS_DEV_RESOURCE: blk_mq_handle_dev_resource(rq, list); goto out; @@ -1380,6 +1383,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, * accept. */ blk_mq_handle_zone_resource(rq, &zone_list); + needs_resource = true; break; default: errors++; @@ -1406,7 +1410,6 @@ out: /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG && (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED); - bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET; if (nr_budgets) blk_mq_release_budgets(q, list); @@ -1447,14 +1450,16 @@ out: * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls * that could otherwise occur if the queue is idle. We'll do - * similar if we couldn't get budget and SCHED_RESTART is set. + * similar if we couldn't get budget or couldn't lock a zone + * and SCHED_RESTART is set. */ needs_restart = blk_mq_sched_needs_restart(hctx); + if (prep == PREP_DISPATCH_NO_BUDGET) + needs_resource = true; if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); - else if (needs_restart && (ret == BLK_STS_RESOURCE || - no_budget_avail)) + else if (needs_restart && needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); -- cgit v1.2.3 From e0c60d0102a5ad3475401e1a2faa3d3623eefce4 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Tue, 26 Oct 2021 15:01:15 +0900 Subject: block: Fix partition check for host-aware zoned block devices Commit a33df75c6328 ("block: use an xarray for disk->part_tbl") modified the method to check partition existence in host-aware zoned block devices from disk_has_partitions() helper function call to empty check of xarray disk->part_tbl. However, disk->part_tbl always has single entry for disk->part0 and never becomes empty. This resulted in the host-aware zoned devices always judged to have partitions, and it made the sysfs queue/zoned attribute to be "none" instead of "host-aware" regardless of partition existence in the devices. This also caused DEBUG_LOCKS_WARN_ON(lock->magic != lock) for sdkp->rev_mutex in scsi layer when the kernel detects host-aware zoned device. Since block layer handled the host-aware zoned devices as non- zoned devices, scsi layer did not have chance to initialize the mutex for zone revalidation. Therefore, the warning was triggered. To fix the issues, call the helper function disk_has_partitions() in place of disk->part_tbl empty check. Since the function was removed with the commit a33df75c6328, reimplement it to walk through entries in the xarray disk->part_tbl. Fixes: a33df75c6328 ("block: use an xarray for disk->part_tbl") Signed-off-by: Shin'ichiro Kawasaki Cc: stable@vger.kernel.org # v5.14+ Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211026060115.753746-1-shinichiro.kawasaki@wdc.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index a7c857ad7d10..b880c70e22e4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -842,6 +842,24 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); +static bool disk_has_partitions(struct gendisk *disk) +{ + unsigned long idx; + struct block_device *part; + bool ret = false; + + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, part) { + if (bdev_is_partition(part)) { + ret = true; + break; + } + } + rcu_read_unlock(); + + return ret; +} + /** * blk_queue_set_zoned - configure a disk queue zoned model. * @disk: the gendisk of the queue to configure @@ -876,7 +894,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) * we do nothing special as far as the block layer is concerned. */ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || - !xa_empty(&disk->part_tbl)) + disk_has_partitions(disk)) model = BLK_ZONED_NONE; break; case BLK_ZONED_NONE: -- cgit v1.2.3