From a46c27026da10a126dd870f7b65380010bd20db5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 22 Mar 2024 10:12:44 +0800 Subject: blk-mq: don't schedule block kworker on isolated CPUs The kernel parameters `isolcpus=` and `nohz_full=` are used to isolate CPUs for specific tasks, and block IO is not expected to disturb these CPUs. blk-mq kworker shouldn't be scheduled on isolated CPUs. Also, if a blk-mq kworker is run on an isolated CPU, long block IO latency can result. The kernel workqueue only respects CPU isolation for WQ_UNBOUND; for a bound WQ, the responsibility is on the user because the CPU is specified as a WQ API parameter, such as in mod_delayed_work_on(cpu), queue_delayed_work_on(cpu) and queue_work_on(cpu). So do not run blk-mq kworker on isolated CPUs, by removing isolated CPUs from hctx->cpumask. Meanwhile, use the queue map instead of hctx->cpumask to check if all CPUs in this hw queue are offline; this avoids any cost in the fast IO code path, and is safe since hctx->cpumask is only used in these two cases. Cc: Tim Chen Cc: Juri Lelli Cc: Andrew Theurer Cc: Joe Mario Cc: Sebastian Jug Cc: Frederic Weisbecker Cc: Bart Van Assche Cc: Tejun Heo Tested-by: Joe Mario Signed-off-by: Ming Lei Reviewed-by: Ewan D.
Milne Link: https://lore.kernel.org/r/20240322021244.1056223-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 32afb87efbd0..b8dbfed8b28b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -2163,6 +2164,15 @@ static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) return cpu; } +/* + * ->next_cpu is always calculated from hctx->cpumask, so simply use + * it for speeding up the check + */ +static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx) +{ + return hctx->next_cpu >= nr_cpu_ids; +} + /* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. @@ -2174,7 +2184,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) bool tried = false; int next_cpu = hctx->next_cpu; - if (hctx->queue->nr_hw_queues == 1) + /* Switch to unbound if no allowable CPUs in this hctx */ + if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx)) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { @@ -3483,14 +3494,30 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) return data.has_rq; } -static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, - struct blk_mq_hw_ctx *hctx) +static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx, + unsigned int this_cpu) { - if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu) - return false; - if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) - return false; - return true; + enum hctx_type type = hctx->type; + int cpu; + + /* + * hctx->cpumask has to rule out isolated CPUs, but userspace still + * might submit IOs on these isolated CPUs, so use the queue map to + * check if all CPUs mapped to this hctx are offline + 
*/ + for_each_online_cpu(cpu) { + struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue, + type, cpu); + + if (h != hctx) + continue; + + /* this hctx has at least one online CPU */ + if (this_cpu != cpu) + return true; + } + + return false; } static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) @@ -3498,8 +3525,7 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); - if (!cpumask_test_cpu(cpu, hctx->cpumask) || - !blk_mq_last_cpu_in_hctx(cpu, hctx)) + if (blk_mq_hctx_has_online_cpu(hctx, cpu)) return 0; /* @@ -3907,6 +3933,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) } queue_for_each_hw_ctx(q, hctx, i) { + int cpu; + /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. @@ -3933,6 +3961,15 @@ static void blk_mq_map_swqueue(struct request_queue *q) */ sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); + /* + * Rule out isolated CPUs from hctx->cpumask to avoid + * running block kworker on isolated CPUs + */ + for_each_cpu(cpu, hctx->cpumask) { + if (cpu_is_isolated(cpu)) + cpumask_clear_cpu(cpu, hctx->cpumask); + } + /* * Initialize batch roundrobin counts */ -- cgit v1.2.3 From d3a3a086ad57b8c05340c0a4ac97b26ea55a1119 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 27 Mar 2024 09:40:20 +0000 Subject: blk-throttle: Only use seq_printf() in tg_prfill_limit() Currently tg_prfill_limit() uses a combination of snprintf() and strcpy() to generate the values parts of the limits string, before passing them as arguments to seq_printf(). Convert to use only a sequence of seq_printf() calls per argument, which is simpler. 
Suggested-by: Christoph Hellwig Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240327094020.3505514-1-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f4850a6f860b..c515e1a96fad 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1494,11 +1494,8 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, { struct throtl_grp *tg = pd_to_tg(pd); const char *dname = blkg_dev_name(pd->blkg); - char bufs[4][21] = { "max", "max", "max", "max" }; u64 bps_dft; unsigned int iops_dft; - char idle_time[26] = ""; - char latency_time[26] = ""; if (!dname) return 0; @@ -1520,35 +1517,39 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, tg->latency_target_conf == DFL_LATENCY_TARGET))) return 0; - if (tg->bps_conf[READ][off] != U64_MAX) - snprintf(bufs[0], sizeof(bufs[0]), "%llu", - tg->bps_conf[READ][off]); - if (tg->bps_conf[WRITE][off] != U64_MAX) - snprintf(bufs[1], sizeof(bufs[1]), "%llu", - tg->bps_conf[WRITE][off]); - if (tg->iops_conf[READ][off] != UINT_MAX) - snprintf(bufs[2], sizeof(bufs[2]), "%u", - tg->iops_conf[READ][off]); - if (tg->iops_conf[WRITE][off] != UINT_MAX) - snprintf(bufs[3], sizeof(bufs[3]), "%u", - tg->iops_conf[WRITE][off]); + seq_printf(sf, "%s", dname); + if (tg->bps_conf[READ][off] == U64_MAX) + seq_printf(sf, " rbps=max"); + else + seq_printf(sf, " rbps=%llu", tg->bps_conf[READ][off]); + + if (tg->bps_conf[WRITE][off] == U64_MAX) + seq_printf(sf, " wbps=max"); + else + seq_printf(sf, " wbps=%llu", tg->bps_conf[WRITE][off]); + + if (tg->iops_conf[READ][off] == UINT_MAX) + seq_printf(sf, " riops=max"); + else + seq_printf(sf, " riops=%u", tg->iops_conf[READ][off]); + + if (tg->iops_conf[WRITE][off] == UINT_MAX) + seq_printf(sf, " 
wiops=max"); + else + seq_printf(sf, " wiops=%u", tg->iops_conf[WRITE][off]); + if (off == LIMIT_LOW) { if (tg->idletime_threshold_conf == ULONG_MAX) - strcpy(idle_time, " idle=max"); + seq_printf(sf, " idle=max"); else - snprintf(idle_time, sizeof(idle_time), " idle=%lu", - tg->idletime_threshold_conf); + seq_printf(sf, " idle=%lu", tg->idletime_threshold_conf); if (tg->latency_target_conf == ULONG_MAX) - strcpy(latency_time, " latency=max"); + seq_printf(sf, " latency=max"); else - snprintf(latency_time, sizeof(latency_time), - " latency=%lu", tg->latency_target_conf); + seq_printf(sf, " latency=%lu", tg->latency_target_conf); } - - seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", - dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time, - latency_time); + seq_printf(sf, "\n"); return 0; } -- cgit v1.2.3 From 7a324d8389a1a1b6713eb689af6b457c72140ad6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Mar 2024 09:41:45 +0100 Subject: blk-cgroup: use bio_list_merge_init Use bio_list_merge_init instead of open coding bio_list_merge and bio_list_init. 
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240328084147.2954434-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bdbb557feb5a..8598e4591e79 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -218,8 +218,7 @@ static void blkg_async_bio_workfn(struct work_struct *work) /* as long as there are pending bios, @blkg can't go away */ spin_lock(&blkg->async_bio_lock); - bio_list_merge(&bios, &blkg->async_bios); - bio_list_init(&blkg->async_bios); + bio_list_merge_init(&bios, &blkg->async_bios); spin_unlock(&blkg->async_bio_lock); /* start plug only when bio_list contains at least 2 bios */ -- cgit v1.2.3 From 688c8b9208356eb5c3fa8047f3e35666f3049a4d Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 25 Mar 2024 11:59:55 +0800 Subject: blk-cgroup: use group allocation/free of per-cpu counters API Use group allocation/free of per-cpu counters api to accelerate blkg_rwstat_init/exit() and simplify code. 
Signed-off-by: Kefeng Wang Link: https://lore.kernel.org/r/20240325035955.50019-1-wangkefeng.wang@huawei.com Signed-off-by: Jens Axboe --- block/blk-cgroup-rwstat.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c index 3304e841df7c..a55fb0c53558 100644 --- a/block/blk-cgroup-rwstat.c +++ b/block/blk-cgroup-rwstat.c @@ -9,25 +9,19 @@ int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) { int i, ret; - for (i = 0; i < BLKG_RWSTAT_NR; i++) { - ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); - if (ret) { - while (--i >= 0) - percpu_counter_destroy(&rwstat->cpu_cnt[i]); - return ret; - } + ret = percpu_counter_init_many(rwstat->cpu_cnt, 0, gfp, BLKG_RWSTAT_NR); + if (ret) + return ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) atomic64_set(&rwstat->aux_cnt[i], 0); - } return 0; } EXPORT_SYMBOL_GPL(blkg_rwstat_init); void blkg_rwstat_exit(struct blkg_rwstat *rwstat) { - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - percpu_counter_destroy(&rwstat->cpu_cnt[i]); + percpu_counter_destroy_many(rwstat->cpu_cnt, BLKG_RWSTAT_NR); } EXPORT_SYMBOL_GPL(blkg_rwstat_exit); -- cgit v1.2.3 From de4c7bef9d330e4a59c78181bd596c7569d7208e Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 15 Apr 2024 12:20:20 +0000 Subject: block: Call blkdev_dio_unaligned() from blkdev_direct_IO() blkdev_dio_unaligned() is called from __blkdev_direct_IO(), __blkdev_direct_IO_simple(), and __blkdev_direct_IO_async(), and all these are only called from blkdev_direct_IO(). Move the blkdev_dio_unaligned() call to the common callsite, blkdev_direct_IO(). Pass those functions the bdev pointer from blkdev_direct_IO(), as it is non-trivial to look up. 
Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240415122020.1541594-1-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/fops.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/fops.c b/block/fops.c index 679d9b752fe8..c091ea43bca3 100644 --- a/block/fops.c +++ b/block/fops.c @@ -44,18 +44,15 @@ static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, #define DIO_INLINE_BIO_VECS 4 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, - struct iov_iter *iter, unsigned int nr_pages) + struct iov_iter *iter, struct block_device *bdev, + unsigned int nr_pages) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { @@ -161,9 +158,8 @@ static void blkdev_bio_end_io(struct bio *bio) } static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - unsigned int nr_pages) + struct block_device *bdev, unsigned int nr_pages) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; @@ -172,9 +168,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos; int ret = 0; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - if (iocb->ki_flags & IOCB_ALLOC_CACHE) opf |= REQ_ALLOC_CACHE; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, @@ -302,9 +295,9 @@ static void blkdev_bio_end_io_async(struct bio *bio) static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, struct iov_iter *iter, + struct block_device *bdev, unsigned int nr_pages) { - 
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); bool is_read = iov_iter_rw(iter) == READ; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; @@ -312,9 +305,6 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, loff_t pos = iocb->ki_pos; int ret = 0; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - if (iocb->ki_flags & IOCB_ALLOC_CACHE) opf |= REQ_ALLOC_CACHE; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, @@ -368,18 +358,23 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; + if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter)) + return -EINVAL; + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); if (likely(nr_pages <= BIO_MAX_VECS)) { if (is_sync_kiocb(iocb)) - return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - return __blkdev_direct_IO_async(iocb, iter, nr_pages); + return __blkdev_direct_IO_simple(iocb, iter, bdev, + nr_pages); + return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); } - return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); + return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); } static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, -- cgit v1.2.3 From 6f8fd758de63bab513c551bb1796a14f8cdc40d9 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:01 +0900 Subject: block: Restore sector of flush requests On completion of a flush sequence, blk_flush_restore_request() restores the bio of a request to the original submitted BIO. However, the last use of the request in the flush sequence may have been for a POSTFLUSH which does not have a sector. So make sure to restore the request sector using the iter sector of the original BIO. 
This BIO has not changed yet since the completions of the flush sequence intermediate steps use requeueing of the request until all steps are completed. Restoring the request sector ensures that blk_mq_end_request() will see a valid sector as originally set when the flush BIO was submitted. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240408014128.205141-2-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-flush.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index b0f314f4bc14..2f58ae018464 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -130,6 +130,7 @@ static void blk_flush_restore_request(struct request *rq) * original @rq->bio. Restore it. */ rq->bio = rq->biotail; + rq->__sector = rq->bio->bi_iter.bi_sector; /* make @rq a normal request */ rq->rq_flags &= ~RQF_FLUSH_SEQ; -- cgit v1.2.3 From c0da26f950a355ef3540ca8d215351e1ed4cac47 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:02 +0900 Subject: block: Remove req_bio_endio() Moving req_bio_endio() code into its only caller, blk_update_request(), allows reducing accesses to and tests of bio and request fields. Also, given that partial completions of zone append operations is not possible and that zone append operations cannot be merged, the update of the BIO sector using the request sector for these operations can be moved directly before the call to bio_endio(). Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. 
Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240408014128.205141-3-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 58 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 30 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index b8dbfed8b28b..fcbf0953a179 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -762,31 +762,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg) } EXPORT_SYMBOL(blk_dump_rq_flags); -static void req_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, blk_status_t error) -{ - if (unlikely(error)) { - bio->bi_status = error; - } else if (req_op(rq) == REQ_OP_ZONE_APPEND) { - /* - * Partial zone append completions cannot be supported as the - * BIO fragments may end up not being written sequentially. - */ - if (bio->bi_iter.bi_size != nbytes) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_iter.bi_sector = rq->__sector; - } - - bio_advance(bio, nbytes); - - if (unlikely(rq->rq_flags & RQF_QUIET)) - bio_set_flag(bio, BIO_QUIET); - /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) - bio_endio(bio); -} - static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->part && blk_do_io_stat(req)) { @@ -890,6 +865,8 @@ static void blk_complete_request(struct request *req) bool blk_update_request(struct request *req, blk_status_t error, unsigned int nr_bytes) { + bool is_flush = req->rq_flags & RQF_FLUSH_SEQ; + bool quiet = req->rq_flags & RQF_QUIET; int total_bytes; trace_block_rq_complete(req, error, nr_bytes); @@ -910,9 +887,8 @@ bool blk_update_request(struct request *req, blk_status_t error, if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) __blk_crypto_rq_put_keyslot(req); - if (unlikely(error && !blk_rq_is_passthrough(req) && - !(req->rq_flags & RQF_QUIET)) && - 
!test_bit(GD_DEAD, &req->q->disk->state)) { + if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) && + !test_bit(GD_DEAD, &req->q->disk->state)) { blk_print_req_error(req, error); trace_block_rq_error(req, error, nr_bytes); } @@ -924,12 +900,34 @@ bool blk_update_request(struct request *req, blk_status_t error, struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); - if (bio_bytes == bio->bi_iter.bi_size) + if (unlikely(error)) + bio->bi_status = error; + + if (bio_bytes == bio->bi_iter.bi_size) { req->bio = bio->bi_next; + } else if (req_op(req) == REQ_OP_ZONE_APPEND && + error == BLK_STS_OK) { + /* + * Partial zone append completions cannot be supported + * as the BIO fragments may end up not being written + * sequentially. + */ + bio->bi_status = BLK_STS_IOERR; + } /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); - req_bio_endio(req, bio, bio_bytes, error); + if (unlikely(quiet)) + bio_set_flag(bio, BIO_QUIET); + + bio_advance(bio, bio_bytes); + + /* Don't actually finish bio if it's part of flush sequence */ + if (!bio->bi_iter.bi_size && !is_flush) { + if (req_op(req) == REQ_OP_ZONE_APPEND) + bio->bi_iter.bi_sector = req->__sector; + bio_endio(bio); + } total_bytes += bio_bytes; nr_bytes -= bio_bytes; -- cgit v1.2.3 From a0508c36efa838b16aa93a23e3583d68d3ef6c33 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:03 +0900 Subject: block: Introduce blk_zone_update_request_bio() On completion of a zone append request, the request sector indicates the location of the written data. This value must be returned to the user through the BIO iter sector. This is done in 2 places: in blk_complete_request() and in blk_update_request(). Introduce the inline helper function blk_zone_update_request_bio() to avoid duplicating this BIO update for zone append requests, and to compile out this helper call when CONFIG_BLK_DEV_ZONED is not enabled. 
Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240408014128.205141-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 11 +++++------ block/blk.h | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index fcbf0953a179..88b541e8873f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -821,8 +821,7 @@ static void blk_complete_request(struct request *req) /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); - if (req_op(req) == REQ_OP_ZONE_APPEND) - bio->bi_iter.bi_sector = req->__sector; + blk_zone_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); @@ -923,10 +922,10 @@ bool blk_update_request(struct request *req, blk_status_t error, bio_advance(bio, bio_bytes); /* Don't actually finish bio if it's part of flush sequence */ - if (!bio->bi_iter.bi_size && !is_flush) { - if (req_op(req) == REQ_OP_ZONE_APPEND) - bio->bi_iter.bi_sector = req->__sector; - bio_endio(bio); + if (!bio->bi_iter.bi_size) { + blk_zone_update_request_bio(req, bio); + if (!is_flush) + bio_endio(bio); } total_bytes += bio_bytes; diff --git a/block/blk.h b/block/blk.h index d9f584984bc4..17786052f32d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -408,12 +408,29 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, #ifdef CONFIG_BLK_DEV_ZONED void disk_free_zone_bitmaps(struct gendisk *disk); +static inline void blk_zone_update_request_bio(struct request *rq, + struct bio *bio) +{ + /* + * For zone append requests, the request sector indicates the location + * at which the BIO data was written. Return this value to the BIO + * issuer through the BIO iter sector. 
+ */ + if (req_op(rq) == REQ_OP_ZONE_APPEND) + bio->bi_iter.bi_sector = rq->__sector; +} int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg); int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ -static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} +static inline void disk_free_zone_bitmaps(struct gendisk *disk) +{ +} +static inline void blk_zone_update_request_bio(struct request *rq, + struct bio *bio) +{ +} static inline int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) { -- cgit v1.2.3 From dd850ff3eee428b4e1276bd51263dd937643ba19 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:05 +0900 Subject: block: Allow using bio_attempt_back_merge() internally Remove "static" from the definition of bio_attempt_back_merge() and declare this function in block/blk.h to allow using it internally from other block layer files. The definition of enum bio_merge_status is also moved to block/blk.h. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. 
Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240408014128.205141-6-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-merge.c | 8 +------- block/blk.h | 8 ++++++++ 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 4e3483a16b75..3363b1321908 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -972,13 +972,7 @@ static void blk_account_io_merge_bio(struct request *req) part_stat_unlock(); } -enum bio_merge_status { - BIO_MERGE_OK, - BIO_MERGE_NONE, - BIO_MERGE_FAILED, -}; - -static enum bio_merge_status bio_attempt_back_merge(struct request *req, +enum bio_merge_status bio_attempt_back_merge(struct request *req, struct bio *bio, unsigned int nr_segs) { const blk_opf_t ff = bio_failfast(bio); diff --git a/block/blk.h b/block/blk.h index 17786052f32d..bca50a9510c8 100644 --- a/block/blk.h +++ b/block/blk.h @@ -269,6 +269,14 @@ static inline void bio_integrity_free(struct bio *bio) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); +enum bio_merge_status { + BIO_MERGE_OK, + BIO_MERGE_NONE, + BIO_MERGE_FAILED, +}; + +enum bio_merge_status bio_attempt_back_merge(struct request *req, + struct bio *bio, unsigned int nr_segs); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, -- cgit v1.2.3 From ecfe43b11b02ffeb24c203af7d3947417d412cf7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:06 +0900 Subject: block: Remember zone capacity when revalidating zones In preparation for adding zone write plugging, modify blk_revalidate_disk_zones() to get the capacity of zones of a zoned block device. This capacity value as a number of 512B sectors is stored in the gendisk zone_capacity field. 
Given that host-managed SMR disks (including zoned UFS drives) and all known NVMe ZNS devices have the same zone capacity for all zones blk_revalidate_disk_zones() returns an error if different capacities are detected for different zones. This also adds check to verify that the values reported by the device for zone capacities are correct, that is, that the zone capacity is never 0, does not exceed the zone size and is equal to the zone size for conventional zones. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240408014128.205141-7-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 26 ++++++++++++++++++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 27 insertions(+) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index da0f4b2a8fa0..23d9bb21c459 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -438,6 +438,7 @@ struct blk_revalidate_zone_args { unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; unsigned int nr_zones; + unsigned int zone_capacity; sector_t sector; }; @@ -482,9 +483,20 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, return -ENODEV; } + if (!zone->capacity || zone->capacity > zone->len) { + pr_warn("%s: Invalid zone capacity\n", + disk->disk_name); + return -ENODEV; + } + /* Check zone type */ switch (zone->type) { case BLK_ZONE_TYPE_CONVENTIONAL: + if (zone->capacity != zone->len) { + pr_warn("%s: Invalid conventional zone capacity\n", + disk->disk_name); + return -ENODEV; + } if (!args->conv_zones_bitmap) { args->conv_zones_bitmap = blk_alloc_zone_bitmap(q->node, args->nr_zones); @@ -500,6 +512,18 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, if (!args->seq_zones_wlock) return -ENOMEM; } + + /* 
+ * Remember the capacity of the first sequential zone and check + * if it is constant for all zones. + */ + if (!args->zone_capacity) + args->zone_capacity = zone->capacity; + if (zone->capacity != args->zone_capacity) { + pr_warn("%s: Invalid variable zone capacity\n", + disk->disk_name); + return -ENODEV; + } break; case BLK_ZONE_TYPE_SEQWRITE_PREF: default: @@ -595,6 +619,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk, blk_mq_freeze_queue(q); if (ret > 0) { disk->nr_zones = args.nr_zones; + disk->zone_capacity = args.zone_capacity; swap(disk->seq_zones_wlock, args.seq_zones_wlock); swap(disk->conv_zones_bitmap, args.conv_zones_bitmap); if (update_driver_data) @@ -608,6 +633,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk, kfree(args.seq_zones_wlock); kfree(args.conv_zones_bitmap); + return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ec7bd7091467..4e81f714cca7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -191,6 +191,7 @@ struct gendisk { * blk_mq_unfreeze_queue(). */ unsigned int nr_zones; + unsigned int zone_capacity; unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; #endif /* CONFIG_BLK_DEV_ZONED */ -- cgit v1.2.3 From dd291d77cc90eb6a86e9860ba8e6e38eebd57d12 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:07 +0900 Subject: block: Introduce zone write plugging Zone write plugging implements a per-zone "plug" for write operations to control the submission and execution order of write operations to sequential write required zones of a zoned block device. Per-zone plugging guarantees that at any time there is at most only one write request per zone being executed. This mechanism is intended to replace zone write locking which implements a similar per-zone write throttling at the scheduler level, but is implemented only by mq-deadline. 
Unlike zone write locking which operates on requests, zone write plugging operates on BIOs. A zone write plug is simply a BIO list that is atomically manipulated using a spinlock and a kblockd submission work. A write BIO to a zone is "plugged" to delay its execution if a write BIO for the same zone was already issued, that is, if a write request for the same zone is being executed. The next plugged BIO is unplugged and issued once the write request completes. This mechanism allows to: - Untangle zone write ordering from block IO schedulers. This allows removing the restriction on using mq-deadline for writing to zoned block devices. Any block IO scheduler, including "none" can be used. - Zone write plugging operates on BIOs instead of requests. Plugged BIOs waiting for execution thus do not hold scheduling tags and thus are not preventing other BIOs from executing (reads or writes to other zones). Depending on the workload, this can significantly improve the device use (higher queue depth operation) and performance. - Both blk-mq (request based) zoned devices and BIO-based zoned devices (e.g. device mapper) can use zone write plugging. It is mandatory for the former but optional for the latter. BIO-based drivers can use zone write plugging to implement write ordering guarantees, or the drivers can implement their own if needed. - The code is less invasive in the block layer and is mostly limited to blk-zoned.c with some small changes in blk-mq.c, blk-merge.c and bio.c. Zone write plugging is implemented using struct blk_zone_wplug. This structure includes a spinlock, a BIO list and a work structure to handle the submission of plugged BIOs. Zone write plugs structures are managed using a per-disk hash table. Plugging of zone write BIOs is done using the function blk_zone_write_plug_bio() which returns false if a BIO execution does not need to be delayed and true otherwise. 
This function is called from blk_mq_submit_bio() after a BIO is split to avoid large BIOs spanning multiple zones which would cause mishandling of zone write plugs. This change enables by default zone write plugging for any mq request-based block device. BIO-based device drivers can also use zone write plugging by explicitly calling blk_zone_write_plug_bio() in their ->submit_bio method. For such devices, the driver must ensure that a BIO passed to blk_zone_write_plug_bio() is already split and not straddling zone boundaries. Only write and write zeroes BIOs are plugged. Zone write plugging does not introduce any significant overhead for other operations. A BIO that is being handled through zone write plugging is flagged using the new BIO flag BIO_ZONE_WRITE_PLUGGING. A request handling a BIO flagged with this new flag is flagged with the new RQF_ZONE_WRITE_PLUGGING flag. The completion of flagged BIOs and flagged requests triggers calls to the functions blk_zone_write_bio_endio() and blk_zone_write_complete_request(), respectively. The latter function is used to trigger submission of the next plugged BIO using the zone plug work. blk_zone_write_bio_endio() does the same for BIO-based devices. This ensures that at any time, at most one request (blk-mq devices) or one BIO (BIO-based devices) is being executed for any zone. The handling of zone write plugs using a per-zone plug spinlock maximizes parallelism and device usage by allowing multiple zones to be written simultaneously without lock contention. Zone write plugging ignores flush BIOs without data. However, any flush BIO that has data is always plugged so that the write part of the flush sequence is serialized with other regular writes. Given that any BIO handled through zone write plugging will be the only BIO in flight for the target zone when it is executed, the unplugging and submission of a BIO will have no chance of successfully merging with plugged requests or requests in the scheduler.
To overcome this potential performance degradation, blk_mq_submit_bio() calls the function blk_zone_write_plug_attempt_merge() to try to merge other plugged BIOs with the one just unplugged and submitted. Successful merging is signaled using blk_zone_write_plug_bio_merged(), called from bio_attempt_back_merge(). Furthermore, to avoid recalculating the number of segments of plugged BIOs to attempt merging, the number of segments of a plugged BIO is saved using the new struct bio field __bi_nr_segments. To avoid growing the size of struct bio, this field is added as a union with the bio_cookie field. This is safe to do as polling is always disabled for plugged BIOs. When BIOs are plugged in a zone write plug, the device request queue usage counter is always incremented. This reference is kept and reused for blk-mq devices when the plugged BIO is unplugged and submitted again using submit_bio_noacct_nocheck(). For this case, the unplugged BIO is already flagged with BIO_ZONE_WRITE_PLUGGING and blk_mq_submit_bio() proceeds directly to allocating a new request for the BIO, re-using the usage reference count taken when the BIO was plugged. This extra reference count is dropped in blk_zone_write_plug_attempt_merge() for any plugged BIO that is successfully merged. Given that BIO-based devices will not take this path, the extra reference is dropped after a plugged BIO is unplugged and submitted. Zone write plugs are dynamically allocated and managed using a hash table (an array of struct hlist_head) with RCU protection. A zone write plug is allocated when a write BIO is received for the zone and not freed until the zone is fully written, reset or finished. To detect when a zone write plug can be freed, the write state of each zone is tracked using a write pointer offset which corresponds to the offset of a zone write pointer relative to the zone start. Write operations always increment this write pointer offset. 
Zone reset operations set it to 0 and zone finish operations set it to the zone size. If a write error happens, the wp_offset value of a zone write plug may become incorrect and out of sync with the device-managed write pointer. This is handled using the zone write plug flag BLK_ZONE_WPLUG_ERROR. The function disk_zone_wplug_handle_error() is called from the new disk zone write plug work when this flag is set. This function executes a zone report to update the zone write pointer offset to the current value as indicated by the device. The disk zone write plug work is scheduled whenever a BIO flagged with BIO_ZONE_WRITE_PLUGGING completes with an error or when blk_zone_wplug_prepare_bio() detects an unaligned write. Once scheduled, the disk zone write plug work keeps running until all zone errors are handled. To match the new data structures used for zoned disks, the function disk_free_zone_bitmaps() is renamed to the more generic disk_free_zone_resources(). The function disk_init_zone_resources() is also introduced to initialize zone write plug resources when a gendisk is allocated. In order to guarantee that the user can simultaneously write up to a number of zones equal to a device max active zone limit or max open zone limit, zone write plugs are allocated using a mempool sized to the maximum of these 2 device limits. For a device that does not have active and open zone limits, 128 is used as the default mempool size. If a change to the device active and open zone limits is detected, the disk mempool is resized when blk_revalidate_disk_zones() is executed. This commit contains contributions from Christoph Hellwig. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K.
Petersen Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240408014128.205141-8-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/bio.c | 2 + block/blk-merge.c | 12 + block/blk-mq.c | 32 +- block/blk-zoned.c | 1091 ++++++++++++++++++++++++++++++++++++++++++++- block/blk.h | 51 ++- block/genhd.c | 3 +- include/linux/blk-mq.h | 2 + include/linux/blk_types.h | 8 +- include/linux/blkdev.h | 11 + 9 files changed, 1201 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index d24420ed1c4c..38baedb39c6f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1576,6 +1576,8 @@ again: if (!bio_integrity_endio(bio)) return; + blk_zone_bio_endio(bio); + rq_qos_done_bio(bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { diff --git a/block/blk-merge.c b/block/blk-merge.c index 3363b1321908..7f8a808b74c1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -377,6 +377,7 @@ struct bio *__bio_split_to_limits(struct bio *bio, blkcg_bio_issue_init(split); bio_chain(split, bio); trace_block_split(split, bio->bi_iter.bi_sector); + WARN_ON_ONCE(bio_zone_write_plugging(bio)); submit_bio_noacct(bio); return split; } @@ -988,6 +989,9 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req, blk_update_mixed_merge(req, bio, false); + if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) + blk_zone_write_plug_bio_merged(bio); + req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; @@ -1003,6 +1007,14 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req, { const blk_opf_t ff = bio_failfast(bio); + /* + * A front merge for writes to sequential zones of a zoned block device + * can happen only if the user submitted writes out of order. Do not + * merge such write to let it fail. 
+ */ + if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) + return BIO_MERGE_FAILED; + if (!ll_front_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; diff --git a/block/blk-mq.c b/block/blk-mq.c index 88b541e8873f..48eb7dd049d1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -828,6 +828,8 @@ static void blk_complete_request(struct request *req) bio = next; } while (bio); + blk_zone_complete_request(req); + /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request @@ -939,6 +941,7 @@ bool blk_update_request(struct request *req, blk_status_t error, * completely done */ if (!req->bio) { + blk_zone_complete_request(req); /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request @@ -2963,15 +2966,30 @@ void blk_mq_submit_bio(struct bio *bio) struct request *rq; blk_status_t ret; + /* + * If the plug has a cached request for this queue, try to use it. + */ + rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); + + /* + * A BIO that was released from a zone write plug has already been + * through the preparation in this function, already holds a reference + * on the queue usage counter, and is the only write BIO in-flight for + * the target zone. Go straight to preparing a request for it. + */ + if (bio_zone_write_plugging(bio)) { + nr_segs = bio->__bi_nr_segments; + if (rq) + blk_queue_exit(q); + goto new_request; + } + bio = blk_queue_bounce(bio, q); /* - * If the plug has a cached request for this queue, try use it. - * * The cached request already holds a q_usage_counter reference and we * don't have to acquire a new one if we use it. 
*/ - rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); if (!rq) { if (unlikely(bio_queue_enter(bio))) return; @@ -2988,6 +3006,10 @@ void blk_mq_submit_bio(struct bio *bio) if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; + if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs)) + goto queue_exit; + +new_request: if (!rq) { rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); if (unlikely(!rq)) @@ -3006,6 +3028,7 @@ void blk_mq_submit_bio(struct bio *bio) if (ret != BLK_STS_OK) { bio->bi_status = ret; bio_endio(bio); + blk_zone_complete_request(rq); blk_mq_free_request(rq); return; } @@ -3013,6 +3036,9 @@ void blk_mq_submit_bio(struct bio *bio) if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; + if (bio_zone_write_plugging(bio)) + blk_zone_write_plug_attempt_merge(rq); + if (plug) { blk_add_rq_to_plug(plug, rq); return; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 23d9bb21c459..fefcebd70445 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -7,6 +7,7 @@ * * Copyright (c) 2016, Damien Le Moal * Copyright (c) 2016, Western Digital + * Copyright (c) 2024, Western Digital Corporation or its affiliates. */ #include @@ -16,8 +17,12 @@ #include #include #include +#include +#include +#include #include "blk.h" +#include "blk-mq-sched.h" #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name static const char *const zone_cond_name[] = { @@ -32,6 +37,64 @@ static const char *const zone_cond_name[] = { }; #undef ZONE_COND_NAME +/* + * Per-zone write plug. + * @node: hlist_node structure for managing the plug using a hash table. + * @link: To list the plug in the zone write plug error list of the disk. + * @ref: Zone write plug reference counter. A zone write plug reference is + * always at least 1 when the plug is hashed in the disk plug hash table. + * The reference is incremented whenever a new BIO needing plugging is + * submitted and when a function needs to manipulate a plug. 
The + * reference count is decremented whenever a plugged BIO completes and + * when a function that referenced the plug returns. The initial + * reference is dropped whenever the zone of the zone write plug is reset, + * finished and when the zone becomes full (last write BIO to the zone + * completes). + * @lock: Spinlock to atomically manipulate the plug. + * @flags: Flags indicating the plug state. + * @zone_no: The number of the zone the plug is managing. + * @wp_offset: The zone write pointer location relative to the start of the zone + * as a number of 512B sectors. + * @bio_list: The list of BIOs that are currently plugged. + * @bio_work: Work struct to handle issuing of plugged BIOs + * @rcu_head: RCU head to free zone write plugs with an RCU grace period. + * @disk: The gendisk the plug belongs to. + */ +struct blk_zone_wplug { + struct hlist_node node; + struct list_head link; + atomic_t ref; + spinlock_t lock; + unsigned int flags; + unsigned int zone_no; + unsigned int wp_offset; + struct bio_list bio_list; + struct work_struct bio_work; + struct rcu_head rcu_head; + struct gendisk *disk; +}; + +/* + * Zone write plug flags bits: + * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged, + * that is, that write BIOs are being throttled due to a write BIO already + * being executed or the zone write plug bio list is not empty. + * - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be + * recovered with a report zone to update the zone write pointer offset. + * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed + * from the disk hash table and that the initial reference to the zone + * write plug set when the plug was first added to the hash table has been + * dropped. This flag is set when a zone is reset, finished or become full, + * to prevent new references to the zone write plug to be taken for + * newly incoming BIOs. 
A zone write plug flagged with this flag will be + * freed once all remaining references from BIOs or functions are dropped. + */ +#define BLK_ZONE_WPLUG_PLUGGED (1U << 0) +#define BLK_ZONE_WPLUG_ERROR (1U << 1) +#define BLK_ZONE_WPLUG_UNHASHED (1U << 2) + +#define BLK_ZONE_WPLUG_BUSY (BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR) + /** * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. * @zone_cond: BLK_ZONE_COND_XXX. @@ -425,12 +488,1010 @@ fail: return ret; } -void disk_free_zone_bitmaps(struct gendisk *disk) +static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector) +{ + if (!disk->conv_zones_bitmap) + return false; + return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); +} + +static bool disk_insert_zone_wplug(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + struct blk_zone_wplug *zwplg; + unsigned long flags; + unsigned int idx = + hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits); + + /* + * Add the new zone write plug to the hash table, but carefully as we + * are racing with other submission context, so we may already have a + * zone write plug for the same zone. 
+ */ + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { + if (zwplg->zone_no == zwplug->zone_no) { + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + return false; + } + } + hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + + return true; +} + +static void disk_remove_zone_wplug(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; + atomic_dec(&zwplug->ref); + hlist_del_init_rcu(&zwplug->node); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); +} + +static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + /* If the zone is still busy, the plug cannot be removed. */ + if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) + return false; + + /* We can remove zone write plugs for zones that are empty or full. 
*/ + return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity; +} + +static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, + sector_t sector) +{ + unsigned int zno = disk_zone_no(disk, sector); + unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); + struct blk_zone_wplug *zwplug; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) { + if (zwplug->zone_no == zno && + atomic_inc_not_zero(&zwplug->ref)) { + rcu_read_unlock(); + return zwplug; + } + } + + rcu_read_unlock(); + + return NULL; +} + +static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) +{ + struct blk_zone_wplug *zwplug = + container_of(rcu_head, struct blk_zone_wplug, rcu_head); + + mempool_free(zwplug, zwplug->disk->zone_wplugs_pool); +} + +static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) +{ + if (atomic_dec_and_test(&zwplug->ref)) { + WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); + WARN_ON_ONCE(!list_empty(&zwplug->link)); + + call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); + } +} + +static void blk_zone_wplug_bio_work(struct work_struct *work); + +/* + * Get a reference on the write plug for the zone containing @sector. + * If the plug does not exist, it is allocated and hashed. + * Return a pointer to the zone write plug with the plug spinlock held. + */ +static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, + sector_t sector, gfp_t gfp_mask, + unsigned long *flags) { + unsigned int zno = disk_zone_no(disk, sector); + struct blk_zone_wplug *zwplug; + +again: + zwplug = disk_get_zone_wplug(disk, sector); + if (zwplug) { + /* + * Check that a BIO completion or a zone reset or finish + * operation has not already removed the zone write plug from + * the hash table and dropped its reference count. In such case, + * we need to get a new plug so start over from the beginning. 
+ */ + spin_lock_irqsave(&zwplug->lock, *flags); + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { + spin_unlock_irqrestore(&zwplug->lock, *flags); + disk_put_zone_wplug(zwplug); + goto again; + } + return zwplug; + } + + /* + * Allocate and initialize a zone write plug with an extra reference + * so that it is not freed when the zone write plug becomes idle without + * the zone being full. + */ + zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask); + if (!zwplug) + return NULL; + + INIT_HLIST_NODE(&zwplug->node); + INIT_LIST_HEAD(&zwplug->link); + atomic_set(&zwplug->ref, 2); + spin_lock_init(&zwplug->lock); + zwplug->flags = 0; + zwplug->zone_no = zno; + zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1); + bio_list_init(&zwplug->bio_list); + INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); + zwplug->disk = disk; + + spin_lock_irqsave(&zwplug->lock, *flags); + + /* + * Insert the new zone write plug in the hash table. This can fail only + * if another context already inserted a plug. Retry from the beginning + * in such case. + */ + if (!disk_insert_zone_wplug(disk, zwplug)) { + spin_unlock_irqrestore(&zwplug->lock, *flags); + mempool_free(zwplug, disk->zone_wplugs_pool); + goto again; + } + + return zwplug; +} + +static inline void blk_zone_wplug_bio_io_error(struct bio *bio) +{ + struct request_queue *q = bio->bi_bdev->bd_disk->queue; + + bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); + bio_io_error(bio); + blk_queue_exit(q); +} + +/* + * Abort (fail) all plugged BIOs of a zone write plug. + */ +static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) +{ + struct bio *bio; + + while ((bio = bio_list_pop(&zwplug->bio_list))) { + blk_zone_wplug_bio_io_error(bio); + disk_put_zone_wplug(zwplug); + } +} + +/* + * Abort (fail) all plugged BIOs of a zone write plug that are not aligned + * with the assumed write pointer location of the zone when the BIO will + * be unplugged. 
+ */ +static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned int zone_capacity = disk->zone_capacity; + unsigned int wp_offset = zwplug->wp_offset; + struct bio_list bl = BIO_EMPTY_LIST; + struct bio *bio; + + while ((bio = bio_list_pop(&zwplug->bio_list))) { + if (wp_offset >= zone_capacity || + bio_offset_from_zone_start(bio) != wp_offset) { + blk_zone_wplug_bio_io_error(bio); + disk_put_zone_wplug(zwplug); + continue; + } + + wp_offset += bio_sectors(bio); + bio_list_add(&bl, bio); + } + + bio_list_merge(&zwplug->bio_list, &bl); +} + +/* + * Set a zone write plug write pointer offset to either 0 (zone reset case) + * or to the zone size (zone finish case). This aborts all plugged BIOs, which + * is fine to do as doing a zone reset or zone finish while writes are in-flight + * is a mistake from the user which will most likely cause all plugged BIOs to + * fail anyway. + */ +static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, + struct blk_zone_wplug *zwplug, + unsigned int wp_offset) +{ + unsigned long flags; + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * Make sure that a BIO completion or another zone reset or finish + * operation has not already removed the plug from the hash table. + */ + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { + spin_unlock_irqrestore(&zwplug->lock, flags); + return; + } + + /* Update the zone write pointer and abort all plugged BIOs. */ + zwplug->wp_offset = wp_offset; + disk_zone_wplug_abort(zwplug); + + /* + * Updating the write pointer offset puts back the zone + * in a good state. So clear the error flag and decrement the + * error count if we were in error state. 
+ */ + if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) { + zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; + spin_lock(&disk->zone_wplugs_lock); + list_del_init(&zwplug->link); + spin_unlock(&disk->zone_wplugs_lock); + } + + /* + * The zone write plug now has no BIO plugged: remove it from the + * hash table so that it cannot be seen. The plug will be freed + * when the last reference is dropped. + */ + if (disk_should_remove_zone_wplug(disk, zwplug)) + disk_remove_zone_wplug(disk, zwplug); + + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, + unsigned int wp_offset) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + sector_t sector = bio->bi_iter.bi_sector; + struct blk_zone_wplug *zwplug; + + /* Conventional zones cannot be reset nor finished. */ + if (disk_zone_is_conv(disk, sector)) { + bio_io_error(bio); + return true; + } + + /* + * If we have a zone write plug, set its write pointer offset to 0 + * (reset case) or to the zone size (finish case). This will abort all + * BIOs plugged for the target zone. It is fine as resetting or + * finishing zones while writes are still in-flight will result in the + * writes failing anyway. + */ + zwplug = disk_get_zone_wplug(disk, sector); + if (zwplug) { + disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); + disk_put_zone_wplug(zwplug); + } + + return false; +} + +static bool blk_zone_wplug_handle_reset_all(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct blk_zone_wplug *zwplug; + sector_t sector; + + /* + * Set the write pointer offset of all zone write plugs to 0. This will + * abort all plugged BIOs. It is fine as resetting zones while writes + * are still in-flight will result in the writes failing anyway. 
+ */ + for (sector = 0; sector < get_capacity(disk); + sector += disk->queue->limits.chunk_sectors) { + zwplug = disk_get_zone_wplug(disk, sector); + if (zwplug) { + disk_zone_wplug_set_wp_offset(disk, zwplug, 0); + disk_put_zone_wplug(zwplug); + } + } + + return false; +} + +static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug, + struct bio *bio, unsigned int nr_segs) +{ + /* + * Grab an extra reference on the BIO request queue usage counter. + * This reference will be reused to submit a request for the BIO for + * blk-mq devices and dropped when the BIO is failed and after + * it is issued in the case of BIO-based devices. + */ + percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter); + + /* + * The BIO is being plugged and thus will have to wait for the on-going + * write and for all other writes already plugged. So polling makes + * no sense. + */ + bio_clear_polled(bio); + + /* + * Reuse the poll cookie field to store the number of segments when + * split to the hardware limits. + */ + bio->__bi_nr_segments = nr_segs; + + /* + * We always receive BIOs after they are split and ready to be issued. + * The block layer passes the parts of a split BIO in order, and the + * user must also issue write sequentially. So simply add the new BIO + * at the tail of the list to preserve the sequential write order. + */ + bio_list_add(&zwplug->bio_list, bio); +} + +/* + * Called from bio_attempt_back_merge() when a BIO was merged with a request. + */ +void blk_zone_write_plug_bio_merged(struct bio *bio) +{ + struct blk_zone_wplug *zwplug; + unsigned long flags; + + /* + * If the BIO was already plugged, then we were called through + * blk_zone_write_plug_attempt_merge() -> blk_attempt_bio_merge(). + * For this case, blk_zone_write_plug_attempt_merge() will handle the + * zone write pointer offset update. 
+ */ + if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) + return; + + bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); + + /* + * Increase the plug reference count and advance the zone write + * pointer offset. + */ + zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk, + bio->bi_iter.bi_sector); + spin_lock_irqsave(&zwplug->lock, flags); + zwplug->wp_offset += bio_sectors(bio); + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +/* + * Attempt to merge plugged BIOs with a newly prepared request for a BIO that + * already went through zone write plugging (either a new BIO or one that was + * unplugged). + */ +void blk_zone_write_plug_attempt_merge(struct request *req) +{ + sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req); + struct request_queue *q = req->q; + struct gendisk *disk = q->disk; + unsigned int zone_capacity = disk->zone_capacity; + struct blk_zone_wplug *zwplug = + disk_get_zone_wplug(disk, blk_rq_pos(req)); + unsigned long flags; + struct bio *bio; + + /* + * Completion of this request needs to be handled with + * blk_zone_write_plug_complete_request(). + */ + req->rq_flags |= RQF_ZONE_WRITE_PLUGGING; + + if (blk_queue_nomerges(q)) + return; + + /* + * Walk through the list of plugged BIOs to check if they can be merged + * into the back of the request. + */ + spin_lock_irqsave(&zwplug->lock, flags); + while (zwplug->wp_offset < zone_capacity) { + bio = bio_list_peek(&zwplug->bio_list); + if (!bio) + break; + + if (bio->bi_iter.bi_sector != req_back_sector || + !blk_rq_merge_ok(req, bio)) + break; + + WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES && + !bio->__bi_nr_segments); + + bio_list_pop(&zwplug->bio_list); + if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) != + BIO_MERGE_OK) { + bio_list_add_head(&zwplug->bio_list, bio); + break; + } + + /* + * Drop the extra reference on the queue usage we got when + * plugging the BIO and advance the write pointer offset. 
+ */ + blk_queue_exit(q); + zwplug->wp_offset += bio_sectors(bio); + + req_back_sector += bio_sectors(bio); + } + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +static inline void disk_zone_wplug_set_error(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) { + unsigned long flags; + + /* + * Increase the plug reference count. The reference will be + * dropped in disk_zone_wplugs_work() once the error state + * is handled. + */ + zwplug->flags |= BLK_ZONE_WPLUG_ERROR; + atomic_inc(&zwplug->ref); + + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + } +} + +/* + * Check and prepare a BIO for submission by incrementing the write pointer + * offset of its zone write plug. + */ +static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, + struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + + /* + * Check that the user is not attempting to write to a full zone. + * We know such BIO will fail, and that would potentially overflow our + * write pointer offset beyond the end of the zone. + */ + if (zwplug->wp_offset >= disk->zone_capacity) + goto err; + + /* + * Check for non-sequential writes early because we avoid a + * whole lot of error handling trouble if we don't send it off + * to the driver. + */ + if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) + goto err; + + /* Advance the zone write pointer offset. */ + zwplug->wp_offset += bio_sectors(bio); + + return true; + +err: + /* We detected an invalid write BIO: schedule error recovery. 
*/ + disk_zone_wplug_set_error(disk, zwplug); + kblockd_schedule_work(&disk->zone_wplugs_work); + return false; +} + +static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + sector_t sector = bio->bi_iter.bi_sector; + struct blk_zone_wplug *zwplug; + gfp_t gfp_mask = GFP_NOIO; + unsigned long flags; + + /* + * BIOs must be fully contained within a zone so that we use the correct + * zone write plug for the entire BIO. For blk-mq devices, the block + * layer should already have done any splitting required to ensure this + * and this BIO should thus not be straddling zone boundaries. For + * BIO-based devices, it is the responsibility of the driver to split + * the bio before submitting it. + */ + if (WARN_ON_ONCE(bio_straddles_zones(bio))) { + bio_io_error(bio); + return true; + } + + /* Conventional zones do not need write plugging. */ + if (disk_zone_is_conv(disk, sector)) + return false; + + if (bio->bi_opf & REQ_NOWAIT) + gfp_mask = GFP_NOWAIT; + + zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags); + if (!zwplug) { + bio_io_error(bio); + return true; + } + + /* Indicate that this BIO is being handled using zone write plugging. */ + bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); + + /* + * If the zone is already plugged or has a pending error, add the BIO + * to the plug BIO list. Otherwise, plug and let the BIO execute. + */ + if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) + goto plug; + + /* + * If an error is detected when preparing the BIO, add it to the BIO + * list so that error recovery can deal with it. 
+ */ + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) + goto plug; + + zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; + + spin_unlock_irqrestore(&zwplug->lock, flags); + + return false; + +plug: + zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; + blk_zone_wplug_add_bio(zwplug, bio, nr_segs); + + spin_unlock_irqrestore(&zwplug->lock, flags); + + return true; +} + +/** + * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging + * @bio: The BIO being submitted + * @nr_segs: The number of physical segments of @bio + * + * Handle write and write zeroes operations using zone write plugging. + * + * Return true whenever @bio execution needs to be delayed through the zone + * write plug. Otherwise, return false to let the submission path process + * @bio normally. + */ +bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) +{ + struct block_device *bdev = bio->bi_bdev; + + if (!bdev->bd_disk->zone_wplugs_hash) + return false; + + /* + * If the BIO already has the plugging flag set, then it was already + * handled through this path and this is a submission from the zone + * plug bio submit work. + */ + if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) + return false; + + /* + * We do not need to do anything special for empty flush BIOs, e.g + * BIOs such as issued by blkdev_issue_flush(). The is because it is + * the responsibility of the user to first wait for the completion of + * write operations for flush to have any effect on the persistence of + * the written data. + */ + if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + return false; + + /* + * Regular writes and write zeroes need to be handled through the target + * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH + * which may need to go through the flush machinery depending on the + * target device capabilities. 
Plugging such writes is fine as the flush + * machinery operates at the request level, below the plug, and + * completion of the flush sequence will go through the regular BIO + * completion, which will handle zone write plugging. + * Zone reset, reset all and finish commands need special treatment + * to correctly track the write pointer offset of zones. These commands + * are not plugged as we do not need serialization with write + * operations. It is the responsibility of the user to not issue reset + * and finish commands when write operations are in flight. + */ + switch (bio_op(bio)) { + case REQ_OP_WRITE: + case REQ_OP_WRITE_ZEROES: + return blk_zone_wplug_handle_write(bio, nr_segs); + case REQ_OP_ZONE_RESET: + return blk_zone_wplug_handle_reset_or_finish(bio, 0); + case REQ_OP_ZONE_FINISH: + return blk_zone_wplug_handle_reset_or_finish(bio, + bdev_zone_sectors(bdev)); + case REQ_OP_ZONE_RESET_ALL: + return blk_zone_wplug_handle_reset_all(bio); + default: + return false; + } + + return false; +} +EXPORT_SYMBOL_GPL(blk_zone_plug_bio); + +static void disk_zone_wplug_unplug_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * If we had an error, schedule error recovery. The recovery work + * will restart submission of plugged BIOs. + */ + if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) { + spin_unlock_irqrestore(&zwplug->lock, flags); + kblockd_schedule_work(&disk->zone_wplugs_work); + return; + } + + /* Schedule submission of the next plugged BIO if we have one. */ + if (!bio_list_empty(&zwplug->bio_list)) { + spin_unlock_irqrestore(&zwplug->lock, flags); + kblockd_schedule_work(&zwplug->bio_work); + return; + } + + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + + /* + * If the zone is full (it was fully written or finished, or empty + * (it was reset), remove its zone write plug from the hash table. 
+ */ + if (disk_should_remove_zone_wplug(disk, zwplug)) + disk_remove_zone_wplug(disk, zwplug); + + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +void blk_zone_write_plug_bio_endio(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct blk_zone_wplug *zwplug = + disk_get_zone_wplug(bio->bi_bdev->bd_disk, + bio->bi_iter.bi_sector); + unsigned long flags; + + if (WARN_ON_ONCE(!zwplug)) + return; + + /* Make sure we do not see this BIO again by clearing the plug flag. */ + bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); + + /* + * If the BIO failed, mark the plug as having an error to trigger + * recovery. + */ + if (bio->bi_status != BLK_STS_OK) { + spin_lock_irqsave(&zwplug->lock, flags); + disk_zone_wplug_set_error(disk, zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); + } + + /* + * For BIO-based devices, blk_zone_write_plug_complete_request() + * is not called. So we need to schedule execution of the next + * plugged BIO here. + */ + if (bio->bi_bdev->bd_has_submit_bio) + disk_zone_wplug_unplug_bio(disk, zwplug); + + /* Drop the reference we took when the BIO was issued. */ + atomic_dec(&zwplug->ref); + disk_put_zone_wplug(zwplug); +} + +void blk_zone_write_plug_complete_request(struct request *req) +{ + struct gendisk *disk = req->q->disk; + struct blk_zone_wplug *zwplug = disk_get_zone_wplug(disk, req->__sector); + + if (WARN_ON_ONCE(!zwplug)) + return; + + req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING; + + disk_zone_wplug_unplug_bio(disk, zwplug); + + /* + * Drop the reference we took when the request was initialized in + * blk_zone_write_plug_attempt_merge(). + */ + atomic_dec(&zwplug->ref); + disk_put_zone_wplug(zwplug); +} + +static void blk_zone_wplug_bio_work(struct work_struct *work) +{ + struct blk_zone_wplug *zwplug = + container_of(work, struct blk_zone_wplug, bio_work); + struct block_device *bdev; + unsigned long flags; + struct bio *bio; + + /* + * Submit the next plugged BIO. 
If we do not have any, clear + * the plugged flag. + */ + spin_lock_irqsave(&zwplug->lock, flags); + + bio = bio_list_pop(&zwplug->bio_list); + if (!bio) { + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + spin_unlock_irqrestore(&zwplug->lock, flags); + return; + } + + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { + /* Error recovery will decide what to do with the BIO. */ + bio_list_add_head(&zwplug->bio_list, bio); + spin_unlock_irqrestore(&zwplug->lock, flags); + return; + } + + spin_unlock_irqrestore(&zwplug->lock, flags); + + bdev = bio->bi_bdev; + submit_bio_noacct_nocheck(bio); + + /* + * blk-mq devices will reuse the extra reference on the request queue + * usage counter we took when the BIO was plugged, but the submission + * path for BIO-based devices will not do that. So drop this extra + * reference here. + */ + if (bdev->bd_has_submit_bio) + blk_queue_exit(bdev->bd_disk->queue); +} + +static unsigned int blk_zone_wp_offset(struct blk_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return zone->wp - zone->start; + case BLK_ZONE_COND_FULL: + return zone->len; + case BLK_ZONE_COND_EMPTY: + return 0; + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + default: + /* + * Conventional, offline and read-only zones do not have a valid + * write pointer. + */ + return UINT_MAX; + } +} + +static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone, + unsigned int idx, void *data) +{ + struct blk_zone *zonep = data; + + *zonep = *zone; + return 0; +} + +static void disk_zone_wplug_handle_error(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + sector_t zone_start_sector = + bdev_zone_sectors(disk->part0) * zwplug->zone_no; + unsigned int noio_flag; + struct blk_zone zone; + unsigned long flags; + int ret; + + /* Get the current zone information from the device. 
*/ + noio_flag = memalloc_noio_save(); + ret = disk->fops->report_zones(disk, zone_start_sector, 1, + blk_zone_wplug_report_zone_cb, &zone); + memalloc_noio_restore(noio_flag); + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * A zone reset or finish may have cleared the error already. In such + * case, do nothing as the report zones may have seen the "old" write + * pointer value before the reset/finish operation completed. + */ + if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) + goto unlock; + + zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; + + if (ret != 1) { + /* + * We failed to get the zone information, meaning that something + * is likely really wrong with the device. Abort all remaining + * plugged BIOs as otherwise we could end up waiting forever on + * plugged BIOs to complete if there is a queue freeze on-going. + */ + disk_zone_wplug_abort(zwplug); + goto unplug; + } + + /* Update the zone write pointer offset. */ + zwplug->wp_offset = blk_zone_wp_offset(&zone); + disk_zone_wplug_abort_unaligned(disk, zwplug); + + /* Restart BIO submission if we still have any BIO left.
*/ + if (!bio_list_empty(&zwplug->bio_list)) { + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); + kblockd_schedule_work(&zwplug->bio_work); + goto unlock; + } + +unplug: + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + if (disk_should_remove_zone_wplug(disk, zwplug)) + disk_remove_zone_wplug(disk, zwplug); + +unlock: + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +static void disk_zone_wplugs_work(struct work_struct *work) +{ + struct gendisk *disk = + container_of(work, struct gendisk, zone_wplugs_work); + struct blk_zone_wplug *zwplug; + unsigned long flags; + + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + + while (!list_empty(&disk->zone_wplugs_err_list)) { + zwplug = list_first_entry(&disk->zone_wplugs_err_list, + struct blk_zone_wplug, link); + list_del_init(&zwplug->link); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + + disk_zone_wplug_handle_error(disk, zwplug); + disk_put_zone_wplug(zwplug); + + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + } + + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); +} + +static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) +{ + return 1U << disk->zone_wplugs_hash_bits; +} + +void disk_init_zone_resources(struct gendisk *disk) +{ + spin_lock_init(&disk->zone_wplugs_lock); + INIT_LIST_HEAD(&disk->zone_wplugs_err_list); + INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work); +} + +/* + * For the size of a disk zone write plug hash table, use the size of the + * zone write plug mempool, which is the maximum of the disk open zones and + * active zones limits. But do not exceed 4KB (512 hlist head entries), that is, + * 9 bits. For a disk that has no limits, mempool size defaults to 128. 
+ */ +#define BLK_ZONE_WPLUG_MAX_HASH_BITS 9 +#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128 + +static int disk_alloc_zone_resources(struct gendisk *disk, + unsigned int pool_size) +{ + unsigned int i; + + disk->zone_wplugs_hash_bits = + min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS); + + disk->zone_wplugs_hash = + kcalloc(disk_zone_wplugs_hash_size(disk), + sizeof(struct hlist_head), GFP_KERNEL); + if (!disk->zone_wplugs_hash) + return -ENOMEM; + + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) + INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]); + + disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size, + sizeof(struct blk_zone_wplug)); + if (!disk->zone_wplugs_pool) { + kfree(disk->zone_wplugs_hash); + disk->zone_wplugs_hash = NULL; + disk->zone_wplugs_hash_bits = 0; + return -ENOMEM; + } + + return 0; +} + +static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) +{ + struct blk_zone_wplug *zwplug; + unsigned int i; + + if (!disk->zone_wplugs_hash) + return; + + /* Free all the zone write plugs we have. */ + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { + while (!hlist_empty(&disk->zone_wplugs_hash[i])) { + zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, + struct blk_zone_wplug, node); + atomic_inc(&zwplug->ref); + disk_remove_zone_wplug(disk, zwplug); + disk_put_zone_wplug(zwplug); + } + } + + kfree(disk->zone_wplugs_hash); + disk->zone_wplugs_hash = NULL; + disk->zone_wplugs_hash_bits = 0; +} + +void disk_free_zone_resources(struct gendisk *disk) +{ + cancel_work_sync(&disk->zone_wplugs_work); + + disk_destroy_zone_wplugs_hash_table(disk); + + /* + * Wait for the zone write plugs to be RCU-freed before + * destroying the mempool.
+ */ + rcu_barrier(); + + mempool_destroy(disk->zone_wplugs_pool); + disk->zone_wplugs_pool = NULL; + kfree(disk->conv_zones_bitmap); disk->conv_zones_bitmap = NULL; kfree(disk->seq_zones_wlock); disk->seq_zones_wlock = NULL; + + disk->zone_capacity = 0; + disk->nr_zones = 0; +} + +static int disk_revalidate_zone_resources(struct gendisk *disk, + unsigned int nr_zones) +{ + struct queue_limits *lim = &disk->queue->limits; + unsigned int pool_size; + + /* + * If the device has no limit on the maximum number of open and active + * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE. + */ + pool_size = max(lim->max_open_zones, lim->max_active_zones); + if (!pool_size) + pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones); + + if (!disk->zone_wplugs_hash) + return disk_alloc_zone_resources(disk, pool_size); + + /* Resize the zone write plug memory pool if needed. */ + if (disk->zone_wplugs_pool->min_nr != pool_size) + return mempool_resize(disk->zone_wplugs_pool, pool_size); + + return 0; } struct blk_revalidate_zone_args { @@ -453,6 +1514,9 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, struct request_queue *q = disk->queue; sector_t capacity = get_capacity(disk); sector_t zone_sectors = q->limits.chunk_sectors; + struct blk_zone_wplug *zwplug; + unsigned long flags; + unsigned int wp_offset; /* Check for bad zones and holes in the zone report */ if (zone->start != args->sector) { @@ -524,6 +1588,22 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, disk->disk_name); return -ENODEV; } + + /* + * We need to track the write pointer of all zones that are not + * empty nor full. So make sure we have a zone write plug for + * such zone. 
+ */ + wp_offset = blk_zone_wp_offset(zone); + if (wp_offset && wp_offset < zone_sectors) { + zwplug = disk_get_and_lock_zone_wplug(disk, zone->start, + GFP_NOIO, &flags); + if (!zwplug) + return -ENOMEM; + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + } + break; case BLK_ZONE_TYPE_SEQWRITE_PREF: default: @@ -560,7 +1640,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk, sector_t capacity = get_capacity(disk); struct blk_revalidate_zone_args args = { }; unsigned int noio_flag; - int ret; + int ret = -ENOMEM; if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) return -EIO; @@ -593,6 +1673,11 @@ int blk_revalidate_disk_zones(struct gendisk *disk, args.disk = disk; args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); noio_flag = memalloc_noio_save(); + ret = disk_revalidate_zone_resources(disk, args.nr_zones); + if (ret) { + memalloc_noio_restore(noio_flag); + return ret; + } ret = disk->fops->report_zones(disk, 0, UINT_MAX, blk_revalidate_zone_cb, &args); if (!ret) { @@ -627,7 +1712,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk, ret = 0; } else { pr_warn("%s: failed to revalidate zones\n", disk->disk_name); - disk_free_zone_bitmaps(disk); + disk_free_zone_resources(disk); } blk_mq_unfreeze_queue(q); diff --git a/block/blk.h b/block/blk.h index bca50a9510c8..4df969f8fa28 100644 --- a/block/blk.h +++ b/block/blk.h @@ -415,7 +415,14 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, } #ifdef CONFIG_BLK_DEV_ZONED -void disk_free_zone_bitmaps(struct gendisk *disk); +void disk_init_zone_resources(struct gendisk *disk); +void disk_free_zone_resources(struct gendisk *disk); +static inline bool bio_zone_write_plugging(struct bio *bio) +{ + return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); +} +void blk_zone_write_plug_bio_merged(struct bio *bio); +void blk_zone_write_plug_attempt_merge(struct request *rq); static inline void blk_zone_update_request_bio(struct request *rq, struct bio *bio) { @@ -423,22 +430,60 
@@ static inline void blk_zone_update_request_bio(struct request *rq, * For zone append requests, the request sector indicates the location * at which the BIO data was written. Return this value to the BIO * issuer through the BIO iter sector. + * For plugged zone writes, we need the original BIO sector so + * that blk_zone_write_plug_bio_endio() can lookup the zone write plug. */ - if (req_op(rq) == REQ_OP_ZONE_APPEND) + if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio)) bio->bi_iter.bi_sector = rq->__sector; } +void blk_zone_write_plug_bio_endio(struct bio *bio); +static inline void blk_zone_bio_endio(struct bio *bio) +{ + /* + * For write BIOs to zoned devices, signal the completion of the BIO so + * that the next write BIO can be submitted by zone write plugging. + */ + if (bio_zone_write_plugging(bio)) + blk_zone_write_plug_bio_endio(bio); +} + +void blk_zone_write_plug_complete_request(struct request *rq); +static inline void blk_zone_complete_request(struct request *rq) +{ + if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING) + blk_zone_write_plug_complete_request(rq); +} int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg); int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ -static inline void disk_free_zone_bitmaps(struct gendisk *disk) +static inline void disk_init_zone_resources(struct gendisk *disk) +{ +} +static inline void disk_free_zone_resources(struct gendisk *disk) +{ +} +static inline bool bio_zone_write_plugging(struct bio *bio) +{ + return false; +} +static inline void blk_zone_write_plug_bio_merged(struct bio *bio) +{ +} +static inline void blk_zone_write_plug_attempt_merge(struct request *rq) { } static inline void blk_zone_update_request_bio(struct request *rq, struct bio *bio) { } +static inline void blk_zone_bio_endio(struct bio *bio) +{ +} +static inline void blk_zone_complete_request(struct 
request *rq) +{ +} static inline int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) { diff --git a/block/genhd.c b/block/genhd.c index bb29a68e1d67..eb893df56d51 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1182,7 +1182,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); - disk_free_zone_bitmaps(disk); + disk_free_zone_resources(disk); xa_destroy(&disk->part_tbl); disk->queue->disk = NULL; @@ -1364,6 +1364,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, if (blkcg_init_disk(disk)) goto out_erase_part0; + disk_init_zone_resources(disk); rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d3d8fd8e229b..60090c8366fb 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -56,6 +56,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) /* The per-zone write lock is held for this request */ #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) +/* The request completion needs to be signaled to zone write plugging.
*/ +#define RQF_ZONE_WRITE_PLUGGING ((__force req_flags_t)(1 << 20)) /* ->timeout has been called, don't expire again */ #define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) #define RQF_RESV ((__force req_flags_t)(1 << 23)) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index cb1526ec44b5..ed45de07d2ef 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -234,7 +234,12 @@ struct bio { struct bvec_iter bi_iter; - blk_qc_t bi_cookie; + union { + /* for polled bios: */ + blk_qc_t bi_cookie; + /* for plugged zoned writes only: */ + unsigned int __bi_nr_segments; + }; bio_end_io_t *bi_end_io; void *bi_private; #ifdef CONFIG_BLK_CGROUP @@ -305,6 +310,7 @@ enum { BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_REMAPPED, BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ + BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */ BIO_FLAG_LAST }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4e81f714cca7..348b57ca0425 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -194,6 +194,12 @@ struct gendisk { unsigned int zone_capacity; unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; + unsigned int zone_wplugs_hash_bits; + spinlock_t zone_wplugs_lock; + struct mempool_s *zone_wplugs_pool; + struct hlist_head *zone_wplugs_hash; + struct list_head zone_wplugs_err_list; + struct work_struct zone_wplugs_work; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) @@ -663,6 +669,7 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) return bdev->bd_disk->queue->limits.max_active_zones; } +bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int bdev_nr_zones(struct block_device *bdev) { @@ -690,6 +697,10 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) { return 0; } +static inline bool blk_zone_plug_bio(struct bio *bio, 
unsigned int nr_segs) +{ + return false; +} #endif /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int blk_queue_depth(struct request_queue *q) -- cgit v1.2.3 From 843283e96e5a3d8379579ac13ce9cbf75522ffde Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:08 +0900 Subject: block: Fake max open zones limit when there is no limit For a zoned block device that has no limit on the number of open zones and no limit on the number of active zones, the zone write plug mempool is created with a size of 128 zone write plugs. For such case, set the device max_open_zones queue limit to this value to indicate to the user the potential performance penalty that may happen when writing simultaneously to more zones than the mempool size. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240408014128.205141-9-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index fefcebd70445..4b21a1ec00d4 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1503,6 +1503,38 @@ struct blk_revalidate_zone_args { sector_t sector; }; +/* + * Update the disk zone resources information and device queue limits. + * The disk queue is frozen when this is executed. 
+ */ +static int disk_update_zone_resources(struct gendisk *disk, + struct blk_revalidate_zone_args *args) +{ + struct request_queue *q = disk->queue; + struct queue_limits lim; + + disk->nr_zones = args->nr_zones; + disk->zone_capacity = args->zone_capacity; + swap(disk->seq_zones_wlock, args->seq_zones_wlock); + swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); + + /* + * If the device has no limit on the maximum number of open and active + * zones, set its max open zone limit to the mempool size to indicate + * to the user that there is a potential performance impact due to + * dynamic zone write plug allocation when simultaneously writing to + * more zones than the size of the mempool. + */ + if (disk->zone_wplugs_pool) { + lim = queue_limits_start_update(q); + if (!lim.max_open_zones && !lim.max_active_zones) + lim.max_open_zones = disk->zone_wplugs_pool->min_nr; + return queue_limits_commit_update(q, &lim); + } + + return 0; +} + /* * Helper function to check the validity of zones of a zoned block device. 
*/ @@ -1703,17 +1735,14 @@ int blk_revalidate_disk_zones(struct gendisk *disk, */ blk_mq_freeze_queue(q); if (ret > 0) { - disk->nr_zones = args.nr_zones; - disk->zone_capacity = args.zone_capacity; - swap(disk->seq_zones_wlock, args.seq_zones_wlock); - swap(disk->conv_zones_bitmap, args.conv_zones_bitmap); + ret = disk_update_zone_resources(disk, &args); if (update_driver_data) update_driver_data(disk); - ret = 0; } else { pr_warn("%s: failed to revalidate zones\n", disk->disk_name); - disk_free_zone_resources(disk); } + if (ret) + disk_free_zone_resources(disk); blk_mq_unfreeze_queue(q); kfree(args.seq_zones_wlock); -- cgit v1.2.3 From ccdbf0aad2523ca133cceb22ce0f8306730e7ac3 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:09 +0900 Subject: block: Allow zero value of max_zone_append_sectors queue limit In preparation for adding a generic zone append emulation using zone write plugging, allow device drivers supporting zoned block device to set the max_zone_append_sectors queue limit of a device to 0 to indicate the lack of native support for zone append operations and that the block layer should emulate these operations using regular write operations. blk_queue_max_zone_append_sectors() is modified to allow passing 0 as the max_zone_append_sectors argument. The function queue_max_zone_append_sectors() is also modified to ensure that the minimum of the max_hw_sectors and chunk_sectors limit is used whenever the max_zone_append_sectors limit is 0. This minimum is consistent with the value set for the max_zone_append_sectors limit by the function blk_validate_zoned_limits() when limits for a queue are validated. The helper functions queue_emulates_zone_append() and bdev_emulates_zone_append() are added to test if a queue (or block device) emulates zone append operations.
In order for blk_revalidate_disk_zones() to accept zoned block devices relying on zone append emulation, the direct check to the max_zone_append_sectors queue limit of the disk is replaced by a check using the value returned by queue_max_zone_append_sectors(). Similarly, queue_zone_append_max_show() is modified to use the same accessor so that the sysfs attribute advertises the non-zero limit that will be used, regardless of whether it is for native or emulated commands. For stacking drivers, a top device should not need to care if the underlying devices have native or emulated zone append operations. blk_stack_limits() is thus modified to set the top device max_zone_append_sectors limit using the new accessor queue_limits_max_zone_append_sectors(). queue_max_zone_append_sectors() is modified to use this function as well. Stacking drivers that require zone append emulation, e.g. dm-crypt, can still request this feature by calling blk_queue_max_zone_append_sectors() with a 0 limit. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20240408014128.205141-10-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-settings.c | 30 +++++++++++++++++++----------- block/blk-sysfs.c | 2 +- block/blk-zoned.c | 2 +- include/linux/blkdev.h | 23 ++++++++++++++++++++--- 5 files changed, 42 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index a16b5abdbbf5..3bf28149e104 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -602,7 +602,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_IOERR; /* Make sure the BIO is small enough and will not get split */ - if (nr_sectors > q->limits.max_zone_append_sectors) + if (nr_sectors > queue_max_zone_append_sectors(q)) return BLK_STS_IOERR; bio->bi_opf |= REQ_NOMERGE; diff --git a/block/blk-settings.c b/block/blk-settings.c index cdbaef159c4b..c0197e1e7485 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -412,24 +412,32 @@ EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); * blk_queue_max_zone_append_sectors - set max sectors for a single zone append * @q: the request queue for the device * @max_zone_append_sectors: maximum number of sectors to write per command + * + * Sets the maximum number of sectors allowed for zone append commands. + * Specifying 0 for @max_zone_append_sectors indicates that the queue does + * not natively support zone append operations and that the block layer must + * emulate these operations using regular writes.
**/ void blk_queue_max_zone_append_sectors(struct request_queue *q, unsigned int max_zone_append_sectors) { - unsigned int max_sectors; + unsigned int max_sectors = 0; if (WARN_ON(!blk_queue_is_zoned(q))) return; - max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors); - max_sectors = min(q->limits.chunk_sectors, max_sectors); + if (max_zone_append_sectors) { + max_sectors = min(q->limits.max_hw_sectors, + max_zone_append_sectors); + max_sectors = min(q->limits.chunk_sectors, max_sectors); - /* - * Signal eventual driver bugs resulting in the max_zone_append sectors limit - * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set, - * or the max_hw_sectors limit not set. - */ - WARN_ON(!max_sectors); + /* + * Signal eventual driver bugs resulting in the max_zone_append + * sectors limit being 0 due to the chunk_sectors limit (zone + * size) not set or the max_hw_sectors limit not set. + */ + WARN_ON_ONCE(!max_sectors); + } q->limits.max_zone_append_sectors = max_sectors; } @@ -756,8 +764,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); - t->max_zone_append_sectors = min(t->max_zone_append_sectors, - b->max_zone_append_sectors); + t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t), + queue_limits_max_zone_append_sectors(b)); t->bounce = max(t->bounce, b->bounce); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8c8f69d8ba48..e3ed5a921aff 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -224,7 +224,7 @@ static ssize_t queue_zone_write_granularity_show(struct request_queue *q, static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) { - unsigned long long max_sectors = q->limits.max_zone_append_sectors; + unsigned long long 
max_sectors = queue_max_zone_append_sectors(q); return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT); } diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 4b21a1ec00d4..fcc1284b7c19 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1692,7 +1692,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk, return -ENODEV; } - if (!q->limits.max_zone_append_sectors) { + if (!queue_max_zone_append_sectors(q)) { pr_warn("%s: Invalid 0 maximum zone append limit\n", disk->disk_name); return -ENODEV; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 348b57ca0425..46613bf6a402 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1173,12 +1173,29 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } -static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q) +static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l) { + unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); - const struct queue_limits *l = &q->limits; + return min_not_zero(l->max_zone_append_sectors, max_sectors); +} + +static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q) +{ + if (!blk_queue_is_zoned(q)) + return 0; - return min(l->max_zone_append_sectors, l->max_sectors); + return queue_limits_max_zone_append_sectors(&q->limits); +} + +static inline bool queue_emulates_zone_append(struct request_queue *q) +{ + return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors; +} + +static inline bool bdev_emulates_zone_append(struct block_device *bdev) +{ + return queue_emulates_zone_append(bdev_get_queue(bdev)); } static inline unsigned int -- cgit v1.2.3 From 9b1ce7f0c6f82e241196febabddba5fab66c8f05 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:10 +0900 Subject: block: Implement zone append emulation Given that zone write plugging manages all writes to zones of 
a zoned block device and tracks the write pointer position of all zones that are not full nor empty, emulating zone append operations using regular writes can be implemented generically, without relying on the underlying device driver to implement such emulation. This is needed for devices that do not natively support the zone append command (e.g. SMR hard-disks). A device may request zone append emulation by setting its max_zone_append_sectors queue limit to 0. For such device, the function blk_zone_wplug_prepare_bio() changes zone append BIOs into non-mergeable regular write BIOs. Modified zone append BIOs are flagged with the new BIO flag BIO_EMULATES_ZONE_APPEND. This flag is checked on completion of the BIO in blk_zone_write_plug_bio_endio() to restore the original REQ_OP_ZONE_APPEND operation code of the BIO. The block layer internal inline helper function bio_is_zone_append() is added to test if a BIO is either a native zone append operation (REQ_OP_ZONE_APPEND operation code) or if it is flagged with BIO_EMULATES_ZONE_APPEND. Given that both native and emulated zone append BIO completion handling should be similar, The functions blk_update_request() and blk_zone_complete_request_bio() are modified to use bio_is_zone_append() to execute blk_zone_update_request_bio() for both native and emulated zone append operations. This commit contains contributions from Christoph Hellwig . Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20240408014128.205141-11-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- block/blk-zoned.c | 64 +++++++++++++++++++++++++++++++++++++++-------- block/blk.h | 14 +++++++++-- include/linux/blk_types.h | 1 + 4 files changed, 67 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 48eb7dd049d1..6f9cc1c4d4fb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -906,8 +906,7 @@ bool blk_update_request(struct request *req, blk_status_t error, if (bio_bytes == bio->bi_iter.bi_size) { req->bio = bio->bi_next; - } else if (req_op(req) == REQ_OP_ZONE_APPEND && - error == BLK_STS_OK) { + } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) { /* * Partial zone append completions cannot be supported * as the BIO fragments may end up not being written diff --git a/block/blk-zoned.c b/block/blk-zoned.c index fcc1284b7c19..a60ac5b3e637 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -689,7 +689,8 @@ static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, while ((bio = bio_list_pop(&zwplug->bio_list))) { if (wp_offset >= zone_capacity || - bio_offset_from_zone_start(bio) != wp_offset) { + (bio_op(bio) != REQ_OP_ZONE_APPEND && + bio_offset_from_zone_start(bio) != wp_offset)) { blk_zone_wplug_bio_io_error(bio); disk_put_zone_wplug(zwplug); continue; @@ -951,7 +952,8 @@ static inline void disk_zone_wplug_set_error(struct gendisk *disk, /* * Check and prepare a BIO for submission by incrementing the write pointer - * offset of its zone write plug. + * offset of its zone write plug and changing zone append operations into + * regular write when zone append emulation is needed. 
*/ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, struct bio *bio) @@ -966,13 +968,30 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, if (zwplug->wp_offset >= disk->zone_capacity) goto err; - /* - * Check for non-sequential writes early because we avoid a - * whole lot of error handling trouble if we don't send it off - * to the driver. - */ - if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) - goto err; + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + /* + * Use a regular write starting at the current write pointer. + * Similarly to native zone append operations, do not allow + * merging. + */ + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE; + bio->bi_iter.bi_sector += zwplug->wp_offset; + + /* + * Remember that this BIO is in fact a zone append operation + * so that we can restore its operation code on completion. + */ + bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND); + } else { + /* + * Check for non-sequential writes early because we avoid a + * whole lot of error handling trouble if we don't send it off + * to the driver. + */ + if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) + goto err; + } /* Advance the zone write pointer offset. */ zwplug->wp_offset += bio_sectors(bio); @@ -1008,8 +1027,14 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) } /* Conventional zones do not need write plugging. */ - if (disk_zone_is_conv(disk, sector)) + if (disk_zone_is_conv(disk, sector)) { + /* Zone append to conventional zones is not allowed. */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + bio_io_error(bio); + return true; + } return false; + } if (bio->bi_opf & REQ_NOWAIT) gfp_mask = GFP_NOWAIT; @@ -1057,7 +1082,8 @@ plug: * @bio: The BIO being submitted * @nr_segs: The number of physical segments of @bio * - * Handle write and write zeroes operations using zone write plugging. 
+ * Handle write, write zeroes and zone append operations requiring emulation + * using zone write plugging. * * Return true whenever @bio execution needs to be delayed through the zone * write plug. Otherwise, return false to let the submission path process @@ -1096,6 +1122,9 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) * machinery operates at the request level, below the plug, and * completion of the flush sequence will go through the regular BIO * completion, which will handle zone write plugging. + * Zone append operations for devices that requested emulation must + * also be plugged so that these BIOs can be changed into regular + * write BIOs. * Zone reset, reset all and finish commands need special treatment * to correctly track the write pointer offset of zones. These commands * are not plugged as we do not need serialization with write @@ -1103,6 +1132,10 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) * and finish commands when write operations are in flight. */ switch (bio_op(bio)) { + case REQ_OP_ZONE_APPEND: + if (!bdev_emulates_zone_append(bdev)) + return false; + fallthrough; case REQ_OP_WRITE: case REQ_OP_WRITE_ZEROES: return blk_zone_wplug_handle_write(bio, nr_segs); @@ -1171,6 +1204,15 @@ void blk_zone_write_plug_bio_endio(struct bio *bio) /* Make sure we do not see this BIO again by clearing the plug flag. */ bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); + /* + * If this is a regular write emulating a zone append operation, + * restore the original operation code. + */ + if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) { + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= REQ_OP_ZONE_APPEND; + } + /* * If the BIO failed, mark the plug as having an error to trigger * recovery. 
diff --git a/block/blk.h b/block/blk.h index 4df969f8fa28..1140c4a0be03 100644 --- a/block/blk.h +++ b/block/blk.h @@ -421,6 +421,11 @@ static inline bool bio_zone_write_plugging(struct bio *bio) { return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); } +static inline bool bio_is_zone_append(struct bio *bio) +{ + return bio_op(bio) == REQ_OP_ZONE_APPEND || + bio_flagged(bio, BIO_EMULATES_ZONE_APPEND); +} void blk_zone_write_plug_bio_merged(struct bio *bio); void blk_zone_write_plug_attempt_merge(struct request *rq); static inline void blk_zone_update_request_bio(struct request *rq, @@ -430,8 +435,9 @@ static inline void blk_zone_update_request_bio(struct request *rq, * For zone append requests, the request sector indicates the location * at which the BIO data was written. Return this value to the BIO * issuer through the BIO iter sector. - * For plugged zone writes, we need the original BIO sector so - * that blk_zone_write_plug_bio_endio() can lookup the zone write plug. + * For plugged zone writes, which include emulated zone append, we need + * the original BIO sector so that blk_zone_write_plug_bio_endio() can + * lookup the zone write plug. 
*/ if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio)) bio->bi_iter.bi_sector = rq->__sector; @@ -468,6 +474,10 @@ static inline bool bio_zone_write_plugging(struct bio *bio) { return false; } +static inline bool bio_is_zone_append(struct bio *bio) +{ + return false; +} static inline void blk_zone_write_plug_bio_merged(struct bio *bio) { } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index ed45de07d2ef..29b3170431e7 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -311,6 +311,7 @@ enum { BIO_REMAPPED, BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */ + BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */ BIO_FLAG_LAST }; -- cgit v1.2.3 From 946dd71ed87dfa8d72f1404f906e1ae413a62d0f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:11 +0900 Subject: block: Allow BIO-based drivers to use blk_revalidate_disk_zones() In preparation for allowing BIO based device drivers to use zone write plugging and its zone append emulation, allow these drivers to call blk_revalidate_disk_zones() so that all zone resources necessary for zone write plugging can be initialized. To do so, remove the check in blk_revalidate_disk_zones() restricting the use of this function to mq request-based drivers so that BIO-based drivers can also use it. This is safe to do as long as the BIO-based block device queue is already set up and usable, as it should be, and can be safely frozen. The helper function disk_need_zone_resources() is added to control the allocation and initialization of the zone write plug hash table and of the conventional zone bitmap only for mq devices and for BIO-based devices that require zone append emulation.
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240408014128.205141-12-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index a60ac5b3e637..da0fc7e2d00a 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1512,12 +1512,28 @@ void disk_free_zone_resources(struct gendisk *disk) disk->nr_zones = 0; } +static inline bool disk_need_zone_resources(struct gendisk *disk) +{ + /* + * All mq zoned devices need zone resources so that the block layer + * can automatically handle write BIO plugging. BIO-based device drivers + * (e.g. DM devices) are normally responsible for handling zone write + * ordering and do not need zone resources, unless the driver requires + * zone append emulation. + */ + return queue_is_mq(disk->queue) || + queue_emulates_zone_append(disk->queue); +} + static int disk_revalidate_zone_resources(struct gendisk *disk, unsigned int nr_zones) { struct queue_limits *lim = &disk->queue->limits; unsigned int pool_size; + if (!disk_need_zone_resources(disk)) + return 0; + /* * If the device has no limit on the maximum number of open and active * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE. @@ -1635,6 +1651,9 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, disk->disk_name); return -ENODEV; } + + if (!disk_need_zone_resources(disk)) + break; if (!args->conv_zones_bitmap) { args->conv_zones_bitmap = blk_alloc_zone_bitmap(q->node, args->nr_zones); @@ -1666,10 +1685,11 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, /* * We need to track the write pointer of all zones that are not * empty nor full. 
So make sure we have a zone write plug for - * such zone. + * such zone if the device has a zone write plug hash table. */ wp_offset = blk_zone_wp_offset(zone); - if (wp_offset && wp_offset < zone_sectors) { + if (disk->zone_wplugs_hash && + wp_offset && wp_offset < zone_sectors) { zwplug = disk_get_and_lock_zone_wplug(disk, zone->start, GFP_NOIO, &flags); if (!zwplug) @@ -1700,8 +1720,8 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, * be called within the disk ->revalidate method for blk-mq based drivers. * Before calling this function, the device driver must already have set the * device zone size (chunk_sector limit) and the max zone append limit. - * For BIO based drivers, this function cannot be used. BIO based device drivers - * only need to set disk->nr_zones so that the sysfs exposed value is correct. + * BIO based drivers can also use this function as long as the device queue + * can be safely frozen. * If the @update_driver_data callback function is not NULL, the callback is * executed with the device request queue frozen after all zones have been * checked. @@ -1718,8 +1738,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk, if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) return -EIO; - if (WARN_ON_ONCE(!queue_is_mq(q))) - return -EIO; if (!capacity) return -ENODEV; -- cgit v1.2.3 From 63b5385e781417e73bda3fd652c2199826afda6e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:19 +0900 Subject: block: Remove BLK_STS_ZONE_RESOURCE The zone append emulation of the scsi disk driver was the only driver using BLK_STS_ZONE_RESOURCE. With this code removed, BLK_STS_ZONE_RESOURCE is now unused. Remove this macro definition and simplify blk_mq_dispatch_rq_list() where this status code was handled. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20240408014128.205141-20-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 26 -------------------------- drivers/scsi/scsi_lib.c | 1 - include/linux/blk_types.h | 20 ++++---------------- 3 files changed, 4 insertions(+), 43 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 6f9cc1c4d4fb..9f2d9970eeba 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1921,19 +1921,6 @@ static void blk_mq_handle_dev_resource(struct request *rq, __blk_mq_requeue_request(rq); } -static void blk_mq_handle_zone_resource(struct request *rq, - struct list_head *zone_list) -{ - /* - * If we end up here it is because we cannot dispatch a request to a - * specific zone due to LLD level zone-write locking or other zone - * related resource not being available. In this case, set the request - * aside in zone_list for retrying it later. - */ - list_add(&rq->queuelist, zone_list); - __blk_mq_requeue_request(rq); -} - enum prep_dispatch { PREP_DISPATCH_OK, PREP_DISPATCH_NO_TAG, @@ -2019,7 +2006,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, struct request *rq; int queued; blk_status_t ret = BLK_STS_OK; - LIST_HEAD(zone_list); bool needs_resource = false; if (list_empty(list)) @@ -2061,23 +2047,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, case BLK_STS_DEV_RESOURCE: blk_mq_handle_dev_resource(rq, list); goto out; - case BLK_STS_ZONE_RESOURCE: - /* - * Move the request to zone_list and keep going through - * the dispatch list to find more requests the drive can - * accept. - */ - blk_mq_handle_zone_resource(rq, &zone_list); - needs_resource = true; - break; default: blk_mq_end_request(rq, ret); } } while (!list_empty(list)); out: - if (!list_empty(&zone_list)) - list_splice_tail_init(&zone_list, list); - /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. 
*/ diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 2e28e2360c85..9ca96116bd33 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1870,7 +1870,6 @@ out_put_budget: case BLK_STS_OK: break; case BLK_STS_RESOURCE: - case BLK_STS_ZONE_RESOURCE: if (scsi_device_blocked(sdev)) ret = BLK_STS_DEV_RESOURCE; break; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 29b3170431e7..ffe0c112b128 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -136,18 +136,6 @@ typedef u16 blk_short_t; */ #define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13) -/* - * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone - * related resources are unavailable, but the driver can guarantee the queue - * will be rerun in the future once the resources become available again. - * - * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references - * a zone specific resource and IO to a different zone on the same device could - * still be served. Examples of that are zones that are write-locked, but a read - * to the same zone could be served. - */ -#define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14) - /* * BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion * path if the device returns a status indicating that too many zone resources @@ -155,7 +143,7 @@ typedef u16 blk_short_t; * after the number of open zones decreases below the device's limits, which is * reported in the request_queue's max_open_zones. */ -#define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)15) +#define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)14) /* * BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion @@ -164,20 +152,20 @@ typedef u16 blk_short_t; * after the number of active zones decreases below the device's limits, which * is reported in the request_queue's max_active_zones. 
*/ -#define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16) +#define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)15) /* * BLK_STS_OFFLINE is returned from the driver when the target device is offline * or is being taken offline. This could help differentiate the case where a * device is intentionally being shut down from a real I/O error. */ -#define BLK_STS_OFFLINE ((__force blk_status_t)17) +#define BLK_STS_OFFLINE ((__force blk_status_t)16) /* * BLK_STS_DURATION_LIMIT is returned from the driver when the target device * aborted the command because it exceeded one of its Command Duration Limits. */ -#define BLK_STS_DURATION_LIMIT ((__force blk_status_t)18) +#define BLK_STS_DURATION_LIMIT ((__force blk_status_t)17) /** * blk_path_error - returns true if error may be path related -- cgit v1.2.3 From 9b3c08b90fc212de58c34621d83e74977170b2cd Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:20 +0900 Subject: block: Simplify blk_revalidate_disk_zones() interface The only user of blk_revalidate_disk_zones() second argument was the SCSI disk driver (sd). Now that this driver does not require this update_driver_data argument, remove it to simplify the interface of blk_revalidate_disk_zones(). Also update the function kdoc comment to be more accurate (i.e. there is no gendisk ->revalidate method). Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20240408014128.205141-21-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 21 +++++++-------------- drivers/block/null_blk/zoned.c | 2 +- drivers/block/ublk_drv.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/md/dm-zone.c | 2 +- drivers/nvme/host/core.c | 2 +- drivers/scsi/sd_zbc.c | 2 +- include/linux/blkdev.h | 3 +-- 8 files changed, 14 insertions(+), 22 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index da0fc7e2d00a..e46d23ad2fa9 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1713,21 +1713,17 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, /** * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps * @disk: Target disk - * @update_driver_data: Callback to update driver data on the frozen disk * - * Helper function for low-level device drivers to check and (re) allocate and - * initialize a disk request queue zone bitmaps. This functions should normally - * be called within the disk ->revalidate method for blk-mq based drivers. + * Helper function for low-level device drivers to check, (re) allocate and + * initialize resources used for managing zoned disks. This function should + * normally be called by blk-mq based drivers when a zoned gendisk is probed + * and when the zone configuration of the gendisk changes (e.g. after a format). * Before calling this function, the device driver must already have set the * device zone size (chunk_sector limit) and the max zone append limit. * BIO based drivers can also use this function as long as the device queue * can be safely frozen. - * If the @update_driver_data callback function is not NULL, the callback is - * executed with the device request queue frozen after all zones have been - * checked. 
*/ -int blk_revalidate_disk_zones(struct gendisk *disk, - void (*update_driver_data)(struct gendisk *disk)) +int blk_revalidate_disk_zones(struct gendisk *disk) { struct request_queue *q = disk->queue; sector_t zone_sectors = q->limits.chunk_sectors; @@ -1794,13 +1790,10 @@ int blk_revalidate_disk_zones(struct gendisk *disk, * referencing the bitmaps). */ blk_mq_freeze_queue(q); - if (ret > 0) { + if (ret > 0) ret = disk_update_zone_resources(disk, &args); - if (update_driver_data) - update_driver_data(disk); - } else { + else pr_warn("%s: failed to revalidate zones\n", disk->disk_name); - } if (ret) disk_free_zone_resources(disk); blk_mq_unfreeze_queue(q); diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 0b2af273adaf..4ddd84752557 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -177,7 +177,7 @@ int null_register_zoned_dev(struct nullb *nullb) disk->disk_name, queue_emulates_zone_append(q) ? "emulated" : "native"); - return blk_revalidate_disk_zones(disk, NULL); + return blk_revalidate_disk_zones(disk); } void null_free_zoned_dev(struct nullb_device *dev) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index ab6af84e327c..851c78913de2 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -221,7 +221,7 @@ static int ublk_get_nr_zones(const struct ublk_device *ub) static int ublk_revalidate_disk_zones(struct ublk_device *ub) { - return blk_revalidate_disk_zones(ub->ub_disk, NULL); + return blk_revalidate_disk_zones(ub->ub_disk); } static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 42dea7601d87..c1af0a7d56c8 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1543,7 +1543,7 @@ static int virtblk_probe(struct virtio_device *vdev) */ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) { blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue); 
- err = blk_revalidate_disk_zones(vblk->disk, NULL); + err = blk_revalidate_disk_zones(vblk->disk); if (err) goto out_cleanup_disk; } diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 34769f3e3175..d17ae4486a6a 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -169,7 +169,7 @@ static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t) * our table for dm_blk_report_zones() to use directly. */ md->zone_revalidate_map = t; - ret = blk_revalidate_disk_zones(disk, NULL); + ret = blk_revalidate_disk_zones(disk); md->zone_revalidate_map = NULL; if (ret) { diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 943d72bdd794..c9955ecd1790 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2150,7 +2150,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, blk_mq_unfreeze_queue(ns->disk->queue); if (blk_queue_is_zoned(ns->queue)) { - ret = blk_revalidate_disk_zones(ns->disk, NULL); + ret = blk_revalidate_disk_zones(ns->disk); if (ret && !nvme_first_scan(ns->disk)) goto out; } diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index d0ead9858954..806036e48abe 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -572,7 +572,7 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) blk_queue_max_zone_append_sectors(q, 0); flags = memalloc_noio_save(); - ret = blk_revalidate_disk_zones(disk, NULL); + ret = blk_revalidate_disk_zones(disk); memalloc_noio_restore(flags); if (ret) { sdkp->zone_info = (struct zoned_disk_info){ }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 46613bf6a402..fbc6860b3622 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -336,8 +336,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sectors, sector_t nr_sectors); -int blk_revalidate_disk_zones(struct gendisk 
*disk, - void (*update_driver_data)(struct gendisk *disk)); +int blk_revalidate_disk_zones(struct gendisk *disk); /* * Independent access ranges: struct blk_independent_access_range describes -- cgit v1.2.3 From fde02699c242e88a71286677d27cc890a959b67f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:21 +0900 Subject: block: mq-deadline: Remove support for zone write locking With the block layer generic plugging of write operations for zoned block devices, mq-deadline, or any other scheduler, can only ever see at most one write operation per zone at any time. There are thus no sequentiality requirements for these writes and thus no need to tightly control the dispatching of write requests using zone write locking. Remove all the code that implements this control in the mq-deadline scheduler and remove advertising support for the ELEVATOR_F_ZBD_SEQ_WRITE elevator feature. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240408014128.205141-22-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 176 ++-------------------------------------------------- 1 file changed, 6 insertions(+), 170 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 02a916ba62ee..dce8d746b5bd 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -102,7 +102,6 @@ struct deadline_data { int prio_aging_expire; spinlock_t lock; - spinlock_t zone_lock; }; /* Maps an I/O priority class to a deadline scheduler priority. */ @@ -157,8 +156,7 @@ deadline_latter_request(struct request *rq) } /* - * Return the first request for which blk_rq_pos() >= @pos. For zoned devices, - * return the first request after the start of the zone containing @pos. + * Return the first request for which blk_rq_pos() >= @pos.
*/ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, enum dd_data_dir data_dir, sector_t pos) @@ -170,14 +168,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, return NULL; rq = rb_entry_rq(node); - /* - * A zoned write may have been requeued with a starting position that - * is below that of the most recently dispatched request. Hence, for - * zoned writes, start searching from the start of a zone. - */ - if (blk_rq_is_seq_zoned_write(rq)) - pos = round_down(pos, rq->q->limits.chunk_sectors); - while (node) { rq = rb_entry_rq(node); if (blk_rq_pos(rq) >= pos) { @@ -308,36 +298,6 @@ static inline bool deadline_check_fifo(struct dd_per_prio *per_prio, return time_is_before_eq_jiffies((unsigned long)rq->fifo_time); } -/* - * Check if rq has a sequential request preceding it. - */ -static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq) -{ - struct request *prev = deadline_earlier_request(rq); - - if (!prev) - return false; - - return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq); -} - -/* - * Skip all write requests that are sequential from @rq, even if we cross - * a zone boundary. - */ -static struct request *deadline_skip_seq_writes(struct deadline_data *dd, - struct request *rq) -{ - sector_t pos = blk_rq_pos(rq); - - do { - pos += blk_rq_sectors(rq); - rq = deadline_latter_request(rq); - } while (rq && blk_rq_pos(rq) == pos); - - return rq; -} - /* * For the specified data direction, return the next request to * dispatch using arrival ordered lists. 
@@ -346,40 +306,10 @@ static struct request * deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { - struct request *rq, *rb_rq, *next; - unsigned long flags; - if (list_empty(&per_prio->fifo_list[data_dir])) return NULL; - rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. For some HDDs, breaking a sequential - * write stream can lead to lower throughput, so make sure to preserve - * sequential write streams, even if that stream crosses into the next - * zones and these zones are unlocked. - */ - spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE], - queuelist) { - /* Check whether a prior request exists for the same zone. */ - rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq)); - if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq)) - rq = rb_rq; - if (blk_req_can_dispatch_to_zone(rq) && - (blk_queue_nonrot(rq->q) || - !deadline_is_seq_write(dd, rq))) - goto out; - } - rq = NULL; -out: - spin_unlock_irqrestore(&dd->zone_lock, flags); - - return rq; + return rq_entry_fifo(per_prio->fifo_list[data_dir].next); } /* @@ -390,36 +320,8 @@ static struct request * deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { - struct request *rq; - unsigned long flags; - - rq = deadline_from_pos(per_prio, data_dir, - per_prio->latest_pos[data_dir]); - if (!rq) - return NULL; - - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. 
For some HDDs, breaking a sequential - * write stream can lead to lower throughput, so make sure to preserve - * sequential write streams, even if that stream crosses into the next - * zones and these zones are unlocked. - */ - spin_lock_irqsave(&dd->zone_lock, flags); - while (rq) { - if (blk_req_can_dispatch_to_zone(rq)) - break; - if (blk_queue_nonrot(rq->q)) - rq = deadline_latter_request(rq); - else - rq = deadline_skip_seq_writes(dd, rq); - } - spin_unlock_irqrestore(&dd->zone_lock, flags); - - return rq; + return deadline_from_pos(per_prio, data_dir, + per_prio->latest_pos[data_dir]); } /* @@ -525,10 +427,6 @@ dispatch_find_request: rq = next_rq; } - /* - * For a zoned block device, if we only have writes queued and none of - * them can be dispatched, rq will be NULL. - */ if (!rq) return NULL; @@ -549,10 +447,6 @@ done: prio = ioprio_class_to_prio[ioprio_class]; dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); dd->per_prio[prio].stats.dispatched++; - /* - * If the request needs its target zone locked, do it. - */ - blk_req_zone_write_lock(rq); rq->rq_flags |= RQF_STARTED; return rq; } @@ -722,7 +616,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->fifo_batch = fifo_batch; dd->prio_aging_expire = prio_aging_expire; spin_lock_init(&dd->lock); - spin_lock_init(&dd->zone_lock); /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); @@ -804,12 +697,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, lockdep_assert_held(&dd->lock); - /* - * This may be a requeue of a write request that has locked its - * target zone. If it is the case, this releases the zone lock. 
- */ - blk_req_zone_write_unlock(rq); - prio = ioprio_class_to_prio[ioprio_class]; per_prio = &dd->per_prio[prio]; if (!rq->elv.priv[0]) { @@ -841,18 +728,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; insert_before = &per_prio->fifo_list[data_dir]; -#ifdef CONFIG_BLK_DEV_ZONED - /* - * Insert zoned writes such that requests are sorted by - * position per zone. - */ - if (blk_rq_is_seq_zoned_write(rq)) { - struct request *rq2 = deadline_latter_request(rq); - - if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq)) - insert_before = &rq2->queuelist; - } -#endif list_add_tail(&rq->queuelist, insert_before); } } @@ -887,33 +762,8 @@ static void dd_prepare_request(struct request *rq) rq->elv.priv[0] = NULL; } -static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx) -{ - struct deadline_data *dd = hctx->queue->elevator->elevator_data; - enum dd_prio p; - - for (p = 0; p <= DD_PRIO_MAX; p++) - if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE])) - return true; - - return false; -} - /* * Callback from inside blk_mq_free_request(). - * - * For zoned block devices, write unlock the target zone of - * completed write requests. Do this while holding the zone lock - * spinlock so that the zone is never unlocked while deadline_fifo_request() - * or deadline_next_request() are executing. This function is called for - * all requests, whether or not these requests complete successfully. - * - * For a zoned block device, __dd_dispatch_request() may have stopped - * dispatching requests if all the queued requests are write requests directed - * at zones that are already locked due to on-going write requests. To ensure - * write request dispatch progress in this case, mark the queue as needing a - * restart to ensure that the queue is run again after completion of the - * request and zones being unlocked. 
*/ static void dd_finish_request(struct request *rq) { @@ -928,21 +778,8 @@ static void dd_finish_request(struct request *rq) * called dd_insert_requests(). Skip requests that bypassed I/O * scheduling. See also blk_mq_request_bypass_insert(). */ - if (!rq->elv.priv[0]) - return; - - atomic_inc(&per_prio->stats.completed); - - if (blk_queue_is_zoned(q)) { - unsigned long flags; - - spin_lock_irqsave(&dd->zone_lock, flags); - blk_req_zone_write_unlock(rq); - spin_unlock_irqrestore(&dd->zone_lock, flags); - - if (dd_has_write_work(rq->mq_hctx)) - blk_mq_sched_mark_restart_hctx(rq->mq_hctx); - } + if (rq->elv.priv[0]) + atomic_inc(&per_prio->stats.completed); } static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) @@ -1266,7 +1103,6 @@ static struct elevator_type mq_deadline = { .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", .elevator_alias = "deadline", - .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); -- cgit v1.2.3 From e4eb37cc0f3ed8971c50dddfbeb35a799e5b504e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:22 +0900 Subject: block: Remove elevator required features The only elevator feature ever implemented is ELEVATOR_F_ZBD_SEQ_WRITE for signaling that a scheduler implements zone write locking to tightly control the dispatching order of write operations to zoned block devices. With the removal of zone write locking support in mq-deadline and the reliance of all block device drivers on the block layer zone write plugging to control ordering of write operations to zones, the elevator feature ELEVATOR_F_ZBD_SEQ_WRITE is completely unused. Remove it, and also remove the now unused code for filtering the possible schedulers for a block device based on required features. 
Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240408014128.205141-23-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-settings.c | 16 ---------------- block/elevator.c | 46 +++++----------------------------------------- block/elevator.h | 1 - include/linux/blkdev.h | 10 ---------- 4 files changed, 5 insertions(+), 68 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index c0197e1e7485..715f4b6356c4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -1052,22 +1052,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) } EXPORT_SYMBOL_GPL(blk_queue_write_cache); -/** - * blk_queue_required_elevator_features - Set a queue required elevator features - * @q: the request queue for the target device - * @features: Required elevator features OR'ed together - * - * Tell the block layer that for the device controlled through @q, only the - * only elevators that can be used are those that implement at least the set of - * features specified by @features. - */ -void blk_queue_required_elevator_features(struct request_queue *q, - unsigned int features) -{ - q->required_elevator_features = features; -} -EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features); - /** * blk_queue_can_use_dma_map_merging - configure queue for merging segments. 
* @q: the request queue for the device diff --git a/block/elevator.c b/block/elevator.c index 5ff093cb3cf8..f64ebd726e58 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,13 +83,6 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static inline bool elv_support_features(struct request_queue *q, - const struct elevator_type *e) -{ - return (q->required_elevator_features & e->elevator_features) == - q->required_elevator_features; -} - /** * elevator_match - Check whether @e's name or alias matches @name * @e: Scheduler to test @@ -120,7 +113,7 @@ static struct elevator_type *elevator_find_get(struct request_queue *q, spin_lock(&elv_list_lock); e = __elevator_find(name); - if (e && (!elv_support_features(q, e) || !elevator_tryget(e))) + if (e && (!elevator_tryget(e))) e = NULL; spin_unlock(&elv_list_lock); return e; @@ -580,34 +573,8 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) } /* - * Get the first elevator providing the features required by the request queue. - * Default to "none" if no matching elevator is found. - */ -static struct elevator_type *elevator_get_by_features(struct request_queue *q) -{ - struct elevator_type *e, *found = NULL; - - spin_lock(&elv_list_lock); - - list_for_each_entry(e, &elv_list, list) { - if (elv_support_features(q, e)) { - found = e; - break; - } - } - - if (found && !elevator_tryget(found)) - found = NULL; - - spin_unlock(&elv_list_lock); - return found; -} - -/* - * For a device queue that has no required features, use the default elevator - * settings. Otherwise, use the first elevator available matching the required - * features. If no suitable elevator is find or if the chosen elevator - * initialization fails, fall back to the "none" elevator (no elevator). + * Use the default elevator settings. If the chosen elevator initialization + * fails, fall back to the "none" elevator (no elevator). 
*/ void elevator_init_mq(struct request_queue *q) { @@ -622,10 +589,7 @@ void elevator_init_mq(struct request_queue *q) if (unlikely(q->elevator)) return; - if (!q->required_elevator_features) - e = elevator_get_default(q); - else - e = elevator_get_by_features(q); + e = elevator_get_default(q); if (!e) return; @@ -781,7 +745,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) list_for_each_entry(e, &elv_list, list) { if (e == cur) len += sprintf(name+len, "[%s] ", e->elevator_name); - else if (elv_support_features(q, e)) + else len += sprintf(name+len, "%s ", e->elevator_name); } spin_unlock(&elv_list_lock); diff --git a/block/elevator.h b/block/elevator.h index 7ca3d7b6ed82..e9a050a96e53 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -74,7 +74,6 @@ struct elevator_type struct elv_fs_entry *elevator_attrs; const char *elevator_name; const char *elevator_alias; - const unsigned int elevator_features; struct module *elevator_owner; #ifdef CONFIG_BLK_DEBUG_FS const struct blk_mq_debugfs_attr *queue_debugfs_attrs; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fbc6860b3622..017e9d064177 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -453,8 +453,6 @@ struct request_queue { atomic_t nr_active_requests_shared_tags; - unsigned int required_elevator_features; - struct blk_mq_tags *sched_shared_tags; struct list_head icq_list; @@ -958,14 +956,6 @@ disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); void disk_set_independent_access_ranges(struct gendisk *disk, struct blk_independent_access_ranges *iars); -/* - * Elevator features for blk_queue_required_elevator_features: - */ -/* Supports zoned block devices sequential write constraint */ -#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) - -extern void blk_queue_required_elevator_features(struct request_queue *q, - unsigned int features); extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, struct device *dev); -- cgit 
v1.2.3 From bca150f0d4edbf02002efa3309bb8e8c9d6596c9 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:23 +0900 Subject: block: Do not check zone type in blk_check_zone_append() Zone append operations are only allowed to target sequential write required zones. blk_check_zone_append() uses bio_zone_is_seq() to check this. However, this check is not necessary because: 1) For NVMe ZNS namespace devices, only sequential write required zones exist, making the zone type check useless. 2) For null_blk, the driver will fail the request anyway, thus notifying the user that a conventional zone was targeted. 3) For all other zoned devices, zone append is now emulated using zone write plugging, which checks that a zone append operation does not target a conventional zone. In preparation for the removal of zone write locking and its conventional zone bitmap (used by bio_zone_is_seq()), remove the bio_zone_is_seq() call from blk_check_zone_append(). Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20240408014128.205141-24-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 3bf28149e104..e1a5344c2257 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -589,8 +589,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ - if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) || - !bio_zone_is_seq(bio)) + if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector)) return BLK_STS_IOERR; /* -- cgit v1.2.3 From d9f1439a30d607f7bd06494ea2a63018b7d46380 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:24 +0900 Subject: block: Move zone related debugfs attribute to blk-zoned.c block/blk-mq-debugfs-zone.c contains a single debugfs attribute function. Defining this outside of block/blk-zoned.c does not really help in any way, so move this zone related debugfs attribute to block/blk-zoned.c and delete block/blk-mq-debugfs-zone.c. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20240408014128.205141-25-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/Kconfig | 4 ---- block/Makefile | 1 - block/blk-mq-debugfs-zoned.c | 22 ---------------------- block/blk-mq-debugfs.h | 2 +- block/blk-zoned.c | 20 ++++++++++++++++++++ 5 files changed, 21 insertions(+), 28 deletions(-) delete mode 100644 block/blk-mq-debugfs-zoned.c (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 1de4682d48cc..9f647149fbee 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -198,10 +198,6 @@ config BLK_DEBUG_FS Unless you are building a kernel for a tiny system, you should say Y here. -config BLK_DEBUG_FS_ZONED - bool - default BLK_DEBUG_FS && BLK_DEV_ZONED - config BLK_SED_OPAL bool "Logic for interfacing with Opal enabled SEDs" depends on KEYS diff --git a/block/Makefile b/block/Makefile index 46ada9dc8bbf..168150b9c510 100644 --- a/block/Makefile +++ b/block/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o -obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c deleted file mode 100644 index a77b099c34b7..000000000000 --- a/block/blk-mq-debugfs-zoned.c +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2017 Western Digital Corporation or its affiliates. 
- */ - -#include -#include "blk-mq-debugfs.h" - -int queue_zone_wlock_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - unsigned int i; - - if (!q->disk->seq_zones_wlock) - return 0; - - for (i = 0; i < q->disk->nr_zones; i++) - if (test_bit(i, q->disk->seq_zones_wlock)) - seq_printf(m, "%u\n", i); - - return 0; -} diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 9c7d4b6117d4..3ebe2c29b624 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -83,7 +83,7 @@ static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) } #endif -#ifdef CONFIG_BLK_DEBUG_FS_ZONED +#if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS) int queue_zone_wlock_show(void *data, struct seq_file *m); #else static inline int queue_zone_wlock_show(void *data, struct seq_file *m) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index e46d23ad2fa9..a06d7f7a54c7 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -23,6 +23,7 @@ #include "blk.h" #include "blk-mq-sched.h" +#include "blk-mq-debugfs.h" #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name static const char *const zone_cond_name[] = { @@ -1804,3 +1805,22 @@ int blk_revalidate_disk_zones(struct gendisk *disk) return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); + +#ifdef CONFIG_BLK_DEBUG_FS + +int queue_zone_wlock_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + unsigned int i; + + if (!q->disk->seq_zones_wlock) + return 0; + + for (i = 0; i < q->disk->nr_zones; i++) + if (test_bit(i, q->disk->seq_zones_wlock)) + seq_printf(m, "%u\n", i); + + return 0; +} + +#endif -- cgit v1.2.3 From a98b05b02f0f1f9f4a504564070af208b70214d0 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:25 +0900 Subject: block: Replace zone_wlock debugfs entry with zone_wplugs entry In preparation to completely remove zone write locking, replace the "zone_wlock" mq-debugfs entry that was listing zones that are 
write-locked with the zone_wplugs entry which lists the zones that currently have a write plug allocated. The write plug information provided is: the zone number, the zone write plug flags, the zone write plug write pointer offset and the number of BIOs currently waiting for execution in the zone write plug BIO list. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240408014128.205141-26-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 +- block/blk-mq-debugfs.h | 4 ++-- block/blk-zoned.c | 31 ++++++++++++++++++++++++------- 3 files changed, 27 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 94668e72ab09..ca1f2b9422d5 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -160,7 +160,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, { "pm_only", 0600, queue_pm_only_show, NULL }, { "state", 0600, queue_state_show, queue_state_write }, - { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, + { "zone_wplugs", 0400, queue_zone_wplugs_show, NULL }, { }, }; diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 3ebe2c29b624..c80e453e3014 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -84,9 +84,9 @@ static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) #endif #if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS) -int queue_zone_wlock_show(void *data, struct seq_file *m); +int queue_zone_wplugs_show(void *data, struct seq_file *m); #else -static inline int queue_zone_wlock_show(void *data, struct seq_file *m) +static inline int queue_zone_wplugs_show(void *data, struct seq_file *m) { return 0; } diff --git 
a/block/blk-zoned.c b/block/blk-zoned.c index a06d7f7a54c7..44699b431ad0 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1808,17 +1808,34 @@ EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); #ifdef CONFIG_BLK_DEBUG_FS -int queue_zone_wlock_show(void *data, struct seq_file *m) +int queue_zone_wplugs_show(void *data, struct seq_file *m) { struct request_queue *q = data; - unsigned int i; + struct gendisk *disk = q->disk; + struct blk_zone_wplug *zwplug; + unsigned int zwp_wp_offset, zwp_flags; + unsigned int zwp_zone_no, zwp_ref; + unsigned int zwp_bio_list_size, i; + unsigned long flags; - if (!q->disk->seq_zones_wlock) - return 0; + rcu_read_lock(); + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { + hlist_for_each_entry_rcu(zwplug, + &disk->zone_wplugs_hash[i], node) { + spin_lock_irqsave(&zwplug->lock, flags); + zwp_zone_no = zwplug->zone_no; + zwp_flags = zwplug->flags; + zwp_ref = atomic_read(&zwplug->ref); + zwp_wp_offset = zwplug->wp_offset; + zwp_bio_list_size = bio_list_size(&zwplug->bio_list); + spin_unlock_irqrestore(&zwplug->lock, flags); - for (i = 0; i < q->disk->nr_zones; i++) - if (test_bit(i, q->disk->seq_zones_wlock)) - seq_printf(m, "%u\n", i); + seq_printf(m, "%u 0x%x %u %u %u\n", + zwp_zone_no, zwp_flags, zwp_ref, + zwp_wp_offset, zwp_bio_list_size); + } + } + rcu_read_unlock(); return 0; } -- cgit v1.2.3 From 02ccd7c360b1692da164842f211d41fab7d83adc Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:26 +0900 Subject: block: Remove zone write locking Zone write locking is now unused and replaced with zone write plugging. Remove all code that was implementing zone write locking, that is, the various helper functions controlling request zone write locking and the gendisk attached zone bitmaps. Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20240408014128.205141-27-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/blk-zoned.c | 66 ++----------------------------------- include/linux/blk-mq.h | 83 ----------------------------------------------- include/linux/blk_types.h | 1 - include/linux/blkdev.h | 35 +++----------------- 5 files changed, 7 insertions(+), 179 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index ca1f2b9422d5..770c0c2b72fa 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -256,7 +256,6 @@ static const char *const rqf_name[] = { RQF_NAME(HASHED), RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), - RQF_NAME(ZONE_WRITE_LOCKED), RQF_NAME(TIMED_OUT), RQF_NAME(RESV), }; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 44699b431ad0..3befebe6b319 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -115,52 +115,6 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); -/* - * Return true if a request is a write requests that needs zone write locking. 
- */ -bool blk_req_needs_zone_write_lock(struct request *rq) -{ - if (!rq->q->disk->seq_zones_wlock) - return false; - - return blk_rq_is_seq_zoned_write(rq); -} -EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); - -bool blk_req_zone_write_trylock(struct request *rq) -{ - unsigned int zno = blk_rq_zone_no(rq); - - if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock)) - return false; - - WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); - rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; - - return true; -} -EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock); - -void __blk_req_zone_write_lock(struct request *rq) -{ - if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), - rq->q->disk->seq_zones_wlock))) - return; - - WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); - rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; -} -EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); - -void __blk_req_zone_write_unlock(struct request *rq) -{ - rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; - if (rq->q->disk->seq_zones_wlock) - WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), - rq->q->disk->seq_zones_wlock)); -} -EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); - /** * bdev_nr_zones - Get number of zones * @bdev: Target device @@ -1506,9 +1460,6 @@ void disk_free_zone_resources(struct gendisk *disk) kfree(disk->conv_zones_bitmap); disk->conv_zones_bitmap = NULL; - kfree(disk->seq_zones_wlock); - disk->seq_zones_wlock = NULL; - disk->zone_capacity = 0; disk->nr_zones = 0; } @@ -1556,7 +1507,6 @@ static int disk_revalidate_zone_resources(struct gendisk *disk, struct blk_revalidate_zone_args { struct gendisk *disk; unsigned long *conv_zones_bitmap; - unsigned long *seq_zones_wlock; unsigned int nr_zones; unsigned int zone_capacity; sector_t sector; @@ -1574,7 +1524,6 @@ static int disk_update_zone_resources(struct gendisk *disk, disk->nr_zones = args->nr_zones; disk->zone_capacity = args->zone_capacity; - swap(disk->seq_zones_wlock, args->seq_zones_wlock); swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); 
/* @@ -1664,13 +1613,6 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, set_bit(idx, args->conv_zones_bitmap); break; case BLK_ZONE_TYPE_SEQWRITE_REQ: - if (!args->seq_zones_wlock) { - args->seq_zones_wlock = - blk_alloc_zone_bitmap(q->node, args->nr_zones); - if (!args->seq_zones_wlock) - return -ENOMEM; - } - /* * Remember the capacity of the first sequential zone and check * if it is constant for all zones. @@ -1712,7 +1654,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, } /** - * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps + * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs * @disk: Target disk * * Helper function for low-level device drivers to check, (re) allocate and @@ -1786,9 +1728,8 @@ int blk_revalidate_disk_zones(struct gendisk *disk) } /* - * Install the new bitmaps and update nr_zones only once the queue is - * stopped and all I/Os are completed (i.e. a scheduler is not - * referencing the bitmaps). + * Set the new disk zone parameters only once the queue is frozen and + * all I/Os are completed. */ blk_mq_freeze_queue(q); if (ret > 0) @@ -1799,7 +1740,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk) disk_free_zone_resources(disk); blk_mq_unfreeze_queue(q); - kfree(args.seq_zones_wlock); kfree(args.conv_zones_bitmap); return ret; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 60090c8366fb..89ba6b16fe8b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -54,8 +54,6 @@ typedef __u32 __bitwise req_flags_t; /* Look at ->special_vec for the actual data payload instead of the bio chain. */ #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) -/* The per-zone write lock is held for this request */ -#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) /* The request completion needs to be signaled to zone write pluging. 
*/ #define RQF_ZONE_WRITE_PLUGGING ((__force req_flags_t)(1 << 20)) /* ->timeout has been called, don't expire again */ @@ -1152,85 +1150,4 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, } void blk_dump_rq_flags(struct request *, char *); -#ifdef CONFIG_BLK_DEV_ZONED -static inline unsigned int blk_rq_zone_no(struct request *rq) -{ - return disk_zone_no(rq->q->disk, blk_rq_pos(rq)); -} - -static inline unsigned int blk_rq_zone_is_seq(struct request *rq) -{ - return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq)); -} - -/** - * blk_rq_is_seq_zoned_write() - Check if @rq requires write serialization. - * @rq: Request to examine. - * - * Note: REQ_OP_ZONE_APPEND requests do not require serialization. - */ -static inline bool blk_rq_is_seq_zoned_write(struct request *rq) -{ - return op_needs_zoned_write_locking(req_op(rq)) && - blk_rq_zone_is_seq(rq); -} - -bool blk_req_needs_zone_write_lock(struct request *rq); -bool blk_req_zone_write_trylock(struct request *rq); -void __blk_req_zone_write_lock(struct request *rq); -void __blk_req_zone_write_unlock(struct request *rq); - -static inline void blk_req_zone_write_lock(struct request *rq) -{ - if (blk_req_needs_zone_write_lock(rq)) - __blk_req_zone_write_lock(rq); -} - -static inline void blk_req_zone_write_unlock(struct request *rq) -{ - if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) - __blk_req_zone_write_unlock(rq); -} - -static inline bool blk_req_zone_is_write_locked(struct request *rq) -{ - return rq->q->disk->seq_zones_wlock && - test_bit(blk_rq_zone_no(rq), rq->q->disk->seq_zones_wlock); -} - -static inline bool blk_req_can_dispatch_to_zone(struct request *rq) -{ - if (!blk_req_needs_zone_write_lock(rq)) - return true; - return !blk_req_zone_is_write_locked(rq); -} -#else /* CONFIG_BLK_DEV_ZONED */ -static inline bool blk_rq_is_seq_zoned_write(struct request *rq) -{ - return false; -} - -static inline bool blk_req_needs_zone_write_lock(struct request *rq) -{ - return false; -} - -static 
inline void blk_req_zone_write_lock(struct request *rq) -{ -} - -static inline void blk_req_zone_write_unlock(struct request *rq) -{ -} -static inline bool blk_req_zone_is_write_locked(struct request *rq) -{ - return false; -} - -static inline bool blk_req_can_dispatch_to_zone(struct request *rq) -{ - return true; -} -#endif /* CONFIG_BLK_DEV_ZONED */ - #endif /* BLK_MQ_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index ffe0c112b128..5751292fee6a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -297,7 +297,6 @@ enum { BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */ BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_REMAPPED, - BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */ BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */ BIO_FLAG_LAST diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 017e9d064177..7e8a68805324 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -177,23 +177,14 @@ struct gendisk { #ifdef CONFIG_BLK_DEV_ZONED /* - * Zoned block device information for request dispatch control. - * nr_zones is the total number of zones of the device. This is always - * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones - * bits which indicates if a zone is conventional (bit set) or - * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones - * bits which indicates if a zone is write locked, that is, if a write - * request targeting the zone was dispatched. - * - * Reads of this information must be protected with blk_queue_enter() / - * blk_queue_exit(). Modifying this information is only allowed while - * no requests are being processed. See also blk_mq_freeze_queue() and - * blk_mq_unfreeze_queue(). + * Zoned block device information. Reads of this information must be + * protected with blk_queue_enter() / blk_queue_exit(). 
Modifying this + * information is only allowed while no requests are being processed. + * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue(). */ unsigned int nr_zones; unsigned int zone_capacity; unsigned long *conv_zones_bitmap; - unsigned long *seq_zones_wlock; unsigned int zone_wplugs_hash_bits; spinlock_t zone_wplugs_lock; struct mempool_s *zone_wplugs_pool; @@ -635,15 +626,6 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) return sector >> ilog2(disk->queue->limits.chunk_sectors); } -static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) -{ - if (!blk_queue_is_zoned(disk->queue)) - return false; - if (!disk->conv_zones_bitmap) - return true; - return !test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); -} - static inline void disk_set_max_open_zones(struct gendisk *disk, unsigned int max_open_zones) { @@ -677,10 +659,6 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; } -static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) -{ - return false; -} static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) { return 0; @@ -869,11 +847,6 @@ static inline bool bio_straddles_zones(struct bio *bio) disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1); } -static inline unsigned int bio_zone_is_seq(struct bio *bio) -{ - return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); -} - /* * Return how much of the chunk is left to be used for I/O at a given offset. */ -- cgit v1.2.3 From 97abee507b4b71d43dc1c1d3de4739db2c86c0ac Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:27 +0900 Subject: block: Do not force select mq-deadline with CONFIG_BLK_DEV_ZONED Now that zone block device write ordering control does not depend anymore on mq-deadline and zone write locking, there is no need to force select the mq-deadline scheduler when CONFIG_BLK_DEV_ZONED is enabled. 
Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240408014128.205141-28-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 9f647149fbee..d47398ae9824 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -100,7 +100,6 @@ config BLK_DEV_WRITE_MOUNTED config BLK_DEV_ZONED bool "Zoned block device support" - select MQ_IOSCHED_DEADLINE help Block layer zoned block device support. This option enables support for ZAC/ZBC/ZNS host-managed and host-aware zoned block -- cgit v1.2.3 From 99a9476b27e89525cef653b91e542baf61f105d2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:28 +0900 Subject: block: Do not special-case plugging of zone write operations With the block layer zone write plugging being automatically done for any write operation to a zone of a zoned block device, a regular request plugging handled through current->plug can only ever see at most a single write request per zone. In such case, any potential reordering of the plugged requests will be harmless. We can thus remove the special casing for write operations to zones and have these requests plugged as well. This allows removing the function blk_mq_plug and instead directly using current->plug where needed. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20240408014128.205141-29-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ------ block/blk-merge.c | 3 +-- block/blk-mq.c | 7 +------ block/blk-mq.h | 31 ------------------------------- include/linux/blkdev.h | 12 ------------ 5 files changed, 2 insertions(+), 57 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index e1a5344c2257..47400a4fe851 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -907,12 +907,6 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return 0; - /* - * As the requests that require a zone lock are not plugged in the - * first place, directly accessing the plug instead of using - * blk_mq_plug() should not have any consequences during flushing for - * zoned devices. - */ blk_flush_plug(current->plug, false); /* diff --git a/block/blk-merge.c b/block/blk-merge.c index 7f8a808b74c1..f64115d72f3d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -1113,10 +1113,9 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { - struct blk_plug *plug; + struct blk_plug *plug = current->plug; struct request *rq; - plug = blk_mq_plug(bio); if (!plug || rq_list_empty(plug->mq_list)) return false; diff --git a/block/blk-mq.c b/block/blk-mq.c index 9f2d9970eeba..434d45219e23 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1330,11 +1330,6 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head) blk_account_io_start(rq); - /* - * As plugging can be enabled for passthrough requests on a zoned - * device, directly accessing the plug instead of using blk_mq_plug() - * should not have any consequences. 
- */ if (current->plug && !at_head) { blk_add_rq_to_plug(current->plug, rq); return; @@ -2932,7 +2927,7 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - struct blk_plug *plug = blk_mq_plug(bio); + struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; unsigned int nr_segs = 1; diff --git a/block/blk-mq.h b/block/blk-mq.h index f75a9ecfebde..260beea8e332 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -365,37 +365,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) qmap->mq_map[cpu] = 0; } -/* - * blk_mq_plug() - Get caller context plug - * @bio : the bio being submitted by the caller context - * - * Plugging, by design, may delay the insertion of BIOs into the elevator in - * order to increase BIO merging opportunities. This however can cause BIO - * insertion order to change from the order in which submit_bio() is being - * executed in the case of multiple contexts concurrently issuing BIOs to a - * device, even if these context are synchronized to tightly control BIO issuing - * order. While this is not a problem with regular block devices, this ordering - * change can cause write BIO failures with zoned block devices as these - * require sequential write patterns to zones. Prevent this from happening by - * ignoring the plug state of a BIO issuing context if it is for a zoned block - * device and the BIO to plug is a write operation. 
- * - * Return current->plug if the bio can be plugged and NULL otherwise - */ -static inline struct blk_plug *blk_mq_plug( struct bio *bio) -{ - /* Zoned block device write operation case: do not plug the BIO */ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && - bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio))) - return NULL; - - /* - * For regular block devices or read operations, use the context plug - * which may be NULL if blk_start_plug() was not executed. - */ - return current->plug; -} - /* Free all requests on the list */ static inline void blk_mq_free_requests(struct list_head *list) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7e8a68805324..1ce8ba08e318 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1299,18 +1299,6 @@ static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec) return disk_zone_no(bdev->bd_disk, sec); } -/* Whether write serialization is required for @op on zoned devices. */ -static inline bool op_needs_zoned_write_locking(enum req_op op) -{ - return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES; -} - -static inline bool bdev_op_is_zoned_write(struct block_device *bdev, - enum req_op op) -{ - return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op); -} - static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From 8294d49adbb06d7df8cfaca5a4f4eb9064a91b90 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 19 Apr 2024 10:56:10 +0800 Subject: block/mq-deadline: Remove some unused functions These functions are defined in the mq-deadline.c file, but not called elsewhere, so delete these unused functions. block/mq-deadline.c:134:1: warning: unused function 'deadline_earlier_request'. block/mq-deadline.c:148:1: warning: unused function 'deadline_latter_request'. 
Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=8803 Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/20240419025610.34298-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Jens Axboe --- block/mq-deadline.c | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index dce8d746b5bd..94eede4fb9eb 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -127,34 +127,6 @@ static u8 dd_rq_ioclass(struct request *rq) return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); } -/* - * get the request before `rq' in sector-sorted order - */ -static inline struct request * -deadline_earlier_request(struct request *rq) -{ - struct rb_node *node = rb_prev(&rq->rb_node); - - if (node) - return rb_entry_rq(node); - - return NULL; -} - -/* - * get the request after `rq' in sector-sorted order - */ -static inline struct request * -deadline_latter_request(struct request *rq) -{ - struct rb_node *node = rb_next(&rq->rb_node); - - if (node) - return rb_entry_rq(node); - - return NULL; -} - /* * Return the first request for which blk_rq_pos() >= @pos. */ -- cgit v1.2.3 From a8f59e5a5deaf3e99a8b7252e96cee9af67858a9 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 20 Apr 2024 16:58:11 +0900 Subject: block: use a per disk workqueue for zone write plugging A zone write plug BIO work function blk_zone_wplug_bio_work() calls submit_bio_noacct_nocheck() to execute the next unplugged BIO. This function may block. So executing zone plugs BIO works using the block layer global kblockd workqueue can potentially lead to performance or latency issues as the number of concurrent work for a workqueue is limited to WQ_DFL_ACTIVE (256). 1) For a system with a large number of zoned disks, issuing write requests to otherwise unused zones may be delayed waiting for a work thread to become available.
2) Requeue operations which use kblockd but are independent of zone write plugging may also end up being delayed. To avoid these potential performance issues, create a workqueue per zoned device to execute zone plugs BIO work. The workqueue max active parameter is set to the maximum number of zone write plugs allocated with the zone write plug mempool. This limit is equal to the maximum number of open zones of the disk and defaults to 128 for disks that do not have a limit on the number of open zones. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240420075811.1276893-3-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 32 ++++++++++++++++++++++++-------- include/linux/blkdev.h | 1 + 2 files changed, 25 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 3befebe6b319..3a796420f240 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1129,7 +1129,7 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk, /* Schedule submission of the next plugged BIO if we have one. */ if (!bio_list_empty(&zwplug->bio_list)) { spin_unlock_irqrestore(&zwplug->lock, flags); - kblockd_schedule_work(&zwplug->bio_work); + queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); return; } @@ -1332,7 +1332,7 @@ static void disk_zone_wplug_handle_error(struct gendisk *disk, /* Restart BIO submission if we still have any BIO left.
*/ if (!bio_list_empty(&zwplug->bio_list)) { WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); - kblockd_schedule_work(&zwplug->bio_work); + queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); goto unlock; } @@ -1409,14 +1409,25 @@ static int disk_alloc_zone_resources(struct gendisk *disk, disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size, sizeof(struct blk_zone_wplug)); - if (!disk->zone_wplugs_pool) { - kfree(disk->zone_wplugs_hash); - disk->zone_wplugs_hash = NULL; - disk->zone_wplugs_hash_bits = 0; - return -ENOMEM; - } + if (!disk->zone_wplugs_pool) + goto free_hash; + + disk->zone_wplugs_wq = + alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI, + pool_size, disk->disk_name); + if (!disk->zone_wplugs_wq) + goto destroy_pool; return 0; + +destroy_pool: + mempool_destroy(disk->zone_wplugs_pool); + disk->zone_wplugs_pool = NULL; +free_hash: + kfree(disk->zone_wplugs_hash); + disk->zone_wplugs_hash = NULL; + disk->zone_wplugs_hash_bits = 0; + return -ENOMEM; } static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) @@ -1447,6 +1458,11 @@ void disk_free_zone_resources(struct gendisk *disk) { cancel_work_sync(&disk->zone_wplugs_work); + if (disk->zone_wplugs_wq) { + destroy_workqueue(disk->zone_wplugs_wq); + disk->zone_wplugs_wq = NULL; + } + disk_destroy_zone_wplugs_hash_table(disk); /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1ce8ba08e318..d22eb41a05b8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -191,6 +191,7 @@ struct gendisk { struct hlist_head *zone_wplugs_hash; struct list_head zone_wplugs_err_list; struct work_struct zone_wplugs_work; + struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) -- cgit v1.2.3 From 57787fa42f9fc12fe18938eefc2acb2dc2bde9ae Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 25 Apr 2024 05:02:39 -0700 Subject: block: check if zone_wplugs_hash exists in queue_zone_wplugs_show Changhui 
reported a kernel crash when running this simple shell reproducer: # cd /sys/kernel/debug/block && find . -type f -exec grep -aH . {} \; The above results in a NULL pointer dereference if a device does not have a zone_wplugs_hash allocated. To fix this, return early if we don't have a zone_wplugs_hash. Reported-by: Changhui Zhong Fixes: a98b05b02f0f ("block: Replace zone_wlock debugfs entry with zone_wplugs entry") Signed-off-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/e5fec079dfca448cc21c425cfa5d7b291f5faa67.1714046443.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- block/blk-zoned.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 3a796420f240..bad68277c0b2 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1774,6 +1774,9 @@ int queue_zone_wplugs_show(void *data, struct seq_file *m) unsigned int zwp_bio_list_size, i; unsigned long flags; + if (!disk->zone_wplugs_hash) + return 0; + rcu_read_lock(); for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { hlist_for_each_entry_rcu(zwplug, -- cgit v1.2.3 From 6b7593b5fb9eb73be92f78a1abfa502f05ff5e15 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 1 May 2024 20:08:55 +0900 Subject: block: Exclude conventional zones when faking max open limit For a device that has no limits for the maximum number of open and active zones, we default to using the number of zones, limited to BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE (128), for the maximum number of open zones indicated to the user. However, for a device that has conventional zones and less zones than BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, we should not account conventional zones and set the limit to the number of sequential write required zones. Furthermore, for cases where the limit is equal to the number of sequential write required zones, we can advertize a limit of 0 to indicate "no limits". 
Fix this by moving the zone write plug mempool resizing from disk_revalidate_zone_resources() to disk_update_zone_resources() where we can safely compute the number of conventional zones and update the limits. Fixes: 843283e96e5a ("block: Fake max open zones limit when there is no limit") Reported-by: Shin'ichiro Kawasaki Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240501110907.96950-3-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index bad68277c0b2..731d1abb80f6 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1513,10 +1513,6 @@ static int disk_revalidate_zone_resources(struct gendisk *disk, if (!disk->zone_wplugs_hash) return disk_alloc_zone_resources(disk, pool_size); - /* Resize the zone write plug memory pool if needed. 
*/ - if (disk->zone_wplugs_pool->min_nr != pool_size) - return mempool_resize(disk->zone_wplugs_pool, pool_size); - return 0; } @@ -1536,11 +1532,24 @@ static int disk_update_zone_resources(struct gendisk *disk, struct blk_revalidate_zone_args *args) { struct request_queue *q = disk->queue; + unsigned int nr_seq_zones, nr_conv_zones = 0; + unsigned int pool_size; struct queue_limits lim; disk->nr_zones = args->nr_zones; disk->zone_capacity = args->zone_capacity; swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); + if (disk->conv_zones_bitmap) + nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap, + disk->nr_zones); + if (nr_conv_zones >= disk->nr_zones) { + pr_warn("%s: Invalid number of conventional zones %u / %u\n", + disk->disk_name, nr_conv_zones, disk->nr_zones); + return -ENODEV; + } + + if (!disk->zone_wplugs_pool) + return 0; /* * If the device has no limit on the maximum number of open and active @@ -1549,14 +1558,23 @@ static int disk_update_zone_resources(struct gendisk *disk, * dynamic zone write plug allocation when simultaneously writing to * more zones than the size of the mempool. 
*/ - if (disk->zone_wplugs_pool) { - lim = queue_limits_start_update(q); - if (!lim.max_open_zones && !lim.max_active_zones) - lim.max_open_zones = disk->zone_wplugs_pool->min_nr; - return queue_limits_commit_update(q, &lim); + lim = queue_limits_start_update(q); + + nr_seq_zones = disk->nr_zones - nr_conv_zones; + pool_size = max(lim.max_open_zones, lim.max_active_zones); + if (!pool_size) + pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones); + + mempool_resize(disk->zone_wplugs_pool, pool_size); + + if (!lim.max_open_zones && !lim.max_active_zones) { + if (pool_size < nr_seq_zones) + lim.max_open_zones = pool_size; + else + lim.max_open_zones = 0; } - return 0; + return queue_limits_commit_update(q, &lim); } /* -- cgit v1.2.3 From 74b7ae5f48e6f9518a32f50926619eba54be44de Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 1 May 2024 20:08:56 +0900 Subject: block: Fix zone write plug initialization from blk_revalidate_zone_cb() When revalidating the zones of a zoned block device, blk_revalidate_zone_cb() must allocate a zone write plug for any sequential write required zone that is neither empty nor full. However, the current code tests the latter case by comparing the zone write pointer offset to the zone size instead of the zone capacity. Furthermore, disk_get_and_lock_zone_wplug() is called with a sector argument equal to the zone start instead of the current zone write pointer position. This commit fixes both issues by calling disk_get_and_lock_zone_wplug() for a zone that is not empty and has a write pointer offset lower than the zone capacity, and by using the current zone write pointer position as the sector argument for disk_get_and_lock_zone_wplug().
Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240501110907.96950-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 731d1abb80f6..7824bd52c82c 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1664,10 +1664,11 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, * empty nor full. So make sure we have a zone write plug for * such zone if the device has a zone write plug hash table. */ + if (!disk->zone_wplugs_hash) + break; wp_offset = blk_zone_wp_offset(zone); - if (disk->zone_wplugs_hash && - wp_offset && wp_offset < zone_sectors) { - zwplug = disk_get_and_lock_zone_wplug(disk, zone->start, + if (wp_offset && wp_offset < zone->capacity) { + zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags); if (!zwplug) return -ENOMEM; -- cgit v1.2.3 From 19aad274c22b96fc4c0113d87cc8a083c87c467e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 1 May 2024 20:08:57 +0900 Subject: block: Fix reference counting for zone write plugs in error state When zone is reset or finished, disk_zone_wplug_set_wp_offset() is called to update the zone write plug write pointer offset and to clear the zone error state (BLK_ZONE_WPLUG_ERROR flag) if it is set. However, this processing is missing dropping the reference to the zone write plug that was taken in disk_zone_wplug_set_error() when the error flag was first set. Furthermore, the error state handling must release the zone write plug lock to first execute a report zones command. 
When the report zone races with a reset or finish operation that clears the error, we can end up decrementing the zone write plug reference count twice: once in disk_zone_wplug_set_wp_offset() for the reset/finish operation and one more time in disk_zone_wplugs_work() once disk_zone_wplug_handle_error() completes. Fix this by introducing disk_zone_wplug_clear_error() as the symmetric function of disk_zone_wplug_set_error(). disk_zone_wplug_clear_error() decrements the zone write plug reference count obtained in disk_zone_wplug_set_error() only if the error handling has not started yet, that is, only if disk_zone_wplugs_work() has not yet taken the zone write plug off the error list. This ensures that either disk_zone_wplug_clear_error() or disk_zone_wplugs_work() drops the zone write plug reference count. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240501110907.96950-5-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 75 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 7824bd52c82c..23ad1de0da62 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -658,6 +658,54 @@ static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, bio_list_merge(&zwplug->bio_list, &bl); } +static inline void disk_zone_wplug_set_error(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) + return; + + /* + * At this point, we already have a reference on the zone write plug. + * However, since we are going to add the plug to the disk zone write + * plugs work list, increase its reference count.
This reference will + * be dropped in disk_zone_wplugs_work() once the error state is + * handled, or in disk_zone_wplug_clear_error() if the zone is reset or + * finished. + */ + zwplug->flags |= BLK_ZONE_WPLUG_ERROR; + atomic_inc(&zwplug->ref); + + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); +} + +static inline void disk_zone_wplug_clear_error(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) + return; + + /* + * We are racing with the error handling work which drops the reference + * on the zone write plug after handling the error state. So remove the + * plug from the error list and drop its reference count only if the + * error handling has not yet started, that is, if the zone write plug + * is still listed. + */ + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + if (!list_empty(&zwplug->link)) { + list_del_init(&zwplug->link); + zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; + disk_put_zone_wplug(zwplug); + } + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); +} + /* * Set a zone write plug write pointer offset to either 0 (zone reset case) * or to the zone size (zone finish case). This aborts all plugged BIOs, which @@ -691,12 +739,7 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, * in a good state. So clear the error flag and decrement the * error count if we were in error state. 
*/ - if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) { - zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; - spin_lock(&disk->zone_wplugs_lock); - list_del_init(&zwplug->link); - spin_unlock(&disk->zone_wplugs_lock); - } + disk_zone_wplug_clear_error(disk, zwplug); /* * The zone write plug now has no BIO plugged: remove it from the @@ -885,26 +928,6 @@ void blk_zone_write_plug_attempt_merge(struct request *req) spin_unlock_irqrestore(&zwplug->lock, flags); } -static inline void disk_zone_wplug_set_error(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) { - unsigned long flags; - - /* - * Increase the plug reference count. The reference will be - * dropped in disk_zone_wplugs_work() once the error state - * is handled. - */ - zwplug->flags |= BLK_ZONE_WPLUG_ERROR; - atomic_inc(&zwplug->ref); - - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); - list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); - } -} - /* * Check and prepare a BIO for submission by incrementing the write pointer * offset of its zone write plug and changing zone append operations into -- cgit v1.2.3 From 9e78c38ab30b14c1d6a07c61d57ac5e2f12fa568 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 1 May 2024 20:08:58 +0900 Subject: block: Hold a reference on zone write plugs to schedule submission Since a zone write plug BIO work is a field of struct blk_zone_wplug, we must ensure that a zone write plug is never freed when its BIO submission work is queued or running. Do this by holding a reference on the zone write plug when the submission work is scheduled for execution with queue_work() and releasing the reference at the end of the execution of the work function blk_zone_wplug_bio_work(). The helper function disk_zone_wplug_schedule_bio_work() is introduced to get a reference on a zone write plug and queue its work. 
This helper is used in disk_zone_wplug_unplug_bio() and disk_zone_wplug_handle_error(). Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240501110907.96950-6-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 23ad1de0da62..78557f810f1d 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1132,6 +1132,19 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) } EXPORT_SYMBOL_GPL(blk_zone_plug_bio); +static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + /* + * Take a reference on the zone write plug and schedule the submission + * of the next plugged BIO. blk_zone_wplug_bio_work() will release the + * reference we take here. + */ + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); + atomic_inc(&zwplug->ref); + queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); +} + static void disk_zone_wplug_unplug_bio(struct gendisk *disk, struct blk_zone_wplug *zwplug) { @@ -1151,8 +1164,8 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk, /* Schedule submission of the next plugged BIO if we have one. */ if (!bio_list_empty(&zwplug->bio_list)) { + disk_zone_wplug_schedule_bio_work(disk, zwplug); spin_unlock_irqrestore(&zwplug->lock, flags); - queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); return; } @@ -1252,14 +1265,14 @@ static void blk_zone_wplug_bio_work(struct work_struct *work) if (!bio) { zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; spin_unlock_irqrestore(&zwplug->lock, flags); - return; + goto put_zwplug; } if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { /* Error recovery will decide what to do with the BIO. 
*/ bio_list_add_head(&zwplug->bio_list, bio); spin_unlock_irqrestore(&zwplug->lock, flags); - return; + goto put_zwplug; } spin_unlock_irqrestore(&zwplug->lock, flags); @@ -1275,6 +1288,10 @@ static void blk_zone_wplug_bio_work(struct work_struct *work) */ if (bdev->bd_has_submit_bio) blk_queue_exit(bdev->bd_disk->queue); + +put_zwplug: + /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */ + disk_put_zone_wplug(zwplug); } static unsigned int blk_zone_wp_offset(struct blk_zone *zone) @@ -1354,8 +1371,7 @@ static void disk_zone_wplug_handle_error(struct gendisk *disk, /* Restart BIO submission if we still have any BIO left. */ if (!bio_list_empty(&zwplug->bio_list)) { - WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); - queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); + disk_zone_wplug_schedule_bio_work(disk, zwplug); goto unlock; } -- cgit v1.2.3 From 79ae35a4233df5909f8bea0b64eadbebde870de2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 1 May 2024 20:08:59 +0900 Subject: block: Unhash a zone write plug only if needed Fix disk_remove_zone_wplug() to ensure that a zone write plug already removed from a disk hash table of zone write plugs is not removed again. Do this by checking the BLK_ZONE_WPLUG_UNHASHED flag of the plug and calling hlist_del_init_rcu() only if the flag is not set. Furthermore, since BIO completions can happen at any time, that is, decrementing of the zone write plug reference count can happen at any time, make sure to use disk_put_zone_wplug() instead of atomic_dec() to ensure that the zone write plug is freed when its last reference is dropped. In order to do this, disk_remove_zone_wplug() is moved after the definition of disk_put_zone_wplug(). disk_should_remove_zone_wplug() is moved as well to keep it together with disk_remove_zone_wplug(). To be consistent with this change, add a check in disk_put_zone_wplug() to ensure that a zone write plug being freed was already removed from the disk hash table. 
Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240501110907.96950-7-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 55 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 78557f810f1d..2f61ba56dad2 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -476,29 +476,6 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, return true; } -static void disk_remove_zone_wplug(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - unsigned long flags; - - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); - zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; - atomic_dec(&zwplug->ref); - hlist_del_init_rcu(&zwplug->node); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); -} - -static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - /* If the zone is still busy, the plug cannot be removed. */ - if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) - return false; - - /* We can remove zone write plugs for zones that are empty or full. */ - return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity; -} - static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, sector_t sector) { @@ -534,11 +511,43 @@ static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) if (atomic_dec_and_test(&zwplug->ref)) { WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); WARN_ON_ONCE(!list_empty(&zwplug->link)); + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); } } +static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + /* If the zone is still busy, the plug cannot be removed. 
*/ + if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) + return false; + + /* We can remove zone write plugs for zones that are empty or full. */ + return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity; +} + +static void disk_remove_zone_wplug(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + /* If the zone write plug was already removed, we have nothing to do. */ + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) + return; + + /* + * Mark the zone write plug as unhashed and drop the extra reference we + * took when the plug was inserted in the hash table. + */ + zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + hlist_del_init_rcu(&zwplug->node); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + disk_put_zone_wplug(zwplug); +} + static void blk_zone_wplug_bio_work(struct work_struct *work); /* -- cgit v1.2.3 From 7b295187287e0006dd1b0b95f995f00878e436c5 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 1 May 2024 20:09:00 +0900 Subject: block: Do not remove zone write plugs still in use Large write BIOs that span a zone boundary are split in blk_mq_submit_bio() before being passed to blk_zone_plug_bio() for zone write plugging. Such split BIO will be chained with one fragment targeting one zone and the remainder of the BIO targeting the next zone. The two BIOs can be executed in parallel, without a predetermined order relative to each other, and their completion may be reversed: the remainder first completing and the first fragment then completing. In such case, bio_endio() will not immediately execute blk_zone_write_plug_bio_endio() for the parent BIO (the remainder of the split BIO) as the BIOs are chained. blk_zone_write_plug_bio_endio() for the parent BIO will be executed only once the first fragment completes.
In the case of a device with small zones and very large BIOs, such a completion pattern can lead disk_should_remove_zone_wplug() to return true for the zone of the parent BIO when the parent BIO request completes and blk_zone_write_plug_complete_request() is executed. This triggers the removal of the zone write plug from the hash table using disk_remove_zone_wplug(). With the zone write plug of the parent BIO missing, the call to disk_get_zone_wplug() in blk_zone_write_plug_bio_endio() returns NULL and triggers a warning. This pattern can be recreated fairly easily using a scsi_debug device with small zones and btrfs. E.g. modprobe scsi_debug delay=0 dev_size_mb=1024 sector_size=4096 \ zbc=host-managed zone_cap_mb=3 zone_nr_conv=0 zone_size_mb=4 mkfs.btrfs -f -O zoned /dev/sda mount -t btrfs /dev/sda /mnt fio --name=wrtest --rw=randwrite --direct=1 --ioengine=libaio \ --bs=4k --iodepth=16 --size=1M --directory=/mnt --time_based \ --runtime=10 umount /dev/sda Will result in the warning: [ 29.035538] WARNING: CPU: 3 PID: 37 at block/blk-zoned.c:1207 blk_zone_write_plug_bio_endio+0xee/0x1e0 ... [ 29.058682] Call Trace: [ 29.059095] [ 29.059473] ? __warn+0x80/0x120 [ 29.059983] ? blk_zone_write_plug_bio_endio+0xee/0x1e0 [ 29.060728] ? report_bug+0x160/0x190 [ 29.061283] ? handle_bug+0x36/0x70 [ 29.061830] ? exc_invalid_op+0x17/0x60 [ 29.062399] ? asm_exc_invalid_op+0x1a/0x20 [ 29.063025] ? blk_zone_write_plug_bio_endio+0xee/0x1e0 [ 29.063760] bio_endio+0xb7/0x150 [ 29.064280] btrfs_clone_write_end_io+0x2b/0x60 [btrfs] [ 29.065049] blk_update_request+0x17c/0x500 [ 29.065666] scsi_end_request+0x27/0x1a0 [scsi_mod] [ 29.066356] scsi_io_completion+0x5b/0x690 [scsi_mod] [ 29.067077] blk_complete_reqs+0x3a/0x50 [ 29.067692] __do_softirq+0xcf/0x2b3 [ 29.068248] ? sort_range+0x20/0x20 [ 29.068791] run_ksoftirqd+0x1c/0x30 [ 29.069339] smpboot_thread_fn+0xcc/0x1b0 [ 29.069936] kthread+0xcf/0x100 [ 29.070438] ?
kthread_complete_and_exit+0x20/0x20 [ 29.071314] ret_from_fork+0x31/0x50 [ 29.071873] ? kthread_complete_and_exit+0x20/0x20 [ 29.072563] ret_from_fork_asm+0x11/0x20 [ 29.073146] either when fio executes or when unmount is executed. Fix this by modifying disk_should_remove_zone_wplug() to check that the reference count to a zone write plug is not larger than 2, that is, that the only references left on the zone are the caller held reference (blk_zone_write_plug_complete_request()) and the initial extra reference for the zone write plug taken when it was initialized (and that is dropped when the zone write plug is removed from the hash table). To be consistent with this change, make sure to drop the request or BIO held reference to the zone write plug before calling disk_zone_wplug_unplug_bio(). All references are also dropped using disk_put_zone_wplug() instead of atomic_dec() to ensure that the zone write plug is freed if it needs to be. Comments are also improved to clarify zone write plugs reference handling. Reported-by: Shin'ichiro Kawasaki Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20240501110907.96950-8-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 2f61ba56dad2..1e5f362f0409 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -520,10 +520,28 @@ static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, struct blk_zone_wplug *zwplug) { - /* If the zone is still busy, the plug cannot be removed. */ + /* If the zone write plug was already removed, we are done. */ + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) + return false; + + /* If the zone write plug is still busy, it cannot be removed. 
*/ if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) return false; + /* + * Completions of BIOs with blk_zone_write_plug_bio_endio() may + * happen after handling a request completion with + * blk_zone_write_plug_complete_request() (e.g. with split BIOs + * that are chained). In such case, disk_zone_wplug_unplug_bio() + * should not attempt to remove the zone write plug until all BIO + * completions are seen. Check by looking at the zone write plug + * reference count, which is 2 when the plug is unused (one reference + * taken when the plug was allocated and another reference taken by the + * caller context). + */ + if (atomic_read(&zwplug->ref) > 2) + return false; + /* We can remove zone write plugs for zones that are empty or full. */ return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity; } @@ -893,8 +911,9 @@ void blk_zone_write_plug_attempt_merge(struct request *req) struct bio *bio; /* - * Completion of this request needs to be handled with - * blk_zone_write_plug_complete_request(). + * Indicate that completion of this request needs to be handled with + * blk_zone_write_plug_complete_request(), which will drop the reference + * on the zone write plug we took above on entry to this function. */ req->rq_flags |= RQF_ZONE_WRITE_PLUGGING; @@ -1223,6 +1242,9 @@ void blk_zone_write_plug_bio_endio(struct bio *bio) spin_unlock_irqrestore(&zwplug->lock, flags); } + /* Drop the reference we took when the BIO was issued. */ + disk_put_zone_wplug(zwplug); + /* * For BIO-based devices, blk_zone_write_plug_complete_request() * is not called. So we need to schedule execution of the next @@ -1231,8 +1253,7 @@ void blk_zone_write_plug_bio_endio(struct bio *bio) if (bio->bi_bdev->bd_has_submit_bio) disk_zone_wplug_unplug_bio(disk, zwplug); - /* Drop the reference we took when the BIO was issued. */ - atomic_dec(&zwplug->ref); + /* Drop the reference we took when entering this function. 
*/ disk_put_zone_wplug(zwplug); } @@ -1246,13 +1267,15 @@ void blk_zone_write_plug_complete_request(struct request *req) req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING; - disk_zone_wplug_unplug_bio(disk, zwplug); - /* * Drop the reference we took when the request was initialized in * blk_zone_write_plug_attempt_merge(). */ - atomic_dec(&zwplug->ref); + disk_put_zone_wplug(zwplug); + + disk_zone_wplug_unplug_bio(disk, zwplug); + + /* Drop the reference we took when entering this function. */ disk_put_zone_wplug(zwplug); } -- cgit v1.2.3 From af147b740f111730c2e387ee6c0ac3ada7d51117 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 1 May 2024 20:09:01 +0900 Subject: block: Fix flush request sector restore Make sure that a request bio is not NULL before trying to restore the request start sector. Reported-by: Yi Zhang Fixes: 6f8fd758de63 ("block: Restore sector of flush requests") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240501110907.96950-9-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-flush.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 2f58ae018464..c17cf8ed8113 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -130,7 +130,8 @@ static void blk_flush_restore_request(struct request *rq) * original @rq->bio. Restore it. 
*/ rq->bio = rq->biotail; - rq->__sector = rq->bio->bi_iter.bi_sector; + if (rq->bio) + rq->__sector = rq->bio->bi_iter.bi_sector; /* make @rq a normal request */ rq->rq_flags &= ~RQF_FLUSH_SEQ; -- cgit v1.2.3 From 096bc7ea335bc5dfbaed1d005ff27f008ec9d710 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 1 May 2024 20:09:02 +0900 Subject: block: Fix handling of non-empty flush write requests to zones Zone write plugging ignores empty (no data) flush operations but handles flush BIOs that have data to ensure that the flush machinery generated write is processed in order. However, the call to blk_zone_write_plug_attempt_merge() which sets a request RQF_ZONE_WRITE_PLUGGING flag is called after blk_insert_flush(), thus failing to indicate that a non-empty flush request completion needs handling by zone write plugging. Fix this by moving the call to blk_zone_write_plug_attempt_merge() before blk_insert_flush(). And while at it, rename that function as blk_zone_write_plug_init_request() to be clear that it is not just about merging plugged BIOs in the request. While at it, also add a WARN_ON_ONCE() check that the zone write plug for the request is not NULL.
Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240501110907.96950-10-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- block/blk-zoned.c | 12 ++++++++---- block/blk.h | 4 ++-- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 434d45219e23..0fae9bd0ecd4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3001,12 +3001,12 @@ new_request: return; } + if (bio_zone_write_plugging(bio)) + blk_zone_write_plug_init_request(rq); + if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; - if (bio_zone_write_plugging(bio)) - blk_zone_write_plug_attempt_merge(rq); - if (plug) { blk_add_rq_to_plug(plug, rq); return; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 1e5f362f0409..cd0049f5bf2f 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -874,8 +874,9 @@ void blk_zone_write_plug_bio_merged(struct bio *bio) /* * If the BIO was already plugged, then we were called through - * blk_zone_write_plug_attempt_merge() -> blk_attempt_bio_merge(). - * For this case, blk_zone_write_plug_attempt_merge() will handle the + * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge(). + * For this case, we already hold a reference on the zone write plug for + * the BIO and blk_zone_write_plug_init_request() will handle the * zone write pointer offset update. */ if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) @@ -899,7 +900,7 @@ void blk_zone_write_plug_bio_merged(struct bio *bio) * already went through zone write plugging (either a new BIO or one that was * unplugged). 
 */ -void blk_zone_write_plug_attempt_merge(struct request *req) +void blk_zone_write_plug_init_request(struct request *req) { sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req); struct request_queue *q = req->q; @@ -910,6 +911,9 @@ void blk_zone_write_plug_attempt_merge(struct request *req) unsigned long flags; struct bio *bio; + if (WARN_ON_ONCE(!zwplug)) + return; + /* * Indicate that completion of this request needs to be handled with * blk_zone_write_plug_complete_request(), which will drop the reference @@ -1269,7 +1273,7 @@ void blk_zone_write_plug_complete_request(struct request *req) /* * Drop the reference we took when the request was initialized in - * blk_zone_write_plug_attempt_merge(). + * blk_zone_write_plug_init_request(). */ disk_put_zone_wplug(zwplug); diff --git a/block/blk.h b/block/blk.h index 1140c4a0be03..8a62b861453c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -427,7 +427,7 @@ static inline bool bio_is_zone_append(struct bio *bio) bio_flagged(bio, BIO_EMULATES_ZONE_APPEND); } void blk_zone_write_plug_bio_merged(struct bio *bio); -void blk_zone_write_plug_attempt_merge(struct request *rq); +void blk_zone_write_plug_init_request(struct request *rq); static inline void blk_zone_update_request_bio(struct request *rq, struct bio *bio) { @@ -481,7 +481,7 @@ static inline bool bio_is_zone_append(struct bio *bio) static inline void blk_zone_write_plug_bio_merged(struct bio *bio) { } -static inline void blk_zone_write_plug_attempt_merge(struct request *rq) +static inline void blk_zone_write_plug_init_request(struct request *rq) { } static inline void blk_zone_update_request_bio(struct request *rq, -- cgit v1.2.3 From c4c3ffdab2e26780f6f7c9959a473b2c652f4d13 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 1 May 2024 20:09:03 +0900 Subject: block: Improve blk_zone_write_plug_bio_merged() Improve blk_zone_write_plug_bio_merged() to check that we successfully get a reference on the zone write plug of the merged BIO, as expected
since for a merge we already have at least one request and one BIO referencing the zone write plug. Comments in this function are also improved to better explain the references to the BIO zone write plug. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240501110907.96950-11-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index cd0049f5bf2f..1890b6d55d8b 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -885,11 +885,16 @@ void blk_zone_write_plug_bio_merged(struct bio *bio) bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); /* - * Increase the plug reference count and advance the zone write - * pointer offset. + * Get a reference on the zone write plug of the target zone and advance + * the zone write pointer offset. Given that this is a merge, we already + * have at least one request and one BIO referencing the zone write + * plug. So this should not fail. */ zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); + if (WARN_ON_ONCE(!zwplug)) + return; + spin_lock_irqsave(&zwplug->lock, flags); zwplug->wp_offset += bio_sectors(bio); spin_unlock_irqrestore(&zwplug->lock, flags); -- cgit v1.2.3 From 347bde9da10f410b8134a82d6096105cad44e1c1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 1 May 2024 20:09:04 +0900 Subject: block: Improve zone write request completion handling blk_zone_complete_request() must be called to handle the completion of a zone write request handled with zone write plugging. This function is called from blk_complete_request(), blk_update_request() and also in blk_mq_submit_bio() error path. Improve this by moving this function call into blk_mq_finish_request() as all requests are processed with this function when they complete as well as when they are freed without being executed. 
This also improves blk_update_request() used by scsi devices as these may repeatedly call this function to handle partial completions. To be consistent with this change, blk_zone_complete_request() is renamed to blk_zone_finish_request() and blk_zone_write_plug_complete_request() is renamed to blk_zone_write_plug_finish_request(). Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240501110907.96950-12-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++---- block/blk-zoned.c | 11 ++++++----- block/blk.h | 8 ++++---- 3 files changed, 12 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 0fae9bd0ecd4..9f677ea85a52 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -691,6 +691,8 @@ static void blk_mq_finish_request(struct request *rq) { struct request_queue *q = rq->q; + blk_zone_finish_request(rq); + if (rq->rq_flags & RQF_USE_SCHED) { q->elevator->type->ops.finish_request(rq); /* @@ -828,8 +830,6 @@ static void blk_complete_request(struct request *req) bio = next; } while (bio); - blk_zone_complete_request(req); - /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request @@ -940,7 +940,6 @@ bool blk_update_request(struct request *req, blk_status_t error, * completely done */ if (!req->bio) { - blk_zone_complete_request(req); /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request @@ -2996,7 +2995,6 @@ new_request: if (ret != BLK_STS_OK) { bio->bi_status = ret; bio_endio(bio); - blk_zone_complete_request(rq); blk_mq_free_request(rq); return; } diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 1890b6d55d8b..759e85e9167c 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -531,7 +531,7 @@ static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, /* * Completions of BIOs with blk_zone_write_plug_bio_endio() 
may * happen after handling a request completion with - * blk_zone_write_plug_complete_request() (e.g. with split BIOs + * blk_zone_write_plug_finish_request() (e.g. with split BIOs * that are chained). In such case, disk_zone_wplug_unplug_bio() * should not attempt to remove the zone write plug until all BIO * completions are seen. Check by looking at the zone write plug @@ -921,7 +921,7 @@ void blk_zone_write_plug_init_request(struct request *req) /* * Indicate that completion of this request needs to be handled with - * blk_zone_write_plug_complete_request(), which will drop the reference + * blk_zone_write_plug_finish_request(), which will drop the reference * on the zone write plug we took above on entry to this function. */ req->rq_flags |= RQF_ZONE_WRITE_PLUGGING; @@ -1255,7 +1255,7 @@ void blk_zone_write_plug_bio_endio(struct bio *bio) disk_put_zone_wplug(zwplug); /* - * For BIO-based devices, blk_zone_write_plug_complete_request() + * For BIO-based devices, blk_zone_write_plug_finish_request() * is not called. So we need to schedule execution of the next * plugged BIO here. 
*/ @@ -1266,11 +1266,12 @@ void blk_zone_write_plug_bio_endio(struct bio *bio) disk_put_zone_wplug(zwplug); } -void blk_zone_write_plug_complete_request(struct request *req) +void blk_zone_write_plug_finish_request(struct request *req) { struct gendisk *disk = req->q->disk; - struct blk_zone_wplug *zwplug = disk_get_zone_wplug(disk, req->__sector); + struct blk_zone_wplug *zwplug; + zwplug = disk_get_zone_wplug(disk, req->__sector); if (WARN_ON_ONCE(!zwplug)) return; diff --git a/block/blk.h b/block/blk.h index 8a62b861453c..ee4f782d1496 100644 --- a/block/blk.h +++ b/block/blk.h @@ -453,11 +453,11 @@ static inline void blk_zone_bio_endio(struct bio *bio) blk_zone_write_plug_bio_endio(bio); } -void blk_zone_write_plug_complete_request(struct request *rq); -static inline void blk_zone_complete_request(struct request *rq) +void blk_zone_write_plug_finish_request(struct request *rq); +static inline void blk_zone_finish_request(struct request *rq) { if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING) - blk_zone_write_plug_complete_request(rq); + blk_zone_write_plug_finish_request(rq); } int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg); @@ -491,7 +491,7 @@ static inline void blk_zone_update_request_bio(struct request *rq, static inline void blk_zone_bio_endio(struct bio *bio) { } -static inline void blk_zone_complete_request(struct request *rq) +static inline void blk_zone_finish_request(struct request *rq) { } static inline int blkdev_report_zones_ioctl(struct block_device *bdev, -- cgit v1.2.3 From b5a64ec2ea2be2a7f7eb73c243c2381e9fc1c71b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 1 May 2024 20:09:05 +0900 Subject: block: Simplify blk_zone_write_plug_bio_endio() We already have the disk variable obtained from the bio when calling disk_get_zone_wplug(). So use that variable instead of dereferencing the bio bdev again for the disk argument of disk_get_zone_wplug(). 
 Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240501110907.96950-13-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 759e85e9167c..132eb988f4d7 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1222,8 +1222,7 @@ void blk_zone_write_plug_bio_endio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; struct blk_zone_wplug *zwplug = - disk_get_zone_wplug(bio->bi_bdev->bd_disk, - bio->bi_iter.bi_sector); + disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); unsigned long flags; if (WARN_ON_ONCE(!zwplug)) -- cgit v1.2.3 From c9c8aea03c4ac2ea902bc7dd5ba14f5d78af8ece Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 1 May 2024 20:09:06 +0900 Subject: block: Simplify zone write plug BIO abort When BIOs plugged in a zone write plug are aborted, blk_zone_wplug_bio_io_error() clears the BIO BIO_ZONE_WRITE_PLUGGING flag so that bio_io_error(bio) does not end up calling blk_zone_write_plug_bio_endio() and we thus need to manually drop the reference on the zone write plug held by the aborted BIO. Move the call to disk_put_zone_wplug() that is always following the call to blk_zone_wplug_bio_io_error() inside that function to simplify the code.
 Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240501110907.96950-14-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 132eb988f4d7..15e4e14e16f7 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -634,12 +634,14 @@ again: return zwplug; } -static inline void blk_zone_wplug_bio_io_error(struct bio *bio) +static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, + struct bio *bio) { - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct request_queue *q = zwplug->disk->queue; bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); bio_io_error(bio); + disk_put_zone_wplug(zwplug); blk_queue_exit(q); } @@ -650,10 +652,8 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) { struct bio *bio; - while ((bio = bio_list_pop(&zwplug->bio_list))) { - blk_zone_wplug_bio_io_error(bio); - disk_put_zone_wplug(zwplug); - } + while ((bio = bio_list_pop(&zwplug->bio_list))) + blk_zone_wplug_bio_io_error(zwplug, bio); } /* @@ -673,8 +673,7 @@ static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, if (wp_offset >= zone_capacity || (bio_op(bio) != REQ_OP_ZONE_APPEND && bio_offset_from_zone_start(bio) != wp_offset)) { - blk_zone_wplug_bio_io_error(bio); - disk_put_zone_wplug(zwplug); + blk_zone_wplug_bio_io_error(zwplug, bio); continue; } -- cgit v1.2.3 From d7580149efc5c86c4e72f9263b97c062616a84dd Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 1 May 2024 20:09:07 +0900 Subject: block: Cleanup blk_revalidate_zone_cb() Define the code for checking conventional and sequential write required zones using the functions blk_revalidate_conv_zone() and blk_revalidate_seq_zone() respectively. This simplifies the zone type switch-case in blk_revalidate_zone_cb(). No functional changes.
Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20240501110907.96950-15-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 129 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 77 insertions(+), 52 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 15e4e14e16f7..48e5e3bbb89c 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1656,6 +1656,74 @@ static int disk_update_zone_resources(struct gendisk *disk, return queue_limits_commit_update(q, &lim); } +static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, + struct blk_revalidate_zone_args *args) +{ + struct gendisk *disk = args->disk; + struct request_queue *q = disk->queue; + + if (zone->capacity != zone->len) { + pr_warn("%s: Invalid conventional zone capacity\n", + disk->disk_name); + return -ENODEV; + } + + if (!disk_need_zone_resources(disk)) + return 0; + + if (!args->conv_zones_bitmap) { + args->conv_zones_bitmap = + blk_alloc_zone_bitmap(q->node, args->nr_zones); + if (!args->conv_zones_bitmap) + return -ENOMEM; + } + + set_bit(idx, args->conv_zones_bitmap); + + return 0; +} + +static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, + struct blk_revalidate_zone_args *args) +{ + struct gendisk *disk = args->disk; + struct blk_zone_wplug *zwplug; + unsigned int wp_offset; + unsigned long flags; + + /* + * Remember the capacity of the first sequential zone and check + * if it is constant for all zones. + */ + if (!args->zone_capacity) + args->zone_capacity = zone->capacity; + if (zone->capacity != args->zone_capacity) { + pr_warn("%s: Invalid variable zone capacity\n", + disk->disk_name); + return -ENODEV; + } + + /* + * We need to track the write pointer of all zones that are not + * empty nor full. So make sure we have a zone write plug for + * such zone if the device has a zone write plug hash table. 
+ */ + if (!disk->zone_wplugs_hash) + return 0; + + wp_offset = blk_zone_wp_offset(zone); + if (!wp_offset || wp_offset >= zone->capacity) + return 0; + + zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags); + if (!zwplug) + return -ENOMEM; + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + + return 0; +} + /* * Helper function to check the validity of zones of a zoned block device. */ @@ -1664,12 +1732,9 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, { struct blk_revalidate_zone_args *args = data; struct gendisk *disk = args->disk; - struct request_queue *q = disk->queue; sector_t capacity = get_capacity(disk); - sector_t zone_sectors = q->limits.chunk_sectors; - struct blk_zone_wplug *zwplug; - unsigned long flags; - unsigned int wp_offset; + sector_t zone_sectors = disk->queue->limits.chunk_sectors; + int ret; /* Check for bad zones and holes in the zone report */ if (zone->start != args->sector) { @@ -1709,62 +1774,22 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, /* Check zone type */ switch (zone->type) { case BLK_ZONE_TYPE_CONVENTIONAL: - if (zone->capacity != zone->len) { - pr_warn("%s: Invalid conventional zone capacity\n", - disk->disk_name); - return -ENODEV; - } - - if (!disk_need_zone_resources(disk)) - break; - if (!args->conv_zones_bitmap) { - args->conv_zones_bitmap = - blk_alloc_zone_bitmap(q->node, args->nr_zones); - if (!args->conv_zones_bitmap) - return -ENOMEM; - } - set_bit(idx, args->conv_zones_bitmap); + ret = blk_revalidate_conv_zone(zone, idx, args); break; case BLK_ZONE_TYPE_SEQWRITE_REQ: - /* - * Remember the capacity of the first sequential zone and check - * if it is constant for all zones. 
- */ - if (!args->zone_capacity) - args->zone_capacity = zone->capacity; - if (zone->capacity != args->zone_capacity) { - pr_warn("%s: Invalid variable zone capacity\n", - disk->disk_name); - return -ENODEV; - } - - /* - * We need to track the write pointer of all zones that are not - * empty nor full. So make sure we have a zone write plug for - * such zone if the device has a zone write plug hash table. - */ - if (!disk->zone_wplugs_hash) - break; - wp_offset = blk_zone_wp_offset(zone); - if (wp_offset && wp_offset < zone->capacity) { - zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, - GFP_NOIO, &flags); - if (!zwplug) - return -ENOMEM; - spin_unlock_irqrestore(&zwplug->lock, flags); - disk_put_zone_wplug(zwplug); - } - + ret = blk_revalidate_seq_zone(zone, idx, args); break; case BLK_ZONE_TYPE_SEQWRITE_PREF: default: pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", disk->disk_name, (int)zone->type, zone->start); - return -ENODEV; + ret = -ENODEV; } - args->sector += zone->len; - return 0; + if (!ret) + args->sector += zone->len; + + return ret; } /** -- cgit v1.2.3 From 140ce28dd3bee8e53acc27f123ae474d69ef66f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 May 2024 15:00:32 +0200 Subject: block: add a disk_has_partscan helper Add a helper to check if partition scanning is enabled instead of open coding the check in a few places. This now always checks for the hidden flag even if all but one of the callers are never reachable for hidden gendisks. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240502130033.1958492-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 7 ++----- block/partitions/core.c | 5 +---- include/linux/blkdev.h | 13 +++++++++++++ 3 files changed, 16 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index eb893df56d51..4b85963d09db 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -345,9 +345,7 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) struct file *file; int ret = 0; - if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) - return -EINVAL; - if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + if (!disk_has_partscan(disk)) return -EINVAL; if (disk->open_partitions) return -EBUSY; @@ -503,8 +501,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, goto out_unregister_bdi; /* Make sure the first partition scan will be proceed */ - if (get_capacity(disk) && !(disk->flags & GENHD_FL_NO_PART) && - !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + if (get_capacity(disk) && disk_has_partscan(disk)) set_bit(GD_NEED_PART_SCAN, &disk->state); bdev_add(disk->part0, ddev->devt); diff --git a/block/partitions/core.c b/block/partitions/core.c index b11e88c82c8c..37b5f92d07fe 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -573,10 +573,7 @@ static int blk_add_partitions(struct gendisk *disk) struct parsed_partitions *state; int ret = -EAGAIN, p; - if (disk->flags & GENHD_FL_NO_PART) - return 0; - - if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + if (!disk_has_partscan(disk)) return 0; state = check_partition(disk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 040a22e0eda0..3b18a40a1fc1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -230,6 +230,19 @@ static inline unsigned int disk_openers(struct gendisk *disk) return atomic_read(&disk->part0->bd_openers); } +/** + * disk_has_partscan - return %true if 
 partition scanning is enabled on a disk + * @disk: disk to check + * + * Returns %true if partitions scanning is enabled for @disk, or %false if + * partition scanning is disabled either permanently or temporarily. + */ +static inline bool disk_has_partscan(struct gendisk *disk) +{ + return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) && + !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state); +} + /* * The gendisk is refcounted by the part0 block_device, and the bd_device * therein is also used for device model presentation in sysfs. -- cgit v1.2.3 From a4217c6740dc64a3eb6815868a9260825e8c68c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 May 2024 15:00:33 +0200 Subject: block: add a partscan sysfs attribute for disks Userspace had been unknowingly relying on a non-stable interface of kernel internals to determine if partition scanning is enabled for a given disk. Provide a stable interface for this purpose instead. Cc: stable@vger.kernel.org # 6.3+ Depends-on: 140ce28dd3be ("block: add a disk_has_partscan helper") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/linux-block/ZhQJf8mzq_wipkBH@gardel-login/ Link: https://lore.kernel.org/r/20240502130033.1958492-3-hch@lst.de [axboe: add links and commit message from Keith] Signed-off-by: Jens Axboe --- Documentation/ABI/stable/sysfs-block | 10 ++++++++++ block/genhd.c | 8 ++++++++ 2 files changed, 18 insertions(+) (limited to 'block') diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 1fe9a553c37b..f0025d1c3d5a 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -101,6 +101,16 @@ Description: devices that support receiving integrity metadata. +What: /sys/block/<disk>/partscan +Date: May 2024 +Contact: Christoph Hellwig +Description: + The /sys/block/<disk>/partscan files reports if partition + scanning is enabled for the disk. It returns "1" if partition + scanning is enabled, or "0" if not.
 The value type is a 32-bit + unsigned integer, but only "0" and "1" are valid values. + + What: /sys/block/<disk>/<partition>/alignment_offset Date: April 2009 Contact: Martin K. Petersen diff --git a/block/genhd.c b/block/genhd.c index 4b85963d09db..dec2ee338fb4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1044,6 +1044,12 @@ static ssize_t diskseq_show(struct device *dev, return sprintf(buf, "%llu\n", disk->diskseq); } +static ssize_t partscan_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); +} + static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); @@ -1057,6 +1063,7 @@ static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); +static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, @@ -1103,6 +1110,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_events_async.attr, &dev_attr_events_poll_msecs.attr, &dev_attr_diskseq.attr, + &dev_attr_partscan.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif -- cgit v1.2.3 From 0c12028aec837f5a002009bbf68d179d506510e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 May 2024 10:10:42 +0200 Subject: block: refine the EOF check in blkdev_iomap_begin blkdev_iomap_begin rounds down the offset to the logical block size before stashing it in iomap->offset and checking that it still is inside the inode size. Apply the i_size check to the raw pos value so that we don't try a zero size write if iter->pos is unaligned.
Fixes: 487c607df790 ("block: use iomap for writes to block devices") Reported-by: syzbot+0a3683a0a6fecf909244@syzkaller.appspotmail.com Signed-off-by: Christoph Hellwig Tested-by: syzbot+0a3683a0a6fecf909244@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20240503081042.2078062-1-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/fops.c b/block/fops.c index c091ea43bca3..5159ef3a1948 100644 --- a/block/fops.c +++ b/block/fops.c @@ -385,7 +385,7 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->bdev = bdev; iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); - if (iomap->offset >= isize) + if (offset >= isize) return -EIO; iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; -- cgit v1.2.3 From bc2e07dfd2c49aaa4b52302cf7b55cf94e025f79 Mon Sep 17 00:00:00 2001 From: INAGAKI Hiroshi Date: Sun, 21 Apr 2024 16:39:52 +0900 Subject: block: fix and simplify blkdevparts= cmdline parsing Fix the cmdline parsing of the "blkdevparts=" parameter using strsep(), which makes the code simpler. Before commit 146afeb235cc ("block: use strscpy() to instead of strncpy()"), we used a strncpy() to copy a block device name and partition names. The commit simply replaced a strncpy() and NULL termination with a strscpy(). It did not update calculations of length passed to strscpy(). While the length passed to strncpy() is just a length of valid characters without NULL termination ('\0'), strscpy() takes it as a length of the destination buffer, including a NULL termination. Since the source buffer is not necessarily NULL terminated, the current code copies "length - 1" characters and puts a NULL character in the destination buffer. It replaces the last character with NULL and breaks the parsing. 
As an example, that buffer will be passed to parse_parts() and breaks parsing sub-partitions due to the missing ')' at the end, like the following. example (Check Point V-80 & OpenWrt): - Linux Kernel 6.6 [ 0.000000] Kernel command line: console=ttyS0,115200 earlycon=uart8250,mmio32,0xf0512000 crashkernel=30M mvpp2x.queue_mode=1 blkdevparts=mmcblk1:48M@10M(kernel-1),1M(dtb-1),720M(rootfs-1),48M(kernel-2),1M(dtb-2),720M(rootfs-2),300M(default_sw),650M(logs),1M(preset_cfg),1M(adsl),-(storage) maxcpus=4 ... [ 0.884016] mmc1: new HS200 MMC card at address 0001 [ 0.889951] mmcblk1: mmc1:0001 004GA0 3.69 GiB [ 0.895043] cmdline partition format is invalid. [ 0.895704] mmcblk1: p1 [ 0.903447] mmcblk1boot0: mmc1:0001 004GA0 2.00 MiB [ 0.908667] mmcblk1boot1: mmc1:0001 004GA0 2.00 MiB [ 0.913765] mmcblk1rpmb: mmc1:0001 004GA0 512 KiB, chardev (248:0) 1. "48M@10M(kernel-1),..." is passed to strscpy() with length=17 from parse_parts() 2. strscpy() returns -E2BIG and the destination buffer has "48M@10M(kernel-1\0" 3. "48M@10M(kernel-1\0" is passed to parse_subpart() 4. parse_subpart() fails to find ')' when parsing a partition name, and returns error - Linux Kernel 6.1 [ 0.000000] Kernel command line: console=ttyS0,115200 earlycon=uart8250,mmio32,0xf0512000 crashkernel=30M mvpp2x.queue_mode=1 blkdevparts=mmcblk1:48M@10M(kernel-1),1M(dtb-1),720M(rootfs-1),48M(kernel-2),1M(dtb-2),720M(rootfs-2),300M(default_sw),650M(logs),1M(preset_cfg),1M(adsl),-(storage) maxcpus=4 ... 
[ 0.953142] mmc1: new HS200 MMC card at address 0001 [ 0.959114] mmcblk1: mmc1:0001 004GA0 3.69 GiB [ 0.964259] mmcblk1: p1(kernel-1) p2(dtb-1) p3(rootfs-1) p4(kernel-2) p5(dtb-2) 6(rootfs-2) p7(default_sw) p8(logs) p9(preset_cfg) p10(adsl) p11(storage) [ 0.979174] mmcblk1boot0: mmc1:0001 004GA0 2.00 MiB [ 0.984674] mmcblk1boot1: mmc1:0001 004GA0 2.00 MiB [ 0.989926] mmcblk1rpmb: mmc1:0001 004GA0 512 KiB, chardev (248:0 By the way, strscpy() takes a length of destination buffer and it is often confusing when copying characters with a specified length. Using strsep() helps to separate the string by the specified character. Then, we can use strscpy() naturally with the size of the destination buffer. Separating the string on the fly is also useful to omit the redundant string copy, reducing memory usage and improve the code readability. Fixes: 146afeb235cc ("block: use strscpy() to instead of strncpy()") Suggested-by: Naohiro Aota Signed-off-by: INAGAKI Hiroshi Reviewed-by: Daniel Golle Link: https://lore.kernel.org/r/20240421074005.565-1-musashino.open@gmail.com Signed-off-by: Jens Axboe --- block/partitions/cmdline.c | 49 ++++++++++++---------------------------------- 1 file changed, 12 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index c03bc105e575..152c85df92b2 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -70,8 +70,8 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) } if (*partdef == '(') { - int length; - char *next = strchr(++partdef, ')'); + partdef++; + char *next = strsep(&partdef, ")"); if (!next) { pr_warn("cmdline partition format is invalid."); @@ -79,11 +79,7 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) goto fail; } - length = min_t(int, next - partdef, - sizeof(new_subpart->name) - 1); - strscpy(new_subpart->name, partdef, length); - - partdef = ++next; + strscpy(new_subpart->name, next, 
sizeof(new_subpart->name)); } else new_subpart->name[0] = '\0'; @@ -117,14 +113,12 @@ static void free_subpart(struct cmdline_parts *parts) } } -static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +static int parse_parts(struct cmdline_parts **parts, char *bdevdef) { int ret = -EINVAL; char *next; - int length; struct cmdline_subpart **next_subpart; struct cmdline_parts *newparts; - char buf[BDEVNAME_SIZE + 32 + 4]; *parts = NULL; @@ -132,28 +126,19 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) if (!newparts) return -ENOMEM; - next = strchr(bdevdef, ':'); + next = strsep(&bdevdef, ":"); if (!next) { pr_warn("cmdline partition has no block device."); goto fail; } - length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); - strscpy(newparts->name, bdevdef, length); + strscpy(newparts->name, next, sizeof(newparts->name)); newparts->nr_subparts = 0; next_subpart = &newparts->subpart; - while (next && *(++next)) { - bdevdef = next; - next = strchr(bdevdef, ','); - - length = (!next) ? 
(sizeof(buf) - 1) : - min_t(int, next - bdevdef, sizeof(buf) - 1); - - strscpy(buf, bdevdef, length); - - ret = parse_subpart(next_subpart, buf); + while ((next = strsep(&bdevdef, ","))) { + ret = parse_subpart(next_subpart, next); if (ret) goto fail; @@ -199,24 +184,17 @@ static int cmdline_parts_parse(struct cmdline_parts **parts, *parts = NULL; - next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + pbuf = buf = kstrdup(cmdline, GFP_KERNEL); if (!buf) return -ENOMEM; next_parts = parts; - while (next && *pbuf) { - next = strchr(pbuf, ';'); - if (next) - *next = '\0'; - - ret = parse_parts(next_parts, pbuf); + while ((next = strsep(&pbuf, ";"))) { + ret = parse_parts(next_parts, next); if (ret) goto fail; - if (next) - pbuf = ++next; - next_parts = &(*next_parts)->next_parts; } @@ -250,7 +228,6 @@ static struct cmdline_parts *bdev_parts; static int add_part(int slot, struct cmdline_subpart *subpart, struct parsed_partitions *state) { - int label_min; struct partition_meta_info *info; char tmp[sizeof(info->volname) + 4]; @@ -262,9 +239,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart, info = &state->parts[slot].info; - label_min = min_t(int, sizeof(info->volname) - 1, - sizeof(subpart->name)); - strscpy(info->volname, subpart->name, label_min); + strscpy(info->volname, subpart->name, sizeof(info->volname)); snprintf(tmp, sizeof(tmp), "(%s)", info->volname); strlcat(state->pp_buf, tmp, PAGE_SIZE); -- cgit v1.2.3 From ccb326b5f9e623eb7f130fbbf2505ec0e2dcaff9 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Tue, 7 May 2024 03:53:49 +0000 Subject: block/ioctl: prefer different overflow check Running syzkaller with the newly reintroduced signed integer overflow sanitizer shows this report: [ 62.982337] ------------[ cut here ]------------ [ 62.985692] cgroup: Invalid name [ 62.986211] UBSAN: signed-integer-overflow in ../block/ioctl.c:36:46 [ 62.989370] 9pnet_fd: p9_fd_create_tcp (7343): problem connecting socket to 127.0.0.1 [ 62.992992] 
9223372036854775807 + 4095 cannot be represented in type 'long long' [ 62.997827] 9pnet_fd: p9_fd_create_tcp (7345): problem connecting socket to 127.0.0.1 [ 62.999369] random: crng reseeded on system resumption [ 63.000634] GUP no longer grows the stack in syz-executor.2 (7353): 20002000-20003000 (20001000) [ 63.000668] CPU: 0 PID: 7353 Comm: syz-executor.2 Not tainted 6.8.0-rc2-00035-gb3ef86b5a957 #1 [ 63.000677] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 63.000682] Call Trace: [ 63.000686] [ 63.000731] dump_stack_lvl+0x93/0xd0 [ 63.000919] __get_user_pages+0x903/0xd30 [ 63.001030] __gup_longterm_locked+0x153e/0x1ba0 [ 63.001041] ? _raw_read_unlock_irqrestore+0x17/0x50 [ 63.001072] ? try_get_folio+0x29c/0x2d0 [ 63.001083] internal_get_user_pages_fast+0x1119/0x1530 [ 63.001109] iov_iter_extract_pages+0x23b/0x580 [ 63.001206] bio_iov_iter_get_pages+0x4de/0x1220 [ 63.001235] iomap_dio_bio_iter+0x9b6/0x1410 [ 63.001297] __iomap_dio_rw+0xab4/0x1810 [ 63.001316] iomap_dio_rw+0x45/0xa0 [ 63.001328] ext4_file_write_iter+0xdde/0x1390 [ 63.001372] vfs_write+0x599/0xbd0 [ 63.001394] ksys_write+0xc8/0x190 [ 63.001403] do_syscall_64+0xd4/0x1b0 [ 63.001421] ? 
arch_exit_to_user_mode_prepare+0x3a/0x60 [ 63.001479] entry_SYSCALL_64_after_hwframe+0x6f/0x77 [ 63.001535] RIP: 0033:0x7f7fd3ebf539 [ 63.001551] Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 f1 14 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 [ 63.001562] RSP: 002b:00007f7fd32570c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 63.001584] RAX: ffffffffffffffda RBX: 00007f7fd3ff3f80 RCX: 00007f7fd3ebf539 [ 63.001590] RDX: 4db6d1e4f7e43360 RSI: 0000000020000000 RDI: 0000000000000004 [ 63.001595] RBP: 00007f7fd3f1e496 R08: 0000000000000000 R09: 0000000000000000 [ 63.001599] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [ 63.001604] R13: 0000000000000006 R14: 00007f7fd3ff3f80 R15: 00007ffd415ad2b8 ... [ 63.018142] ---[ end trace ]--- Historically, the signed integer overflow sanitizer did not work in the kernel due to its interaction with `-fwrapv` but this has since been changed [1] in the newest version of Clang; It was re-enabled in the kernel with Commit 557f8c582a9ba8ab ("ubsan: Reintroduce signed overflow sanitizer"). Let's rework this overflow checking logic to not actually perform an overflow during the check itself, thus avoiding the UBSAN splat. 
[1]: https://github.com/llvm/llvm-project/pull/82432 Signed-off-by: Justin Stitt Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240507-b4-sio-block-ioctl-v3-1-ba0c2b32275e@google.com Signed-off-by: Jens Axboe --- block/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 0c76137adcaa..06ff3023e2a1 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -33,7 +33,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, if (op == BLKPG_DEL_PARTITION) return bdev_del_partition(disk, p.pno); - if (p.start < 0 || p.length <= 0 || p.start + p.length < 0) + if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start) return -EINVAL; /* Check that the partition is aligned to the block size */ if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev))) -- cgit v1.2.3 From 0942592045782e76a9d52c409955c2dc313cbd30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 May 2024 06:20:22 +0200 Subject: block: remove the discard_granularity check in __blkdev_issue_discard We now set a default granularity in the queue limits API, so don't bother with this extra check. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240506042027.2289826-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-lib.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index a6954eafb8c8..7ec3e170e7f6 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -46,13 +46,6 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; - /* In case the discard granularity isn't set by buggy device driver */ - if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) { - pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n", - bdev); - return -EOPNOTSUPP; - } - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; if ((sector | nr_sects) & bs_mask) return -EINVAL; -- cgit v1.2.3 From 30f1e724142242a453f92d90b33e030014900bf0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 May 2024 06:20:23 +0200 Subject: block: move discard checks into the ioctl handler Most bio operations get basic sanity checking in submit_bio and anything more complicated than that is done in the callers. Discards are a bit different from that in that a lot of checking is done in __blkdev_issue_discard, and the specific errnos for that are returned to userspace. Move the checks that require specific errnos to the ioctl handler instead, and just leave the basic sanity checking in submit_bio for the other handlers. This introduces two changes in behavior: 1) the logical block size alignment check of the start and len is lost for non-ioctl callers. This matches what is done for other operations including reads and writes. We should probably verify this for all bios, but for now make discards match the normal flow. 2) for non-ioctl callers all errors are reported on I/O completion now instead of synchronously. 
Callers in general mostly ignore or log errors so this will actually simplify the code once cleaned up Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240506042027.2289826-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-lib.c | 13 ------------- block/ioctl.c | 7 +++++-- 2 files changed, 5 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 7ec3e170e7f6..6e54ef140bab 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -39,19 +39,6 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { struct bio *bio = *biop; - sector_t bs_mask; - - if (bdev_read_only(bdev)) - return -EPERM; - if (!bdev_max_discard_sectors(bdev)) - return -EOPNOTSUPP; - - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if ((sector | nr_sects) & bs_mask) - return -EINVAL; - - if (!nr_sects) - return -EINVAL; while (nr_sects) { sector_t req_sects = diff --git a/block/ioctl.c b/block/ioctl.c index 06ff3023e2a1..49fa02b17ec1 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -95,6 +95,7 @@ static int compat_blkpg_ioctl(struct block_device *bdev, static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { + unsigned int bs_mask = bdev_logical_block_size(bdev) - 1; uint64_t range[2]; uint64_t start, len; struct inode *inode = bdev->bd_inode; @@ -105,6 +106,8 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; + if (bdev_read_only(bdev)) + return -EPERM; if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; @@ -112,9 +115,9 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, start = range[0]; len = range[1]; - if (start & 511) + if (!len) return -EINVAL; - if (len & 511) + if ((start | len) & bs_mask) return -EINVAL; if (start + len > bdev_nr_bytes(bdev)) -- cgit v1.2.3 From 
81c2168c229bab0665e862937bb476f18cff056d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 May 2024 06:20:24 +0200 Subject: block: add a bio_chain_and_submit helper This is basically blk_next_bio just with the bio allocation moved to the caller to allow for more flexible bio handling in the caller. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240506042027.2289826-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 27 +++++++++++++++++++-------- include/linux/bio.h | 1 + 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 38baedb39c6f..d82ef4fd545c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -345,18 +345,29 @@ void bio_chain(struct bio *bio, struct bio *parent) } EXPORT_SYMBOL(bio_chain); -struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, - unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) +/** + * bio_chain_and_submit - submit a bio after chaining it to another one + * @prev: bio to chain and submit + * @new: bio to chain to + * + * If @prev is non-NULL, chain it to @new and submit it. + * + * Return: @new. 
+ */ +struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new) { - struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp); - - if (bio) { - bio_chain(bio, new); - submit_bio(bio); + if (prev) { + bio_chain(prev, new); + submit_bio(prev); } - return new; } + +struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, + unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) +{ + return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp)); +} EXPORT_SYMBOL_GPL(blk_next_bio); static void bio_alloc_rescue(struct work_struct *work) diff --git a/include/linux/bio.h b/include/linux/bio.h index 9b8a369f44bc..283a0dcbd1de 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -831,5 +831,6 @@ static inline void bio_clear_polled(struct bio *bio) struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, blk_opf_t opf, gfp_t gfp); +struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new); #endif /* __LINUX_BIO_H */ -- cgit v1.2.3 From e8b4869bc78da1a71f2a2ab476caf50c1dcfeed0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 May 2024 06:20:25 +0200 Subject: block: add a blk_alloc_discard_bio helper Factor out a helper from __blkdev_issue_discard that chews off as much as possible from a discard range and allocates a bio for it. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240506042027.2289826-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-lib.c | 50 +++++++++++++++++++++++++++++--------------------- include/linux/bio.h | 3 +++ 2 files changed, 32 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 6e54ef140bab..442da9dad042 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -35,31 +35,39 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; } -int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) +struct bio *blk_alloc_discard_bio(struct block_device *bdev, + sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask) { - struct bio *bio = *biop; + sector_t bio_sects = min(*nr_sects, bio_discard_limit(bdev, *sector)); + struct bio *bio; - while (nr_sects) { - sector_t req_sects = - min(nr_sects, bio_discard_limit(bdev, sector)); + if (!bio_sects) + return NULL; - bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask); - bio->bi_iter.bi_sector = sector; - bio->bi_iter.bi_size = req_sects << 9; - sector += req_sects; - nr_sects -= req_sects; - - /* - * We can loop for a long time in here, if someone does - * full device discards (like mkfs). Be nice and allow - * us to schedule out to avoid softlocking if preempt - * is disabled. - */ - cond_resched(); - } + bio = bio_alloc(bdev, 0, REQ_OP_DISCARD, gfp_mask); + if (!bio) + return NULL; + bio->bi_iter.bi_sector = *sector; + bio->bi_iter.bi_size = bio_sects << SECTOR_SHIFT; + *sector += bio_sects; + *nr_sects -= bio_sects; + /* + * We can loop for a long time in here if someone does full device + * discards (like mkfs). Be nice and allow us to schedule out to avoid + * softlocking if preempt is disabled. 
+ */ + cond_resched(); + return bio; +} - *biop = bio; +int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) +{ + struct bio *bio; + + while ((bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, + gfp_mask))) + *biop = bio_chain_and_submit(*biop, bio); return 0; } EXPORT_SYMBOL(__blkdev_issue_discard); diff --git a/include/linux/bio.h b/include/linux/bio.h index 283a0dcbd1de..d5379548d684 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -833,4 +833,7 @@ struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, blk_opf_t opf, gfp_t gfp); struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new); +struct bio *blk_alloc_discard_bio(struct block_device *bdev, + sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask); + #endif /* __LINUX_BIO_H */ -- cgit v1.2.3 From 0f8e9ecc4636e3abb4f3cf1ead14c94cce7dfde8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 6 May 2024 06:20:26 +0200 Subject: block: add a bio_await_chain helper Add a helper to wait for an entire chain of bios to complete.
[hch: split from a larger patch, moved and changed the name now that it is non-static] Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240506042027.2289826-6-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 20 ++++++++++++++++++++ block/blk.h | 1 + 2 files changed, 21 insertions(+) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index d82ef4fd545c..dce12a0efdea 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1395,6 +1395,26 @@ int submit_bio_wait(struct bio *bio) } EXPORT_SYMBOL(submit_bio_wait); +static void bio_wait_end_io(struct bio *bio) +{ + complete(bio->bi_private); + bio_put(bio); +} + +/* + * bio_await_chain - ends @bio and waits for every chained bio to complete + */ +void bio_await_chain(struct bio *bio) +{ + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); + + bio->bi_private = &done; + bio->bi_end_io = bio_wait_end_io; + bio_endio(bio); + blk_wait_io(&done); +} + void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) diff --git a/block/blk.h b/block/blk.h index ee4f782d1496..d5107e65355e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -38,6 +38,7 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void blk_queue_start_drain(struct request_queue *q); int __bio_queue_enter(struct request_queue *q, struct bio *bio); void submit_bio_noacct_nocheck(struct bio *bio); +void bio_await_chain(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { -- cgit v1.2.3 From 719c15a75ebf3bda3ca718fe8e0ce63d262ec7ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 May 2024 06:20:27 +0200 Subject: blk-lib: check for kill signal in ioctl BLKDISCARD Discards can access a significant capacity and take longer than the user expected. A user may change their mind about wanting to run that command and attempt to kill the process and do something else with their device. 
But since the task is uninterruptible, they have to wait for it to finish, which could be many hours. Open code blkdev_issue_discard in the BLKDISCARD ioctl handler and check for a fatal signal at each iteration so the user doesn't have to wait for their regretted operation to complete naturally. Heavily based on an earlier patch from Keith Busch. Reported-by: Conrad Meyer Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240506042027.2289826-7-hch@lst.de Signed-off-by: Jens Axboe --- block/ioctl.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 49fa02b17ec1..d7a6c6931a1e 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -96,9 +96,11 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { unsigned int bs_mask = bdev_logical_block_size(bdev) - 1; - uint64_t range[2]; - uint64_t start, len; struct inode *inode = bdev->bd_inode; + uint64_t range[2], start, len; + struct bio *prev = NULL, *bio; + sector_t sector, nr_sects; + struct blk_plug plug; int err; if (!(mode & BLK_OPEN_WRITE)) @@ -127,7 +129,32 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, err = truncate_bdev_range(bdev, mode, start, start + len - 1); if (err) goto fail; - err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); + + sector = start >> SECTOR_SHIFT; + nr_sects = len >> SECTOR_SHIFT; + + blk_start_plug(&plug); + while (1) { + if (fatal_signal_pending(current)) { + if (prev) + bio_await_chain(prev); + err = -EINTR; + goto out_unplug; + } + bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, + GFP_KERNEL); + if (!bio) + break; + prev = bio_chain_and_submit(prev, bio); + } + if (prev) { + err = submit_bio_wait(prev); + if (err == -EOPNOTSUPP) + err = 0; + bio_put(prev); + } +out_unplug: + blk_finish_plug(&plug); fail: filemap_invalidate_unlock(inode->i_mapping); return err; -- cgit v1.2.3 From
060406c61c7cb4bbd82a02d179decca9c9bb3443 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 9 May 2024 20:38:25 +0800 Subject: block: add plug while submitting IO So that if caller didn't use plug, for example, __blkdev_direct_IO_simple() and __blkdev_direct_IO_async(), block layer can still benefit from caching nsec time in the plug. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240509123825.3225207-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 47400a4fe851..8efee8faa4b6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -646,11 +646,13 @@ static void __submit_bio(struct bio *bio) static void __submit_bio_noacct(struct bio *bio) { struct bio_list bio_list_on_stack[2]; + struct blk_plug plug; BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; + blk_start_plug(&plug); do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); @@ -684,19 +686,23 @@ static void __submit_bio_noacct(struct bio *bio) bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); + blk_finish_plug(&plug); current->bio_list = NULL; } static void __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; + struct blk_plug plug; current->bio_list = bio_list; + blk_start_plug(&plug); do { __submit_bio(bio); } while ((bio = bio_list_pop(&bio_list[0]))); + blk_finish_plug(&plug); current->bio_list = NULL; } -- cgit v1.2.3 From 99dc422335d8b2bd4d105797241d3e715bae90e9 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 9 May 2024 20:37:16 +0800 Subject: block: support to account io_ticks precisely Currently, io_ticks is accounted based on sampling, specifically update_io_ticks() will always account io_ticks by 1 jiffies from bdev_start_io_acct()/blk_account_io_start(), and the result can be inaccurate, for example(HZ is 
250): Test script: fio -filename=/dev/sda -bs=4k -rw=write -direct=1 -name=test -thinktime=4ms Test result: util is about 90%, while the disk is really idle. This behaviour is introduced by commit 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting"), however, there was a key point that is missed that this patch also improve performance a lot: Before the commit: part_round_stats: if (part->stamp != now) stats |= 1; part_in_flight() -> there can be lots of task here in 1 jiffies. part_round_stats_single() __part_stat_add() part->stamp = now; After the commit: update_io_ticks: stamp = part->bd_stamp; if (time_after(now, stamp)) if (try_cmpxchg()) __part_stat_add() -> only one task can reach here in 1 jiffies. Hence in order to account io_ticks precisely, we only need to know if there are IO inflight at most once in one jiffies. Noted that for rq-based device, iterating tags should not be used here because 'tags->lock' is grabbed in blk_mq_find_and_get_req(), hence part_stat_lock_inc/dec() and part_in_flight() is used to trace inflight. The additional overhead is quite little: - per cpu add/dec for each IO for rq-based device; - per cpu sum for each jiffies; And it's verified by null-blk that there is no performance degradation under heavy IO pressure.
Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240509123717.3223892-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-core.c | 9 +++++---- block/blk-merge.c | 2 ++ block/blk-mq.c | 4 ++++ block/blk.h | 1 + block/genhd.c | 2 +- 5 files changed, 13 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 8efee8faa4b6..8566bbd8aeba 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -984,10 +984,11 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end) unsigned long stamp; again: stamp = READ_ONCE(part->bd_stamp); - if (unlikely(time_after(now, stamp))) { - if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now))) - __part_stat_add(part, io_ticks, end ? now - stamp : 1); - } + if (unlikely(time_after(now, stamp)) && + likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) && + (end || part_in_flight(part))) + __part_stat_add(part, io_ticks, now - stamp); + if (part->bd_partno) { part = bdev_whole(part); goto again; diff --git a/block/blk-merge.c b/block/blk-merge.c index f64115d72f3d..8534c35e0497 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -780,6 +780,8 @@ static void blk_account_io_merge_request(struct request *req) if (blk_do_io_stat(req)) { part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); + part_stat_local_dec(req->part, + in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } diff --git a/block/blk-mq.c b/block/blk-mq.c index 9f677ea85a52..8e01e4b32e10 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -996,6 +996,8 @@ static inline void blk_account_io_done(struct request *req, u64 now) update_io_ticks(req->part, jiffies, true); part_stat_inc(req->part, ios[sgrp]); part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); + part_stat_local_dec(req->part, + in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } @@ 
-1018,6 +1020,8 @@ static inline void blk_account_io_start(struct request *req) part_stat_lock(); update_io_ticks(req->part, jiffies, false); + part_stat_local_inc(req->part, + in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } diff --git a/block/blk.h b/block/blk.h index d5107e65355e..3870bdcd5cad 100644 --- a/block/blk.h +++ b/block/blk.h @@ -366,6 +366,7 @@ static inline bool blk_do_io_stat(struct request *rq) } void update_io_ticks(struct block_device *part, unsigned long now, bool end); +unsigned int part_in_flight(struct block_device *part); static inline void req_set_nomerge(struct request_queue *q, struct request *req) { diff --git a/block/genhd.c b/block/genhd.c index dec2ee338fb4..8f1163d2d171 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -118,7 +118,7 @@ static void part_stat_read_all(struct block_device *part, } } -static unsigned int part_in_flight(struct block_device *part) +unsigned int part_in_flight(struct block_device *part) { unsigned int inflight = 0; int cpu; -- cgit v1.2.3 From 7be835694daebbb4adffbc461519081aa0cf28e1 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 9 May 2024 20:37:17 +0800 Subject: block: fix that util can be greater than 100% util means the percentage that disk has IO, and theoretically it should not be greater than 100%. However, there is a gap for rq-based disk: io_ticks will be updated when rq is allocated, however, before such rq dispatch to driver, it will not be account as inflight from blk_mq_start_request() hence diskstats_show()/part_stat_show() will not update io_ticks. For example: 1) at t0, issue a new IO, rq is allocated, and blk_account_io_start() update io_ticks; 2) something is wrong with drivers, and the rq can't be dispatched; 3) at t0 + 10s, drivers recovers and rq is dispatched and done, io_ticks is updated; Then if user is using "iostat 1" to monitor "util", between t0 - t0+9s, util will be zero, and between t0+9s - t0+10s, util will be 1000%. 
Fix this problem by updating io_ticks from diskstats_show() and part_stat_show() if there are rq allocated. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240509123717.3223892-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/genhd.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 8f1163d2d171..7f39fbe60753 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -951,15 +951,10 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); - struct request_queue *q = bdev_get_queue(bdev); struct disk_stats stat; unsigned int inflight; - if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, bdev); - else - inflight = part_in_flight(bdev); - + inflight = part_in_flight(bdev); if (inflight) { part_stat_lock(); update_io_ticks(bdev, jiffies, true); @@ -1256,11 +1251,8 @@ static int diskstats_show(struct seq_file *seqf, void *v) xa_for_each(&gp->part_tbl, idx, hd) { if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) continue; - if (queue_is_mq(gp->queue)) - inflight = blk_mq_in_flight(gp->queue, hd); - else - inflight = part_in_flight(hd); + inflight = part_in_flight(hd); if (inflight) { part_stat_lock(); update_io_ticks(hd, jiffies, true); -- cgit v1.2.3 From bf20ab538c81bb32edab86f503fc0c55d8243bbc Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 9 May 2024 20:11:06 +0800 Subject: blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW On the one hand, it's marked EXPERIMENTAL since 2017, and looks like there are no users since then, and no testers and no developers, it's just not active at all. On the other hand, even if the config is disabled, there are still many fields in throtl_grp and throtl_data and many functions that are only used for throtl low.
At last, currently blk-throtl is initialized during disk initialization, and destroyed during disk removal, and it exposes many functions to be called directly from block layer. Remove throtl low to make code much more cleaner and follow up work much easier. Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20240509121107.3195568-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- Documentation/ABI/stable/sysfs-block | 12 - arch/loongarch/configs/loongson3_defconfig | 1 - block/Kconfig | 11 - block/bio.c | 1 - block/blk-stat.c | 3 - block/blk-sysfs.c | 7 - block/blk-throttle.c | 888 ++--------------------------- block/blk-throttle.h | 26 +- block/blk.h | 11 - 9 files changed, 42 insertions(+), 918 deletions(-) (limited to 'block') diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index f0025d1c3d5a..831f19a32e08 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -594,18 +594,6 @@ Description: the data. If no such restriction exists, this file will contain '0'. This file is writable for testing purposes. - -What: /sys/block//queue/throttle_sample_time -Date: March 2017 -Contact: linux-block@vger.kernel.org -Description: - [RW] This is the time window that blk-throttle samples data, in - millisecond. blk-throttle makes decision based on the - samplings. Lower time means cgroups have more smooth throughput, - but higher CPU overhead. This exists only when - CONFIG_BLK_DEV_THROTTLING_LOW is enabled. 
- - What: /sys/block//queue/virt_boundary_mask Date: April 2021 Contact: linux-block@vger.kernel.org diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index f18c2ba871ef..fc0d89d4c1c5 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -76,7 +76,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y CONFIG_BLK_DEV_ZONED=y CONFIG_BLK_DEV_THROTTLING=y -CONFIG_BLK_DEV_THROTTLING_LOW=y CONFIG_BLK_WBT=y CONFIG_BLK_CGROUP_IOLATENCY=y CONFIG_BLK_CGROUP_FC_APPID=y diff --git a/block/Kconfig b/block/Kconfig index d47398ae9824..dc12af58dbae 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -119,17 +119,6 @@ config BLK_DEV_THROTTLING See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information. -config BLK_DEV_THROTTLING_LOW - bool "Block throttling .low limit interface support (EXPERIMENTAL)" - depends on BLK_DEV_THROTTLING - help - Add .low limit interface for block throttling. The low limit is a best - effort limit to prioritize cgroups. Depending on the setting, the limit - can be used to protect cgroups in terms of bandwidth/iops and better - utilize disk resource. - - Note, this is an experimental interface and could be changed someday. - config BLK_WBT bool "Enable support for block device writeback throttling" help diff --git a/block/bio.c b/block/bio.c index dce12a0efdea..53f608028c78 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1629,7 +1629,6 @@ again: goto again; } - blk_throtl_bio_endio(bio); /* release cgroup info */ bio_uninit(bio); if (bio->bi_end_io) diff --git a/block/blk-stat.c b/block/blk-stat.c index e42c263e53fb..eaf60097bbe1 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -57,9 +57,6 @@ void blk_stat_add(struct request *rq, u64 now) value = (now >= rq->io_start_time_ns) ? 
now - rq->io_start_time_ns : 0; - if (req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE) - blk_throtl_stat_add(rq, value); - rcu_read_lock(); cpu = get_cpu(); list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e3ed5a921aff..8796c350b33d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -516,10 +516,6 @@ QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time"); -#endif - /* legacy alias for logical_block_size: */ static struct queue_sysfs_entry queue_hw_sector_size_entry = { .attr = {.name = "hw_sector_size", .mode = 0444 }, @@ -640,9 +636,6 @@ static struct attribute *queue_attrs[] = { &queue_fua_entry.attr, &queue_dax_entry.attr, &queue_poll_delay_entry.attr, -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - &blk_throtl_sample_time_entry.attr, -#endif &queue_virt_boundary_mask_entry.attr, &queue_dma_alignment_entry.attr, NULL, diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c515e1a96fad..d907040859f9 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -25,18 +25,6 @@ #define DFL_THROTL_SLICE_HD (HZ / 10) #define DFL_THROTL_SLICE_SSD (HZ / 50) #define MAX_THROTL_SLICE (HZ) -#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ -#define MIN_THROTL_BPS (320 * 1024) -#define MIN_THROTL_IOPS (10) -#define DFL_LATENCY_TARGET (-1L) -#define DFL_IDLE_THRESHOLD (0) -#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */ -#define LATENCY_FILTERED_SSD (0) -/* - * For HD, very small latency comes from sequential IO. 
Such IO is helpless to - * help determine if its IO is impacted by others, hence we ignore the IO - */ -#define LATENCY_FILTERED_HD (1000L) /* 1ms */ /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; @@ -70,19 +58,6 @@ struct throtl_data /* Work for dispatching throttled bios */ struct work_struct dispatch_work; - unsigned int limit_index; - bool limit_valid[LIMIT_CNT]; - - unsigned long low_upgrade_time; - unsigned long low_downgrade_time; - - unsigned int scale; - - struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE]; - struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE]; - struct latency_bucket __percpu *latency_buckets[2]; - unsigned long last_calculate_time; - unsigned long filtered_latency; bool track_bio_latency; }; @@ -126,84 +101,24 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) return container_of(sq, struct throtl_data, service_queue); } -/* - * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to - * make the IO dispatch more smooth. - * Scale up: linearly scale up according to elapsed time since upgrade. 
For - * every throtl_slice, the limit scales up 1/2 .low limit till the - * limit hits .max limit - * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit - */ -static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td) -{ - /* arbitrary value to avoid too big scale */ - if (td->scale < 4096 && time_after_eq(jiffies, - td->low_upgrade_time + td->scale * td->throtl_slice)) - td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice; - - return low + (low >> 1) * td->scale; -} - static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); - struct throtl_data *td; - uint64_t ret; if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return U64_MAX; - td = tg->td; - ret = tg->bps[rw][td->limit_index]; - if (ret == 0 && td->limit_index == LIMIT_LOW) { - /* intermediate node or iops isn't 0 */ - if (!list_empty(&blkg->blkcg->css.children) || - tg->iops[rw][td->limit_index]) - return U64_MAX; - else - return MIN_THROTL_BPS; - } - - if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] && - tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) { - uint64_t adjusted; - - adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td); - ret = min(tg->bps[rw][LIMIT_MAX], adjusted); - } - return ret; + return tg->bps[rw]; } static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); - struct throtl_data *td; - unsigned int ret; if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return UINT_MAX; - td = tg->td; - ret = tg->iops[rw][td->limit_index]; - if (ret == 0 && tg->td->limit_index == LIMIT_LOW) { - /* intermediate node or bps isn't 0 */ - if (!list_empty(&blkg->blkcg->css.children) || - tg->bps[rw][td->limit_index]) - return UINT_MAX; - else - return MIN_THROTL_IOPS; - } - - if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && - tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) { - uint64_t adjusted; - - adjusted = 
throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td); - if (adjusted > UINT_MAX) - adjusted = UINT_MAX; - ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted); - } - return ret; + return tg->iops[rw]; } #define request_bucket_index(sectors) \ @@ -359,20 +274,10 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, } RB_CLEAR_NODE(&tg->rb_node); - tg->bps[READ][LIMIT_MAX] = U64_MAX; - tg->bps[WRITE][LIMIT_MAX] = U64_MAX; - tg->iops[READ][LIMIT_MAX] = UINT_MAX; - tg->iops[WRITE][LIMIT_MAX] = UINT_MAX; - tg->bps_conf[READ][LIMIT_MAX] = U64_MAX; - tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; - tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; - tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; - /* LIMIT_LOW will have default value 0 */ - - tg->latency_target = DFL_LATENCY_TARGET; - tg->latency_target_conf = DFL_LATENCY_TARGET; - tg->idletime_threshold = DFL_IDLE_THRESHOLD; - tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; + tg->bps[READ] = U64_MAX; + tg->bps[WRITE] = U64_MAX; + tg->iops[READ] = UINT_MAX; + tg->iops[WRITE] = UINT_MAX; return &tg->pd; @@ -418,18 +323,15 @@ static void throtl_pd_init(struct blkg_policy_data *pd) static void tg_update_has_rules(struct throtl_grp *tg) { struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); - struct throtl_data *td = tg->td; int rw; for (rw = READ; rw <= WRITE; rw++) { tg->has_rules_iops[rw] = (parent_tg && parent_tg->has_rules_iops[rw]) || - (td->limit_valid[td->limit_index] && - tg_iops_limit(tg, rw) != UINT_MAX); + tg_iops_limit(tg, rw) != UINT_MAX; tg->has_rules_bps[rw] = (parent_tg && parent_tg->has_rules_bps[rw]) || - (td->limit_valid[td->limit_index] && - (tg_bps_limit(tg, rw) != U64_MAX)); + tg_bps_limit(tg, rw) != U64_MAX; } } @@ -443,49 +345,6 @@ static void throtl_pd_online(struct blkg_policy_data *pd) tg_update_has_rules(tg); } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -static void blk_throtl_update_limit_valid(struct throtl_data *td) -{ - struct cgroup_subsys_state *pos_css; - struct 
blkcg_gq *blkg; - bool low_valid = false; - - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - - if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || - tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) { - low_valid = true; - break; - } - } - rcu_read_unlock(); - - td->limit_valid[LIMIT_LOW] = low_valid; -} -#else -static inline void blk_throtl_update_limit_valid(struct throtl_data *td) -{ -} -#endif - -static void throtl_upgrade_state(struct throtl_data *td); -static void throtl_pd_offline(struct blkg_policy_data *pd) -{ - struct throtl_grp *tg = pd_to_tg(pd); - - tg->bps[READ][LIMIT_LOW] = 0; - tg->bps[WRITE][LIMIT_LOW] = 0; - tg->iops[READ][LIMIT_LOW] = 0; - tg->iops[WRITE][LIMIT_LOW] = 0; - - blk_throtl_update_limit_valid(tg->td); - - if (!tg->td->limit_valid[tg->td->limit_index]) - throtl_upgrade_state(tg->td); -} - static void throtl_pd_free(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); @@ -1151,8 +1010,6 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) return nr_disp; } -static bool throtl_can_upgrade(struct throtl_data *td, - struct throtl_grp *this_tg); /** * throtl_pending_timer_fn - timer function for service_queue->pending_timer * @t: the pending_timer member of the throtl_service_queue being serviced @@ -1189,9 +1046,6 @@ static void throtl_pending_timer_fn(struct timer_list *t) if (!q->root_blkg) goto out_unlock; - if (throtl_can_upgrade(td, NULL)) - throtl_upgrade_state(td); - again: parent_sq = sq->parent_sq; dispatched = false; @@ -1331,22 +1185,12 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) blkg_for_each_descendant_pre(blkg, pos_css, global ? 
tg->td->queue->root_blkg : tg_to_blkg(tg)) { struct throtl_grp *this_tg = blkg_to_tg(blkg); - struct throtl_grp *parent_tg; tg_update_has_rules(this_tg); /* ignore root/second level */ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || !blkg->parent->parent) continue; - parent_tg = blkg_to_tg(blkg->parent); - /* - * make sure all children has lower idle time threshold and - * higher latency target - */ - this_tg->idletime_threshold = min(this_tg->idletime_threshold, - parent_tg->idletime_threshold); - this_tg->latency_target = max(this_tg->latency_target, - parent_tg->latency_target); } rcu_read_unlock(); @@ -1444,25 +1288,25 @@ static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", - .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]), + .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.write_bps_device", - .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]), + .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.read_iops_device", - .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]), + .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.write_iops_device", - .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]), + .private = offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, @@ -1500,55 +1344,36 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, if (!dname) return 0; - if (off == LIMIT_LOW) { - bps_dft = 0; - iops_dft = 0; - } else { - bps_dft = U64_MAX; - iops_dft = UINT_MAX; - } + bps_dft = U64_MAX; + iops_dft = UINT_MAX; - if (tg->bps_conf[READ][off] == bps_dft && - tg->bps_conf[WRITE][off] == 
bps_dft && - tg->iops_conf[READ][off] == iops_dft && - tg->iops_conf[WRITE][off] == iops_dft && - (off != LIMIT_LOW || - (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD && - tg->latency_target_conf == DFL_LATENCY_TARGET))) + if (tg->bps_conf[READ] == bps_dft && + tg->bps_conf[WRITE] == bps_dft && + tg->iops_conf[READ] == iops_dft && + tg->iops_conf[WRITE] == iops_dft) return 0; seq_printf(sf, "%s", dname); - if (tg->bps_conf[READ][off] == U64_MAX) + if (tg->bps_conf[READ] == U64_MAX) seq_printf(sf, " rbps=max"); else - seq_printf(sf, " rbps=%llu", tg->bps_conf[READ][off]); + seq_printf(sf, " rbps=%llu", tg->bps_conf[READ]); - if (tg->bps_conf[WRITE][off] == U64_MAX) + if (tg->bps_conf[WRITE] == U64_MAX) seq_printf(sf, " wbps=max"); else - seq_printf(sf, " wbps=%llu", tg->bps_conf[WRITE][off]); + seq_printf(sf, " wbps=%llu", tg->bps_conf[WRITE]); - if (tg->iops_conf[READ][off] == UINT_MAX) + if (tg->iops_conf[READ] == UINT_MAX) seq_printf(sf, " riops=max"); else - seq_printf(sf, " riops=%u", tg->iops_conf[READ][off]); + seq_printf(sf, " riops=%u", tg->iops_conf[READ]); - if (tg->iops_conf[WRITE][off] == UINT_MAX) + if (tg->iops_conf[WRITE] == UINT_MAX) seq_printf(sf, " wiops=max"); else - seq_printf(sf, " wiops=%u", tg->iops_conf[WRITE][off]); + seq_printf(sf, " wiops=%u", tg->iops_conf[WRITE]); - if (off == LIMIT_LOW) { - if (tg->idletime_threshold_conf == ULONG_MAX) - seq_printf(sf, " idle=max"); - else - seq_printf(sf, " idle=%lu", tg->idletime_threshold_conf); - - if (tg->latency_target_conf == ULONG_MAX) - seq_printf(sf, " latency=max"); - else - seq_printf(sf, " latency=%lu", tg->latency_target_conf); - } seq_printf(sf, "\n"); return 0; } @@ -1567,10 +1392,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, struct blkg_conf_ctx ctx; struct throtl_grp *tg; u64 v[4]; - unsigned long idle_time; - unsigned long latency_time; int ret; - int index = of_cft(of)->private; blkg_conf_init(&ctx, buf); @@ -1581,13 +1403,11 @@ static ssize_t 
tg_set_limit(struct kernfs_open_file *of, tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); - v[0] = tg->bps_conf[READ][index]; - v[1] = tg->bps_conf[WRITE][index]; - v[2] = tg->iops_conf[READ][index]; - v[3] = tg->iops_conf[WRITE][index]; + v[0] = tg->bps[READ]; + v[1] = tg->bps[WRITE]; + v[2] = tg->iops[READ]; + v[3] = tg->iops[WRITE]; - idle_time = tg->idletime_threshold_conf; - latency_time = tg->latency_target_conf; while (true) { char tok[27]; /* wiops=18446744073709551616 */ char *p; @@ -1619,60 +1439,16 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, v[2] = min_t(u64, val, UINT_MAX); else if (!strcmp(tok, "wiops") && val > 1) v[3] = min_t(u64, val, UINT_MAX); - else if (off == LIMIT_LOW && !strcmp(tok, "idle")) - idle_time = val; - else if (off == LIMIT_LOW && !strcmp(tok, "latency")) - latency_time = val; else goto out_finish; } - tg->bps_conf[READ][index] = v[0]; - tg->bps_conf[WRITE][index] = v[1]; - tg->iops_conf[READ][index] = v[2]; - tg->iops_conf[WRITE][index] = v[3]; + tg->bps[READ] = v[0]; + tg->bps[WRITE] = v[1]; + tg->iops[READ] = v[2]; + tg->iops[WRITE] = v[3]; - if (index == LIMIT_MAX) { - tg->bps[READ][index] = v[0]; - tg->bps[WRITE][index] = v[1]; - tg->iops[READ][index] = v[2]; - tg->iops[WRITE][index] = v[3]; - } - tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW], - tg->bps_conf[READ][LIMIT_MAX]); - tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW], - tg->bps_conf[WRITE][LIMIT_MAX]); - tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW], - tg->iops_conf[READ][LIMIT_MAX]); - tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW], - tg->iops_conf[WRITE][LIMIT_MAX]); - tg->idletime_threshold_conf = idle_time; - tg->latency_target_conf = latency_time; - - /* force user to configure all settings for low limit */ - if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] || - tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) || - tg->idletime_threshold_conf == 
DFL_IDLE_THRESHOLD || - tg->latency_target_conf == DFL_LATENCY_TARGET) { - tg->bps[READ][LIMIT_LOW] = 0; - tg->bps[WRITE][LIMIT_LOW] = 0; - tg->iops[READ][LIMIT_LOW] = 0; - tg->iops[WRITE][LIMIT_LOW] = 0; - tg->idletime_threshold = DFL_IDLE_THRESHOLD; - tg->latency_target = DFL_LATENCY_TARGET; - } else if (index == LIMIT_LOW) { - tg->idletime_threshold = tg->idletime_threshold_conf; - tg->latency_target = tg->latency_target_conf; - } - - blk_throtl_update_limit_valid(tg->td); - if (tg->td->limit_valid[LIMIT_LOW]) { - if (index == LIMIT_LOW) - tg->td->limit_index = LIMIT_LOW; - } else - tg->td->limit_index = LIMIT_MAX; - tg_conf_updated(tg, index == LIMIT_LOW && - tg->td->limit_valid[LIMIT_LOW]); + tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_exit(&ctx); @@ -1680,21 +1456,11 @@ out_finish: } static struct cftype throtl_files[] = { -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - { - .name = "low", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = tg_print_limit, - .write = tg_set_limit, - .private = LIMIT_LOW, - }, -#endif { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = tg_print_limit, .write = tg_set_limit, - .private = LIMIT_MAX, }, { } /* terminate */ }; @@ -1713,7 +1479,6 @@ struct blkcg_policy blkcg_policy_throtl = { .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, - .pd_offline_fn = throtl_pd_offline, .pd_free_fn = throtl_pd_free, }; @@ -1762,418 +1527,6 @@ void blk_throtl_cancel_bios(struct gendisk *disk) spin_unlock_irq(&q->queue_lock); } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) -{ - unsigned long rtime = jiffies, wtime = jiffies; - - if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]) - rtime = tg->last_low_overflow_time[READ]; - if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) - wtime = tg->last_low_overflow_time[WRITE]; - return min(rtime, wtime); -} - -static unsigned long tg_last_low_overflow_time(struct 
throtl_grp *tg) -{ - struct throtl_service_queue *parent_sq; - struct throtl_grp *parent = tg; - unsigned long ret = __tg_last_low_overflow_time(tg); - - while (true) { - parent_sq = parent->service_queue.parent_sq; - parent = sq_to_tg(parent_sq); - if (!parent) - break; - - /* - * The parent doesn't have low limit, it always reaches low - * limit. Its overflow time is useless for children - */ - if (!parent->bps[READ][LIMIT_LOW] && - !parent->iops[READ][LIMIT_LOW] && - !parent->bps[WRITE][LIMIT_LOW] && - !parent->iops[WRITE][LIMIT_LOW]) - continue; - if (time_after(__tg_last_low_overflow_time(parent), ret)) - ret = __tg_last_low_overflow_time(parent); - } - return ret; -} - -static bool throtl_tg_is_idle(struct throtl_grp *tg) -{ - /* - * cgroup is idle if: - * - single idle is too long, longer than a fixed value (in case user - * configure a too big threshold) or 4 times of idletime threshold - * - average think time is more than threshold - * - IO latency is largely below threshold - */ - unsigned long time; - bool ret; - - time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); - ret = tg->latency_target == DFL_LATENCY_TARGET || - tg->idletime_threshold == DFL_IDLE_THRESHOLD || - (blk_time_get_ns() >> 10) - tg->last_finish_time > time || - tg->avg_idletime > tg->idletime_threshold || - (tg->latency_target && tg->bio_cnt && - tg->bad_bio_cnt * 5 < tg->bio_cnt); - throtl_log(&tg->service_queue, - "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d", - tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt, - tg->bio_cnt, ret, tg->td->scale); - return ret; -} - -static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw) -{ - struct throtl_service_queue *sq = &tg->service_queue; - bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW]; - - /* - * if low limit is zero, low limit is always reached. 
- * if low limit is non-zero, we can check if there is any request - * is queued to determine if low limit is reached as we throttle - * request according to limit. - */ - return !limit || sq->nr_queued[rw]; -} - -static bool throtl_tg_can_upgrade(struct throtl_grp *tg) -{ - /* - * cgroup reaches low limit when low limit of READ and WRITE are - * both reached, it's ok to upgrade to next limit if cgroup reaches - * low limit - */ - if (throtl_low_limit_reached(tg, READ) && - throtl_low_limit_reached(tg, WRITE)) - return true; - - if (time_after_eq(jiffies, - tg_last_low_overflow_time(tg) + tg->td->throtl_slice) && - throtl_tg_is_idle(tg)) - return true; - return false; -} - -static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) -{ - while (true) { - if (throtl_tg_can_upgrade(tg)) - return true; - tg = sq_to_tg(tg->service_queue.parent_sq); - if (!tg || !tg_to_blkg(tg)->parent) - return false; - } - return false; -} - -static bool throtl_can_upgrade(struct throtl_data *td, - struct throtl_grp *this_tg) -{ - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; - - if (td->limit_index != LIMIT_LOW) - return false; - - if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice)) - return false; - - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - - if (tg == this_tg) - continue; - if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) - continue; - if (!throtl_hierarchy_can_upgrade(tg)) { - rcu_read_unlock(); - return false; - } - } - rcu_read_unlock(); - return true; -} - -static void throtl_upgrade_check(struct throtl_grp *tg) -{ - unsigned long now = jiffies; - - if (tg->td->limit_index != LIMIT_LOW) - return; - - if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) - return; - - tg->last_check_time = now; - - if (!time_after_eq(now, - __tg_last_low_overflow_time(tg) + tg->td->throtl_slice)) - return; - - if (throtl_can_upgrade(tg->td, NULL)) - 
throtl_upgrade_state(tg->td); -} - -static void throtl_upgrade_state(struct throtl_data *td) -{ - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; - - throtl_log(&td->service_queue, "upgrade to max"); - td->limit_index = LIMIT_MAX; - td->low_upgrade_time = jiffies; - td->scale = 0; - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - struct throtl_service_queue *sq = &tg->service_queue; - - tg->disptime = jiffies - 1; - throtl_select_dispatch(sq); - throtl_schedule_next_dispatch(sq, true); - } - rcu_read_unlock(); - throtl_select_dispatch(&td->service_queue); - throtl_schedule_next_dispatch(&td->service_queue, true); - queue_work(kthrotld_workqueue, &td->dispatch_work); -} - -static void throtl_downgrade_state(struct throtl_data *td) -{ - td->scale /= 2; - - throtl_log(&td->service_queue, "downgrade, scale %d", td->scale); - if (td->scale) { - td->low_upgrade_time = jiffies - td->scale * td->throtl_slice; - return; - } - - td->limit_index = LIMIT_LOW; - td->low_downgrade_time = jiffies; -} - -static bool throtl_tg_can_downgrade(struct throtl_grp *tg) -{ - struct throtl_data *td = tg->td; - unsigned long now = jiffies; - - /* - * If cgroup is below low limit, consider downgrade and throttle other - * cgroups - */ - if (time_after_eq(now, tg_last_low_overflow_time(tg) + - td->throtl_slice) && - (!throtl_tg_is_idle(tg) || - !list_empty(&tg_to_blkg(tg)->blkcg->css.children))) - return true; - return false; -} - -static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg) -{ - struct throtl_data *td = tg->td; - - if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice)) - return false; - - while (true) { - if (!throtl_tg_can_downgrade(tg)) - return false; - tg = sq_to_tg(tg->service_queue.parent_sq); - if (!tg || !tg_to_blkg(tg)->parent) - break; - } - return true; -} - -static void throtl_downgrade_check(struct throtl_grp *tg) -{ - uint64_t bps; - unsigned int 
iops; - unsigned long elapsed_time; - unsigned long now = jiffies; - - if (tg->td->limit_index != LIMIT_MAX || - !tg->td->limit_valid[LIMIT_LOW]) - return; - if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) - return; - if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) - return; - - elapsed_time = now - tg->last_check_time; - tg->last_check_time = now; - - if (time_before(now, tg_last_low_overflow_time(tg) + - tg->td->throtl_slice)) - return; - - if (tg->bps[READ][LIMIT_LOW]) { - bps = tg->last_bytes_disp[READ] * HZ; - do_div(bps, elapsed_time); - if (bps >= tg->bps[READ][LIMIT_LOW]) - tg->last_low_overflow_time[READ] = now; - } - - if (tg->bps[WRITE][LIMIT_LOW]) { - bps = tg->last_bytes_disp[WRITE] * HZ; - do_div(bps, elapsed_time); - if (bps >= tg->bps[WRITE][LIMIT_LOW]) - tg->last_low_overflow_time[WRITE] = now; - } - - if (tg->iops[READ][LIMIT_LOW]) { - iops = tg->last_io_disp[READ] * HZ / elapsed_time; - if (iops >= tg->iops[READ][LIMIT_LOW]) - tg->last_low_overflow_time[READ] = now; - } - - if (tg->iops[WRITE][LIMIT_LOW]) { - iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; - if (iops >= tg->iops[WRITE][LIMIT_LOW]) - tg->last_low_overflow_time[WRITE] = now; - } - - /* - * If cgroup is below low limit, consider downgrade and throttle other - * cgroups - */ - if (throtl_hierarchy_can_downgrade(tg)) - throtl_downgrade_state(tg->td); - - tg->last_bytes_disp[READ] = 0; - tg->last_bytes_disp[WRITE] = 0; - tg->last_io_disp[READ] = 0; - tg->last_io_disp[WRITE] = 0; -} - -static void blk_throtl_update_idletime(struct throtl_grp *tg) -{ - unsigned long now; - unsigned long last_finish_time = tg->last_finish_time; - - if (last_finish_time == 0) - return; - - now = blk_time_get_ns() >> 10; - if (now <= last_finish_time || - last_finish_time == tg->checked_last_finish_time) - return; - - tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3; - tg->checked_last_finish_time = last_finish_time; -} - -static void 
throtl_update_latency_buckets(struct throtl_data *td) -{ - struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; - int i, cpu, rw; - unsigned long last_latency[2] = { 0 }; - unsigned long latency[2]; - - if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW]) - return; - if (time_before(jiffies, td->last_calculate_time + HZ)) - return; - td->last_calculate_time = jiffies; - - memset(avg_latency, 0, sizeof(avg_latency)); - for (rw = READ; rw <= WRITE; rw++) { - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - struct latency_bucket *tmp = &td->tmp_buckets[rw][i]; - - for_each_possible_cpu(cpu) { - struct latency_bucket *bucket; - - /* this isn't race free, but ok in practice */ - bucket = per_cpu_ptr(td->latency_buckets[rw], - cpu); - tmp->total_latency += bucket[i].total_latency; - tmp->samples += bucket[i].samples; - bucket[i].total_latency = 0; - bucket[i].samples = 0; - } - - if (tmp->samples >= 32) { - int samples = tmp->samples; - - latency[rw] = tmp->total_latency; - - tmp->total_latency = 0; - tmp->samples = 0; - latency[rw] /= samples; - if (latency[rw] == 0) - continue; - avg_latency[rw][i].latency = latency[rw]; - } - } - } - - for (rw = READ; rw <= WRITE; rw++) { - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - if (!avg_latency[rw][i].latency) { - if (td->avg_buckets[rw][i].latency < last_latency[rw]) - td->avg_buckets[rw][i].latency = - last_latency[rw]; - continue; - } - - if (!td->avg_buckets[rw][i].valid) - latency[rw] = avg_latency[rw][i].latency; - else - latency[rw] = (td->avg_buckets[rw][i].latency * 7 + - avg_latency[rw][i].latency) >> 3; - - td->avg_buckets[rw][i].latency = max(latency[rw], - last_latency[rw]); - td->avg_buckets[rw][i].valid = true; - last_latency[rw] = td->avg_buckets[rw][i].latency; - } - } - - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) - throtl_log(&td->service_queue, - "Latency bucket %d: read latency=%ld, read valid=%d, " - "write latency=%ld, write valid=%d", i, - td->avg_buckets[READ][i].latency, - 
td->avg_buckets[READ][i].valid, - td->avg_buckets[WRITE][i].latency, - td->avg_buckets[WRITE][i].valid); -} -#else -static inline void throtl_update_latency_buckets(struct throtl_data *td) -{ -} - -static void blk_throtl_update_idletime(struct throtl_grp *tg) -{ -} - -static void throtl_downgrade_check(struct throtl_grp *tg) -{ -} - -static void throtl_upgrade_check(struct throtl_grp *tg) -{ -} - -static bool throtl_can_upgrade(struct throtl_data *td, - struct throtl_grp *this_tg) -{ - return false; -} - -static void throtl_upgrade_state(struct throtl_data *td) -{ -} -#endif - bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); @@ -2186,21 +1539,12 @@ bool __blk_throtl_bio(struct bio *bio) struct throtl_data *td = tg->td; rcu_read_lock(); - spin_lock_irq(&q->queue_lock); - - throtl_update_latency_buckets(td); - - blk_throtl_update_idletime(tg); - sq = &tg->service_queue; -again: while (true) { if (tg->last_low_overflow_time[rw] == 0) tg->last_low_overflow_time[rw] = jiffies; - throtl_downgrade_check(tg); - throtl_upgrade_check(tg); /* throtl is FIFO - if bios are already queued, should queue */ if (sq->nr_queued[rw]) break; @@ -2208,10 +1552,6 @@ again: /* if above limits, break to queue */ if (!tg_may_dispatch(tg, bio, NULL)) { tg->last_low_overflow_time[rw] = jiffies; - if (throtl_can_upgrade(td, tg)) { - throtl_upgrade_state(td); - goto again; - } break; } @@ -2271,101 +1611,12 @@ again: } out_unlock: -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - if (throttled || !td->track_bio_latency) - bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; -#endif spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); return throttled; } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -static void throtl_track_latency(struct throtl_data *td, sector_t size, - enum req_op op, unsigned long time) -{ - const bool rw = op_is_write(op); - struct latency_bucket *latency; - int index; - - if (!td || td->limit_index != LIMIT_LOW || - !(op == REQ_OP_READ || op 
== REQ_OP_WRITE) || - !blk_queue_nonrot(td->queue)) - return; - - index = request_bucket_index(size); - - latency = get_cpu_ptr(td->latency_buckets[rw]); - latency[index].total_latency += time; - latency[index].samples++; - put_cpu_ptr(td->latency_buckets[rw]); -} - -void blk_throtl_stat_add(struct request *rq, u64 time_ns) -{ - struct request_queue *q = rq->q; - struct throtl_data *td = q->td; - - throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq), - time_ns >> 10); -} - -void blk_throtl_bio_endio(struct bio *bio) -{ - struct blkcg_gq *blkg; - struct throtl_grp *tg; - u64 finish_time_ns; - unsigned long finish_time; - unsigned long start_time; - unsigned long lat; - int rw = bio_data_dir(bio); - - blkg = bio->bi_blkg; - if (!blkg) - return; - tg = blkg_to_tg(blkg); - if (!tg->td->limit_valid[LIMIT_LOW]) - return; - - finish_time_ns = blk_time_get_ns(); - tg->last_finish_time = finish_time_ns >> 10; - - start_time = bio_issue_time(&bio->bi_issue) >> 10; - finish_time = __bio_issue_time(finish_time_ns) >> 10; - if (!start_time || finish_time <= start_time) - return; - - lat = finish_time - start_time; - /* this is only for bio based driver */ - if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY)) - throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue), - bio_op(bio), lat); - - if (tg->latency_target && lat >= tg->td->filtered_latency) { - int bucket; - unsigned int threshold; - - bucket = request_bucket_index(bio_issue_size(&bio->bi_issue)); - threshold = tg->td->avg_buckets[rw][bucket].latency + - tg->latency_target; - if (lat > threshold) - tg->bad_bio_cnt++; - /* - * Not race free, could get wrong count, which means cgroups - * will be throttled - */ - tg->bio_cnt++; - } - - if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) { - tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies; - tg->bio_cnt /= 2; - tg->bad_bio_cnt /= 2; - } -} -#endif - int blk_throtl_init(struct gendisk *disk) { struct request_queue *q = 
disk->queue; @@ -2375,19 +1626,6 @@ int blk_throtl_init(struct gendisk *disk) td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; - td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) * - LATENCY_BUCKET_SIZE, __alignof__(u64)); - if (!td->latency_buckets[READ]) { - kfree(td); - return -ENOMEM; - } - td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) * - LATENCY_BUCKET_SIZE, __alignof__(u64)); - if (!td->latency_buckets[WRITE]) { - free_percpu(td->latency_buckets[READ]); - kfree(td); - return -ENOMEM; - } INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); throtl_service_queue_init(&td->service_queue); @@ -2395,18 +1633,10 @@ int blk_throtl_init(struct gendisk *disk) q->td = td; td->queue = q; - td->limit_valid[LIMIT_MAX] = true; - td->limit_index = LIMIT_MAX; - td->low_upgrade_time = jiffies; - td->low_downgrade_time = jiffies; - /* activate policy */ ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); - if (ret) { - free_percpu(td->latency_buckets[READ]); - free_percpu(td->latency_buckets[WRITE]); + if (ret) kfree(td); - } return ret; } @@ -2418,8 +1648,6 @@ void blk_throtl_exit(struct gendisk *disk) del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(disk, &blkcg_policy_throtl); - free_percpu(q->td->latency_buckets[READ]); - free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); } @@ -2427,58 +1655,18 @@ void blk_throtl_register(struct gendisk *disk) { struct request_queue *q = disk->queue; struct throtl_data *td; - int i; td = q->td; BUG_ON(!td); - if (blk_queue_nonrot(q)) { + if (blk_queue_nonrot(q)) td->throtl_slice = DFL_THROTL_SLICE_SSD; - td->filtered_latency = LATENCY_FILTERED_SSD; - } else { + else td->throtl_slice = DFL_THROTL_SLICE_HD; - td->filtered_latency = LATENCY_FILTERED_HD; - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY; - td->avg_buckets[WRITE][i].latency 
= DFL_HD_BASELINE_LATENCY; - } - } -#ifndef CONFIG_BLK_DEV_THROTTLING_LOW - /* if no low limit, use previous default */ - td->throtl_slice = DFL_THROTL_SLICE_HD; - -#else td->track_bio_latency = !queue_is_mq(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); -#endif -} - -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page) -{ - if (!q->td) - return -EINVAL; - return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice)); -} - -ssize_t blk_throtl_sample_time_store(struct request_queue *q, - const char *page, size_t count) -{ - unsigned long v; - unsigned long t; - - if (!q->td) - return -EINVAL; - if (kstrtoul(page, 10, &v)) - return -EINVAL; - t = msecs_to_jiffies(v); - if (t == 0 || t > MAX_THROTL_SLICE) - return -EINVAL; - q->td->throtl_slice = t; - return count; } -#endif static int __init throtl_init(void) { diff --git a/block/blk-throttle.h b/block/blk-throttle.h index bffbc9cfc8ab..32503fd83a84 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -58,12 +58,6 @@ enum tg_state_flags { THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */ }; -enum { - LIMIT_LOW, - LIMIT_MAX, - LIMIT_CNT, -}; - struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -102,14 +96,14 @@ struct throtl_grp { bool has_rules_iops[2]; /* internally used bytes per second rate limits */ - uint64_t bps[2][LIMIT_CNT]; + uint64_t bps[2]; /* user configured bps limits */ - uint64_t bps_conf[2][LIMIT_CNT]; + uint64_t bps_conf[2]; /* internally used IOPS limits */ - unsigned int iops[2][LIMIT_CNT]; + unsigned int iops[2]; /* user configured IOPS limits */ - unsigned int iops_conf[2][LIMIT_CNT]; + unsigned int iops_conf[2]; /* Number of bytes dispatched in current slice */ uint64_t bytes_disp[2]; @@ -132,22 +126,10 @@ struct throtl_grp { unsigned long last_check_time; - unsigned long latency_target; /* us */ - unsigned long latency_target_conf; /* us */ /* When did we start a new 
slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; - unsigned long last_finish_time; /* ns / 1024 */ - unsigned long checked_last_finish_time; /* ns / 1024 */ - unsigned long avg_idletime; /* ns / 1024 */ - unsigned long idletime_threshold; /* us */ - unsigned long idletime_threshold_conf; /* us */ - - unsigned int bio_cnt; /* total bios */ - unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ - unsigned long bio_cnt_reset_time; - struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; }; diff --git a/block/blk.h b/block/blk.h index 3870bdcd5cad..6e94c10af798 100644 --- a/block/blk.h +++ b/block/blk.h @@ -388,17 +388,6 @@ static inline void ioc_clear_queue(struct request_queue *q) } #endif /* CONFIG_BLK_ICQ */ -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); -extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, - const char *page, size_t count); -extern void blk_throtl_bio_endio(struct bio *bio); -extern void blk_throtl_stat_add(struct request *rq, u64 time); -#else -static inline void blk_throtl_bio_endio(struct bio *bio) { } -static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } -#endif - struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q); static inline bool blk_queue_may_bounce(struct request_queue *q) -- cgit v1.2.3 From a3166c51702bb00b8f8b84022090cbab8f37be1a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 9 May 2024 20:11:07 +0800 Subject: blk-throttle: delay initialization until configuration Other cgroup policy like bfq, iocost are lazy-initialized when they are configured for the first time for the device, but blk-throttle is initialized unconditionally from blkcg_init_disk(). Delay initialization of blk-throttle as well, to save some cpu and memory overhead if it's not configured. Noted that once it's initialized, it can't be destroyed until disk removal, even if it's disabled. 
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240509121107.3195568-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 6 --- block/blk-sysfs.c | 1 - block/blk-throttle.c | 114 +++++++++++++++++++++++++++++++++------------------ block/blk-throttle.h | 20 ++++++--- 4 files changed, 88 insertions(+), 53 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8598e4591e79..5e1f10525677 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1440,14 +1440,8 @@ int blkcg_init_disk(struct gendisk *disk) if (ret) goto err_destroy_all; - ret = blk_throtl_init(disk); - if (ret) - goto err_ioprio_exit; - return 0; -err_ioprio_exit: - blk_ioprio_exit(disk); err_destroy_all: blkg_destroy_all(disk); return ret; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8796c350b33d..f0f9314ab65c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -807,7 +807,6 @@ int blk_register_queue(struct gendisk *disk) blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(disk); - blk_throtl_register(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d907040859f9..80aaca18bfb0 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1211,6 +1211,53 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) } } +static int blk_throtl_init(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + struct throtl_data *td; + int ret; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); + if (!td) + return -ENOMEM; + + INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); + throtl_service_queue_init(&td->service_queue); + + /* + * Freeze queue before activating policy, to synchronize with IO path, + * which is protected by 'q_usage_counter'. 
+ */ + blk_mq_freeze_queue(disk->queue); + blk_mq_quiesce_queue(disk->queue); + + q->td = td; + td->queue = q; + + /* activate policy */ + ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); + if (ret) { + q->td = NULL; + kfree(td); + goto out; + } + + if (blk_queue_nonrot(q)) + td->throtl_slice = DFL_THROTL_SLICE_SSD; + else + td->throtl_slice = DFL_THROTL_SLICE_HD; + td->track_bio_latency = !queue_is_mq(q); + if (!td->track_bio_latency) + blk_stat_enable_accounting(q); + +out: + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue); + + return ret; +} + + static ssize_t tg_set_conf(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool is_u64) { @@ -1222,6 +1269,16 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, blkg_conf_init(&ctx, buf); + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto out_finish; + + if (!blk_throtl_activated(ctx.bdev->bd_queue)) { + ret = blk_throtl_init(ctx.bdev->bd_disk); + if (ret) + goto out_finish; + } + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; @@ -1396,6 +1453,16 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, blkg_conf_init(&ctx, buf); + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto out_finish; + + if (!blk_throtl_activated(ctx.bdev->bd_queue)) { + ret = blk_throtl_init(ctx.bdev->bd_disk); + if (ret) + goto out_finish; + } + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; @@ -1488,6 +1555,9 @@ void blk_throtl_cancel_bios(struct gendisk *disk) struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; + if (!blk_throtl_activated(q)) + return; + spin_lock_irq(&q->queue_lock); /* * queue_lock is held, rcu lock is not needed here technically. 
@@ -1617,57 +1687,19 @@ out_unlock: return throttled; } -int blk_throtl_init(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - struct throtl_data *td; - int ret; - - td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); - if (!td) - return -ENOMEM; - - INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); - throtl_service_queue_init(&td->service_queue); - - q->td = td; - td->queue = q; - - /* activate policy */ - ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); - if (ret) - kfree(td); - return ret; -} - void blk_throtl_exit(struct gendisk *disk) { struct request_queue *q = disk->queue; - BUG_ON(!q->td); + if (!blk_throtl_activated(q)) + return; + del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(disk, &blkcg_policy_throtl); kfree(q->td); } -void blk_throtl_register(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - struct throtl_data *td; - - td = q->td; - BUG_ON(!td); - - if (blk_queue_nonrot(q)) - td->throtl_slice = DFL_THROTL_SLICE_SSD; - else - td->throtl_slice = DFL_THROTL_SLICE_HD; - td->track_bio_latency = !queue_is_mq(q); - if (!td->track_bio_latency) - blk_stat_enable_accounting(q); -} - static int __init throtl_init(void) { kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 32503fd83a84..393c3d134b96 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -150,23 +150,33 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) * Internal throttling interface */ #ifndef CONFIG_BLK_DEV_THROTTLING -static inline int blk_throtl_init(struct gendisk *disk) { return 0; } static inline void blk_throtl_exit(struct gendisk *disk) { } -static inline void blk_throtl_register(struct gendisk *disk) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } static inline void blk_throtl_cancel_bios(struct gendisk *disk) { } #else /* CONFIG_BLK_DEV_THROTTLING 
*/ -int blk_throtl_init(struct gendisk *disk); void blk_throtl_exit(struct gendisk *disk); -void blk_throtl_register(struct gendisk *disk); bool __blk_throtl_bio(struct bio *bio); void blk_throtl_cancel_bios(struct gendisk *disk); +static inline bool blk_throtl_activated(struct request_queue *q) +{ + return q->td != NULL; +} + static inline bool blk_should_throtl(struct bio *bio) { - struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); + struct throtl_grp *tg; int rw = bio_data_dir(bio); + /* + * This is called under bio_queue_enter(), and it's synchronized with + * the activation of blk-throtl, which is protected by + * blk_mq_freeze_queue(). + */ + if (!blk_throtl_activated(bio->bi_bdev->bd_queue)) + return false; + + tg = blkg_to_tg(bio->bi_blkg); if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); -- cgit v1.2.3