From 3af3d772f7216cf23081bb4176e86f1219d32ebc Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Sat, 13 Mar 2021 11:01:45 +0800 Subject: block_dump: remove block_dump feature We have already delete block_dump feature in mark_inode_dirty() because it can be replaced by tracepoints, now we also remove the part in submit_bio() for the same reason. The part of block dump feature in submit_bio() dump the write process, write region and sectors on the target disk into kernel message. it can be replaced by block_bio_queue tracepoint in submit_bio_checks(), so we do not need block_dump anymore, remove the whole block_dump feature. Signed-off-by: zhangyi (F) Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210313030146.2882027-3-yi.zhang@huawei.com Signed-off-by: Jens Axboe --- block/blk-core.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 9bcdae93f6d4..689aac2625d2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1086,15 +1086,6 @@ blk_qc_t submit_bio(struct bio *bio) task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, count); } - - if (unlikely(block_dump)) { - char b[BDEVNAME_SIZE]; - printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", - current->comm, task_pid_nr(current), - op_is_write(bio_op(bio)) ? "WRITE" : "READ", - (unsigned long long)bio->bi_iter.bi_sector, - bio_devname(bio, b), count); - } } /* -- cgit v1.2.3 From b5f3352e0868611b555e1dcb2e1ffb8e346c519c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 May 2021 14:58:04 -0400 Subject: blkcg: drop CLONE_IO check in blkcg_can_attach() blkcg has always rejected to attach if any of the member tasks has shared io_context. The rationale was that io_contexts can be shared across different cgroups making it impossible to define what the appropriate control behavior should be. However, this check causes more problems than it solves: * The check prevents controller enable and migrations but not CLONE_IO itself, which can lead to surprises as the outcome changes depending on the order of operations. * Sharing within a cgroup is fine but the check can't distinguish that. This leads to unnecessary conflicts with the recent CLONE_IO usage in io_uring. io_context sharing doesn't make any difference for rq_qos based controllers and the way it's used is safe as long as tasks aren't migrated dynamically which is the vast majority of use cases. While we can try to make the check more precise to avoid false positives, the added complexity doesn't seem worthwhile. Let's just drop blkcg_can_attach(). Signed-off-by: Tejun Heo Link: https://lore.kernel.org/r/YJrTvHbrRDbJjw+S@slm.duckdns.org Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 582d2f18717e..d169e2055158 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1217,32 +1217,6 @@ void blkcg_exit_queue(struct request_queue *q) blk_throtl_exit(q); } -/* - * We cannot support shared io contexts, as we have no mean to support - * two tasks with the same ioc in two different groups without major rework - * of the main cic data structures. For now we allow a task to change - * its cgroup only if it's the only owner of its ioc. 
- */ -static int blkcg_can_attach(struct cgroup_taskset *tset) -{ - struct task_struct *task; - struct cgroup_subsys_state *dst_css; - struct io_context *ioc; - int ret = 0; - - /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, dst_css, tset) { - task_lock(task); - ioc = task->io_context; - if (ioc && atomic_read(&ioc->nr_tasks) > 1) - ret = -EINVAL; - task_unlock(task); - if (ret) - break; - } - return ret; -} - static void blkcg_bind(struct cgroup_subsys_state *root_css) { int i; @@ -1275,7 +1249,6 @@ struct cgroup_subsys io_cgrp_subsys = { .css_online = blkcg_css_online, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, - .can_attach = blkcg_can_attach, .css_rstat_flush = blkcg_rstat_flush, .bind = blkcg_bind, .dfl_cftypes = blkcg_files, -- cgit v1.2.3 From 8c390ff910c5500fc16cca6f90ac2a60c7c84979 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 May 2021 15:53:19 +0000 Subject: block: remove unneeded parenthesis from blk-sysfs Align to common code conventions. Signed-off-by: Max Gurtovoy Link: https://lore.kernel.org/r/20210511155319.1885277-1-mgurtovoy@nvidia.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e03bedf180ab..f89e2fc3963b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -91,7 +91,7 @@ static ssize_t queue_ra_show(struct request_queue *q, char *page) unsigned long ra_kb = q->backing_dev_info->ra_pages << (PAGE_SHIFT - 10); - return queue_var_show(ra_kb, (page)); + return queue_var_show(ra_kb, page); } static ssize_t @@ -112,28 +112,28 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) { int max_sectors_kb = queue_max_sectors(q) >> 1; - return queue_var_show(max_sectors_kb, (page)); + return queue_var_show(max_sectors_kb, page); } static ssize_t queue_max_segments_show(struct request_queue *q, char *page) { - return queue_var_show(queue_max_segments(q), (page)); + return queue_var_show(queue_max_segments(q), page); } static ssize_t queue_max_discard_segments_show(struct request_queue *q, char *page) { - return queue_var_show(queue_max_discard_segments(q), (page)); + return queue_var_show(queue_max_discard_segments(q), page); } static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) { - return queue_var_show(q->limits.max_integrity_segments, (page)); + return queue_var_show(q->limits.max_integrity_segments, page); } static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) { - return queue_var_show(queue_max_segment_size(q), (page)); + return queue_var_show(queue_max_segment_size(q), page); } static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) @@ -261,12 +261,12 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) { int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1; - return queue_var_show(max_hw_sectors_kb, (page)); + return queue_var_show(max_hw_sectors_kb, page); } static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page) { - return queue_var_show(q->limits.virt_boundary_mask, (page)); + return queue_var_show(q->limits.virt_boundary_mask, page); } #define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ -- cgit v1.2.3 From 84da7acc3ba53af26f15c4b0ada446127b7a7836 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 May 2021 23:22:33 +0800 Subject: block: avoid double io accounting for flush 
request For flush request, rq->end_io() may be called two times, one is from timeout handling(blk_mq_check_expired()), another is from normal completion(__blk_mq_end_request()). Move blk_account_io_flush() after flush_rq->ref drops to zero, so io accounting can be done just once for flush request. Fixes: b68663186577 ("block: add iostat counters for flush requests") Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Tested-by: John Garry Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210511152236.763464-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-flush.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 7942ca6ed321..1002f6c58181 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -219,8 +219,6 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) unsigned long flags = 0; struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); - blk_account_io_flush(flush_rq); - /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); @@ -230,6 +228,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) return; } + blk_account_io_flush(flush_rq); /* * Flush request has to be marked as IDLE when it is really ended * because its .end_io() is called from timeout code path too for -- cgit v1.2.3 From 2e315dc07df009c3e29d6926871f62a30cfae394 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 May 2021 23:22:34 +0800 Subject: blk-mq: grab rq->refcount before calling ->fn in blk_mq_tagset_busy_iter Grab rq->refcount before calling ->fn in blk_mq_tagset_busy_iter(), and this way will prevent the request from being re-used when ->fn is running. The approach is same as what we do during handling timeout. Fix request use-after-free(UAF) related with completion race or queue releasing: - If one rq is referred before rq->q is frozen, then queue won't be frozen before the request is released during iteration. - If one rq is referred after rq->q is frozen, refcount_inc_not_zero() will return false, and we won't iterate over this request. However, still one request UAF not covered: refcount_inc_not_zero() may read one freed request, and it will be handled in next patch. 
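
For readers following the series, here is a compact userspace sketch, in plain C11 atomics with invented names, of the lifetime rule this patch enforces; it is an illustration, not the kernel implementation. The iterator only touches a request after it manages to bump a still-nonzero reference count, and it drops that reference as soon as the callback returns, so a request that is concurrently being freed is simply skipped.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct req {
        atomic_int ref;                 /* 0 means the request is being freed */
        int tag;
};

static bool ref_get_not_zero(struct req *rq)
{
        int old = atomic_load(&rq->ref);

        while (old != 0) {
                if (atomic_compare_exchange_weak(&rq->ref, &old, old + 1))
                        return true;    /* reference taken, rq is safe to use */
        }
        return false;                   /* already zero: request is going away */
}

static void ref_put(struct req *rq)
{
        if (atomic_fetch_sub(&rq->ref, 1) == 1)
                free(rq);               /* last reference frees the request */
}

static void busy_iter(struct req **rqs, int nr, void (*fn)(struct req *))
{
        for (int i = 0; i < nr; i++) {
                struct req *rq = rqs[i];

                if (!rq || !ref_get_not_zero(rq))
                        continue;       /* freed or being freed: skip, no UAF */
                fn(rq);
                ref_put(rq);            /* release the reference we took */
        }
}

static void show(struct req *rq)
{
        printf("tag %d is busy\n", rq->tag);
}

int main(void)
{
        struct req *rq = malloc(sizeof(*rq));

        if (!rq)
                return 1;
        atomic_init(&rq->ref, 1);       /* owner's reference */
        rq->tag = 3;
        busy_iter(&rq, 1, show);
        ref_put(rq);                    /* owner drops its reference, rq is freed */
        return 0;
}
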
Tested-by: John Garry Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210511152236.763464-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 44 +++++++++++++++++++++++++++++++++----------- block/blk-mq.c | 14 +++++++++----- block/blk-mq.h | 1 + 3 files changed, 43 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 2a37731e8244..544edf2c56a5 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -199,6 +199,16 @@ struct bt_iter_data { bool reserved; }; +static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, + unsigned int bitnr) +{ + struct request *rq = tags->rqs[bitnr]; + + if (!rq || !refcount_inc_not_zero(&rq->ref)) + return NULL; + return rq; +} + static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; @@ -206,18 +216,22 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) struct blk_mq_tags *tags = hctx->tags; bool reserved = iter_data->reserved; struct request *rq; + bool ret = true; if (!reserved) bitnr += tags->nr_reserved_tags; - rq = tags->rqs[bitnr]; - /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ - if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx) - return iter_data->fn(hctx, rq, iter_data->data, reserved); - return true; + rq = blk_mq_find_and_get_req(tags, bitnr); + if (!rq) + return true; + + if (rq->q == hctx->queue && rq->mq_hctx == hctx) + ret = iter_data->fn(hctx, rq, iter_data->data, reserved); + blk_mq_put_rq_ref(rq); + return ret; } /** @@ -264,6 +278,8 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) struct blk_mq_tags *tags = iter_data->tags; bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED; struct request *rq; + bool ret = true; + bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS); if (!reserved) bitnr += tags->nr_reserved_tags; @@ -272,16 +288,19 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ - if (iter_data->flags & BT_TAG_ITER_STATIC_RQS) + if (iter_static_rqs) rq = tags->static_rqs[bitnr]; else - rq = tags->rqs[bitnr]; + rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; - if ((iter_data->flags & BT_TAG_ITER_STARTED) && - !blk_mq_request_started(rq)) - return true; - return iter_data->fn(rq, iter_data->data, reserved); + + if (!(iter_data->flags & BT_TAG_ITER_STARTED) || + blk_mq_request_started(rq)) + ret = iter_data->fn(rq, iter_data->data, reserved); + if (!iter_static_rqs) + blk_mq_put_rq_ref(rq); + return ret; } /** @@ -348,6 +367,9 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, * indicates whether or not @rq is a reserved request. Return * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. + * + * We grab one request reference before calling @fn and release it after + * @fn returns. 
*/ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) diff --git a/block/blk-mq.c b/block/blk-mq.c index c86c01bfecdb..debfa5cd8025 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -909,6 +909,14 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) return false; } +void blk_mq_put_rq_ref(struct request *rq) +{ + if (is_flush_rq(rq, rq->mq_hctx)) + rq->end_io(rq, 0); + else if (refcount_dec_and_test(&rq->ref)) + __blk_mq_free_request(rq); +} + static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { @@ -942,11 +950,7 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, if (blk_mq_req_expired(rq, next)) blk_mq_rq_timed_out(rq, reserved); - if (is_flush_rq(rq, hctx)) - rq->end_io(rq, 0); - else if (refcount_dec_and_test(&rq->ref)) - __blk_mq_free_request(rq); - + blk_mq_put_rq_ref(rq); return true; } diff --git a/block/blk-mq.h b/block/blk-mq.h index 9ce64bc4a6c8..556368d2c5b6 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -47,6 +47,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); +void blk_mq_put_rq_ref(struct request *rq); /* * Internal helpers for allocating/freeing the request map -- cgit v1.2.3 From bd63141d585bef14f4caf111f6d0e27fe2300ec6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 May 2021 23:22:35 +0800 Subject: blk-mq: clear stale request in tags->rq[] before freeing one request pool refcount_inc_not_zero() in bt_tags_iter() still may read one freed request. Fix the issue by the following approach: 1) hold a per-tags spinlock when reading ->rqs[tag] and calling refcount_inc_not_zero in bt_tags_iter() 2) clearing stale request referred via ->rqs[tag] before freeing request pool, the per-tags spinlock is held for clearing stale ->rq[tag] So after we cleared stale requests, bt_tags_iter() won't observe freed request any more, also the clearing will wait for pending request reference. The idea of clearing ->rqs[] is borrowed from John Garry's previous patch and one recent David's patch. 
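
A userspace sketch of the two halves of the scheme, with a pthread mutex and C11 atomics standing in for the per-tags spinlock and cmpxchg(); the names are invented and this shows only the shape of the idea, not the kernel code. Lookups dereference ->rqs[] under the lock, and the free path swaps stale slots to NULL and then uses an empty lock/unlock pair as a drain barrier before the pool is released.

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define NR_TAGS 64

struct req { int tag; };

struct tags {
        pthread_mutex_t lock;                   /* guards lookups in rqs[] */
        _Atomic(struct req *) rqs[NR_TAGS];     /* tag -> request mapping */
};

/* lookup side: only dereference a slot while holding the lock */
static struct req *find_req(struct tags *tags, unsigned int tag)
{
        struct req *rq;

        pthread_mutex_lock(&tags->lock);
        rq = atomic_load(&tags->rqs[tag]);      /* the kernel also bumps rq->ref here */
        pthread_mutex_unlock(&tags->lock);
        return rq;
}

/* free side: drop stale slots, then wait out concurrent lookups */
static void clear_req_mapping(struct tags *tags, struct req *victim)
{
        for (size_t i = 0; i < NR_TAGS; i++) {
                struct req *expected = victim;

                /* clear the slot only if it still points at the victim */
                atomic_compare_exchange_strong(&tags->rqs[i], &expected,
                                               (struct req *)NULL);
        }

        /*
         * Empty lock/unlock pair as a drain barrier: once the lock is
         * acquired, any lookup that could have read the old pointer has
         * finished, so the request pool can be freed afterwards.
         */
        pthread_mutex_lock(&tags->lock);
        pthread_mutex_unlock(&tags->lock);
}

int main(void)
{
        struct tags tags = { .lock = PTHREAD_MUTEX_INITIALIZER };
        struct req r = { .tag = 5 };

        atomic_store(&tags.rqs[5], &r);
        printf("before clear: %p\n", (void *)find_req(&tags, 5));
        clear_req_mapping(&tags, &r);
        printf("after clear:  %p\n", (void *)find_req(&tags, 5));
        return 0;
}
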
Tested-by: John Garry Reviewed-by: David Jeffery Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210511152236.763464-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 9 +++++++-- block/blk-mq-tag.h | 6 ++++++ block/blk-mq.c | 46 +++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 54 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 544edf2c56a5..1671dae43030 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -202,10 +202,14 @@ struct bt_iter_data { static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, unsigned int bitnr) { - struct request *rq = tags->rqs[bitnr]; + struct request *rq; + unsigned long flags; + spin_lock_irqsave(&tags->lock, flags); + rq = tags->rqs[bitnr]; if (!rq || !refcount_inc_not_zero(&rq->ref)) - return NULL; + rq = NULL; + spin_unlock_irqrestore(&tags->lock, flags); return rq; } @@ -538,6 +542,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; + spin_lock_init(&tags->lock); if (blk_mq_is_sbitmap_shared(flags)) return tags; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 7d3e6b333a4a..f887988e5ef6 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -20,6 +20,12 @@ struct blk_mq_tags { struct request **rqs; struct request **static_rqs; struct list_head page_list; + + /* + * used to clear request reference in rqs[] before freeing one + * request pool + */ + spinlock_t lock; }; extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, diff --git a/block/blk-mq.c b/block/blk-mq.c index debfa5cd8025..dd371f321d35 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2307,6 +2307,45 @@ queue_exit: return BLK_QC_T_NONE; } +static size_t order_to_size(unsigned int order) +{ + return (size_t)PAGE_SIZE << order; +} + +/* called before freeing request pool in @tags */ +static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, unsigned int hctx_idx) +{ + struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; + struct page *page; + unsigned long flags; + + list_for_each_entry(page, &tags->page_list, lru) { + unsigned long start = (unsigned long)page_address(page); + unsigned long end = start + order_to_size(page->private); + int i; + + for (i = 0; i < set->queue_depth; i++) { + struct request *rq = drv_tags->rqs[i]; + unsigned long rq_addr = (unsigned long)rq; + + if (rq_addr >= start && rq_addr < end) { + WARN_ON_ONCE(refcount_read(&rq->ref) != 0); + cmpxchg(&drv_tags->rqs[i], rq, NULL); + } + } + } + + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. 
+ */ + spin_lock_irqsave(&drv_tags->lock, flags); + spin_unlock_irqrestore(&drv_tags->lock, flags); +} + void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { @@ -2325,6 +2364,8 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } } + blk_mq_clear_rq_mapping(set, tags, hctx_idx); + while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); @@ -2384,11 +2425,6 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, return tags; } -static size_t order_to_size(unsigned int order) -{ - return (size_t)PAGE_SIZE << order; -} - static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, int node) { -- cgit v1.2.3 From 364b61818f65045479e42e76ed8dd6f051778280 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 May 2021 23:22:36 +0800 Subject: blk-mq: clearing flush request reference in tags->rqs[] Before we free request queue, clearing flush request reference in tags->rqs[], so that potential UAF can be avoided. Based on one patch written by David Jeffery. Tested-by: John Garry Reviewed-by: Bart Van Assche Reviewed-by: David Jeffery Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210511152236.763464-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index dd371f321d35..fbb165393790 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2643,16 +2643,49 @@ static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) &hctx->cpuhp_dead); } +/* + * Before freeing hw queue, clearing the flush request reference in + * tags->rqs[] for avoiding potential UAF. + */ +static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, + unsigned int queue_depth, struct request *flush_rq) +{ + int i; + unsigned long flags; + + /* The hw queue may not be mapped yet */ + if (!tags) + return; + + WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0); + + for (i = 0; i < queue_depth; i++) + cmpxchg(&tags->rqs[i], flush_rq, NULL); + + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. + */ + spin_lock_irqsave(&tags->lock, flags); + spin_unlock_irqrestore(&tags->lock, flags); +} + /* hctx->ctxs will be freed in queue's release handler */ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { + struct request *flush_rq = hctx->fq->flush_rq; + if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); + blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], + set->queue_depth, flush_rq); if (set->ops->exit_request) - set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); + set->ops->exit_request(set, flush_rq, hctx_idx); if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); -- cgit v1.2.3 From 56b68085e536eff2676108f2f8356889a7dbbf55 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 13 May 2021 20:00:57 +0800 Subject: blk-mq: Some tag allocation code refactoring The tag allocation code to alloc the sbitmap pairs is common for regular bitmaps tags and shared sbitmap, so refactor into a common function. Also remove superfluous "flags" argument from blk_mq_init_shared_sbitmap(). 
Signed-off-by: John Garry Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/1620907258-30910-2-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 54 +++++++++++++++++++++++++++++++++--------------------- block/blk-mq-tag.h | 9 ++++++--- block/blk-mq.c | 2 +- 3 files changed, 40 insertions(+), 25 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 1671dae43030..f597d40de10b 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -471,39 +471,54 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, node); } -static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, - int node, int alloc_policy) +int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, + struct sbitmap_queue *breserved_tags, + unsigned int queue_depth, unsigned int reserved, + int node, int alloc_policy) { - unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + unsigned int depth = queue_depth - reserved; bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node)) + if (bt_alloc(bitmap_tags, depth, round_robin, node)) return -ENOMEM; - if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags, - round_robin, node)) + if (bt_alloc(breserved_tags, reserved, round_robin, node)) goto free_bitmap_tags; + return 0; + +free_bitmap_tags: + sbitmap_queue_free(bitmap_tags); + return -ENOMEM; +} + +static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, + int node, int alloc_policy) +{ + int ret; + + ret = blk_mq_init_bitmaps(&tags->__bitmap_tags, + &tags->__breserved_tags, + tags->nr_tags, tags->nr_reserved_tags, + node, alloc_policy); + if (ret) + return ret; + tags->bitmap_tags = &tags->__bitmap_tags; tags->breserved_tags = &tags->__breserved_tags; return 0; -free_bitmap_tags: - sbitmap_queue_free(&tags->__bitmap_tags); - return -ENOMEM; } -int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags) +int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set) { - unsigned int depth = set->queue_depth - set->reserved_tags; int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); - bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - int i, node = set->numa_node; + int i, ret; - if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node)) - return -ENOMEM; - if (bt_alloc(&set->__breserved_tags, set->reserved_tags, - round_robin, node)) - goto free_bitmap_tags; + ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags, + set->queue_depth, set->reserved_tags, + set->numa_node, alloc_policy); + if (ret) + return ret; for (i = 0; i < set->nr_hw_queues; i++) { struct blk_mq_tags *tags = set->tags[i]; @@ -513,9 +528,6 @@ int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags) } return 0; -free_bitmap_tags: - sbitmap_queue_free(&set->__bitmap_tags); - return -ENOMEM; } void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set) diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index f887988e5ef6..8ed55af08427 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -32,11 +32,14 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, unsigned int flags); extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags); +extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, + struct sbitmap_queue *breserved_tags, + unsigned int queue_depth, + unsigned int reserved, + int node, int alloc_policy); -extern int 
blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, - unsigned int flags); +extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set); extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set); - extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); diff --git a/block/blk-mq.c b/block/blk-mq.c index fbb165393790..001e196bdebd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3564,7 +3564,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (blk_mq_is_sbitmap_shared(set->flags)) { atomic_set(&set->active_queues_shared_sbitmap, 0); - if (blk_mq_init_shared_sbitmap(set, set->flags)) { + if (blk_mq_init_shared_sbitmap(set)) { ret = -ENOMEM; goto out_free_mq_rq_maps; } -- cgit v1.2.3 From d97e594c51660bea510a387731637b894651e4b5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 13 May 2021 20:00:58 +0800 Subject: blk-mq: Use request queue-wide tags for tagset-wide sbitmap The tags used for an IO scheduler are currently per hctx. As such, when q->nr_hw_queues grows, so does the request queue total IO scheduler tag depth. This may cause problems for SCSI MQ HBAs whose total driver depth is fixed. Ming and Yanhui report higher CPU usage and lower throughput in scenarios where the fixed total driver tag depth is appreciably lower than the total scheduler tag depth: https://lore.kernel.org/linux-block/440dfcfc-1a2c-bd98-1161-cec4d78c6dfc@huawei.com/T/#mc0d6d4f95275a2743d1c8c3e4dc9ff6c9aa3a76b In that scenario, since the scheduler tag is got first, much contention is introduced since a driver tag may not be available after we have got the sched tag. Improve this scenario by introducing request queue-wide tags for when a tagset-wide sbitmap is used. The static sched requests are still allocated per hctx, as requests are initialised per hctx, as in blk_mq_init_request(..., hctx_idx, ...) -> set->ops->init_request(.., hctx_idx, ...). For simplicity of resizing the request queue sbitmap when updating the request queue depth, just init at the max possible size, so we don't need to deal with the possibly with swapping out a new sbitmap for old if we need to grow. 
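
A userspace sketch of the size-at-max idea, with invented names and a trivial flag array standing in for the sbitmap: the map is allocated once at the ceiling and later depth updates only move a limit, so nothing ever has to be swapped out in order to grow.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_SCHED_RQ 2048               /* stand-in for the patch's 16 * BLKDEV_MAX_RQ */

struct tag_map {
        bool *busy;                     /* allocated once at MAX_SCHED_RQ */
        unsigned int depth;             /* currently usable tags */
};

static int tag_map_init(struct tag_map *m, unsigned int depth)
{
        m->busy = calloc(MAX_SCHED_RQ, sizeof(*m->busy));
        if (!m->busy)
                return -1;
        m->depth = depth;
        return 0;
}

/* "resize" never reallocates, it only changes the visible depth */
static void tag_map_resize(struct tag_map *m, unsigned int depth)
{
        m->depth = depth <= MAX_SCHED_RQ ? depth : MAX_SCHED_RQ;
}

static int tag_get(struct tag_map *m)
{
        for (unsigned int i = 0; i < m->depth; i++) {
                if (!m->busy[i]) {
                        m->busy[i] = true;
                        return (int)i;
                }
        }
        return -1;                      /* no free tag below the current depth */
}

int main(void)
{
        struct tag_map m;

        if (tag_map_init(&m, 64))
                return 1;
        printf("got tag %d\n", tag_get(&m));
        tag_map_resize(&m, 256);        /* grow within the preallocated maximum */
        printf("got tag %d\n", tag_get(&m));
        free(m.busy);
        return 0;
}
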
Signed-off-by: John Garry Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/1620907258-30910-3-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 67 ++++++++++++++++++++++++++++++++++++++++---------- block/blk-mq-sched.h | 2 ++ block/blk-mq-tag.c | 11 ++++----- block/blk-mq.c | 13 ++++++++-- include/linux/blkdev.h | 4 +++ 5 files changed, 76 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 996a4b2f73aa..045b6878b8c5 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -509,11 +509,9 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; - if (hctx->sched_tags) { blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); - blk_mq_free_rq_map(hctx->sched_tags, flags); + blk_mq_free_rq_map(hctx->sched_tags, set->flags); hctx->sched_tags = NULL; } } @@ -523,12 +521,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, unsigned int hctx_idx) { struct blk_mq_tag_set *set = q->tag_set; - /* Clear HCTX_SHARED so tags are init'ed */ - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; int ret; hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, - set->reserved_tags, flags); + set->reserved_tags, set->flags); if (!hctx->sched_tags) return -ENOMEM; @@ -546,16 +542,50 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q) int i; queue_for_each_hw_ctx(q, hctx, i) { - /* Clear HCTX_SHARED so tags are freed */ - unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; - if (hctx->sched_tags) { - blk_mq_free_rq_map(hctx->sched_tags, flags); + blk_mq_free_rq_map(hctx->sched_tags, hctx->flags); hctx->sched_tags = NULL; } } } +static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue) +{ + struct blk_mq_tag_set *set = queue->tag_set; + int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); + struct blk_mq_hw_ctx *hctx; + int ret, i; + + /* + * Set initial depth at max so that we don't need to reallocate for + * updating nr_requests. 
+ */ + ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags, + &queue->sched_breserved_tags, + MAX_SCHED_RQ, set->reserved_tags, + set->numa_node, alloc_policy); + if (ret) + return ret; + + queue_for_each_hw_ctx(queue, hctx, i) { + hctx->sched_tags->bitmap_tags = + &queue->sched_bitmap_tags; + hctx->sched_tags->breserved_tags = + &queue->sched_breserved_tags; + } + + sbitmap_queue_resize(&queue->sched_bitmap_tags, + queue->nr_requests - set->reserved_tags); + + return 0; +} + +static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue) +{ + sbitmap_queue_free(&queue->sched_bitmap_tags); + sbitmap_queue_free(&queue->sched_breserved_tags); +} + int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { struct blk_mq_hw_ctx *hctx; @@ -580,12 +610,18 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_sched_alloc_tags(q, hctx, i); if (ret) - goto err; + goto err_free_tags; + } + + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { + ret = blk_mq_init_sched_shared_sbitmap(q); + if (ret) + goto err_free_tags; } ret = e->ops.init_sched(q, e); if (ret) - goto err; + goto err_free_sbitmap; blk_mq_debugfs_register_sched(q); @@ -605,7 +641,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return 0; -err: +err_free_sbitmap: + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) + blk_mq_exit_sched_shared_sbitmap(q); +err_free_tags: blk_mq_sched_free_requests(q); blk_mq_sched_tags_teardown(q); q->elevator = NULL; @@ -643,5 +682,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q); + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) + blk_mq_exit_sched_shared_sbitmap(q); q->elevator = NULL; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 5b18ab915c65..aff037cfd8e7 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -5,6 +5,8 @@ #include "blk-mq.h" #include "blk-mq-tag.h" +#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ) + void blk_mq_sched_assign_ioc(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index f597d40de10b..86f87346232a 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -13,6 +13,7 @@ #include #include "blk.h" #include "blk-mq.h" +#include "blk-mq-sched.h" #include "blk-mq-tag.h" /* @@ -590,8 +591,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, */ if (tdepth > tags->nr_tags) { struct blk_mq_tag_set *set = hctx->queue->tag_set; - /* Only sched tags can grow, so clear HCTX_SHARED flag */ - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; struct blk_mq_tags *new; bool ret; @@ -602,21 +601,21 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, * We need some sort of upper limit, set it high enough that * no valid use cases should require more. 
*/ - if (tdepth > 16 * BLKDEV_MAX_RQ) + if (tdepth > MAX_SCHED_RQ) return -EINVAL; new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, - tags->nr_reserved_tags, flags); + tags->nr_reserved_tags, set->flags); if (!new) return -ENOMEM; ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); if (ret) { - blk_mq_free_rq_map(new, flags); + blk_mq_free_rq_map(new, set->flags); return -ENOMEM; } blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); - blk_mq_free_rq_map(*tagsptr, flags); + blk_mq_free_rq_map(*tagsptr, set->flags); *tagsptr = new; } else { /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 001e196bdebd..f11d4018ce2e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3640,15 +3640,24 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } else { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr, true); + if (blk_mq_is_sbitmap_shared(set->flags)) { + hctx->sched_tags->bitmap_tags = + &q->sched_bitmap_tags; + hctx->sched_tags->breserved_tags = + &q->sched_breserved_tags; + } } if (ret) break; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(hctx); } - - if (!ret) + if (!ret) { q->nr_requests = nr; + if (q->elevator && blk_mq_is_sbitmap_shared(set->flags)) + sbitmap_queue_resize(&q->sched_bitmap_tags, + nr - set->reserved_tags); + } blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f69c75bd6d27..2c28577b50f4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -25,6 +25,7 @@ #include #include #include +#include struct module; struct scsi_ioctl_command; @@ -493,6 +494,9 @@ struct request_queue { atomic_t nr_active_requests_shared_sbitmap; + struct sbitmap_queue sched_bitmap_tags; + struct sbitmap_queue sched_breserved_tags; + struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); -- cgit v1.2.3 From 7c3f828b522b07adb341b08fde1660685c5ba3eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:50:51 +0200 Subject: block: refactor device number setup in __device_add_disk Untangle the mess around blk_alloc_devt by moving the check for the used allocation scheme into the callers. 
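
For orientation, a userspace sketch of the two numbering schemes the callers now pick between. The MKDEV()/MAJOR()/MINOR() arithmetic mirrors the kernel's internal 20-bit minor encoding and 259 is the blkext major, but the helper names here are invented and the blk_mangle_minor() step is left out.

#include <stdio.h>

#define MINORBITS       20
#define MINORMASK       ((1U << MINORBITS) - 1)
#define MKDEV(ma, mi)   (((ma) << MINORBITS) | (mi))
#define MAJOR(dev)      ((unsigned int)((dev) >> MINORBITS))
#define MINOR(dev)      ((unsigned int)((dev) & MINORMASK))

#define BLOCK_EXT_MAJOR 259

static unsigned int alloc_ext_minor(void)
{
        static unsigned int next;       /* stands in for the ext_devt_ida */

        return next++;
}

static unsigned int part_devt(unsigned int major, unsigned int first_minor,
                              unsigned int minors, unsigned int partno)
{
        /* in consecutive minor range? */
        if (partno < minors)
                return MKDEV(major, first_minor + partno);

        /* otherwise fall back to the extended dev_t space */
        return MKDEV(BLOCK_EXT_MAJOR, alloc_ext_minor());
}

int main(void)
{
        unsigned int devt = part_devt(8, 0, 16, 3);     /* within the minor range */

        printf("%u:%u\n", MAJOR(devt), MINOR(devt));

        devt = part_devt(8, 0, 16, 42);                 /* past minors: blkext */
        printf("%u:%u\n", MAJOR(devt), MINOR(devt));
        return 0;
}
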
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Luis Chamberlain Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 4 +-- block/genhd.c | 96 +++++++++++++++++++------------------------------ block/partitions/core.c | 15 +++++--- 3 files changed, 49 insertions(+), 66 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 8b3591aee0a5..cba3a94aabfa 100644 --- a/block/blk.h +++ b/block/blk.h @@ -343,8 +343,8 @@ static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} static inline void blk_queue_clear_zone_settings(struct request_queue *q) {} #endif -int blk_alloc_devt(struct block_device *part, dev_t *devt); -void blk_free_devt(dev_t devt); +int blk_alloc_ext_minor(void); +void blk_free_ext_minor(unsigned int minor); char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 diff --git a/block/genhd.c b/block/genhd.c index 9f8cb7beaad1..3daab80201df 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -333,52 +333,22 @@ static int blk_mangle_minor(int minor) return minor; } -/** - * blk_alloc_devt - allocate a dev_t for a block device - * @bdev: block device to allocate dev_t for - * @devt: out parameter for resulting dev_t - * - * Allocate a dev_t for block device. - * - * RETURNS: - * 0 on success, allocated dev_t is returned in *@devt. -errno on - * failure. - * - * CONTEXT: - * Might sleep. - */ -int blk_alloc_devt(struct block_device *bdev, dev_t *devt) +int blk_alloc_ext_minor(void) { - struct gendisk *disk = bdev->bd_disk; int idx; - /* in consecutive minor range? */ - if (bdev->bd_partno < disk->minors) { - *devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno); - return 0; - } - idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL); - if (idx < 0) - return idx == -ENOSPC ? -EBUSY : idx; - - *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx)); - return 0; + if (idx < 0) { + if (idx == -ENOSPC) + return -EBUSY; + return idx; + } + return blk_mangle_minor(idx); } -/** - * blk_free_devt - free a dev_t - * @devt: dev_t to free - * - * Free @devt which was allocated using blk_alloc_devt(). - * - * CONTEXT: - * Might sleep. - */ -void blk_free_devt(dev_t devt) +void blk_free_ext_minor(unsigned int minor) { - if (MAJOR(devt) == BLOCK_EXT_MAJOR) - ida_free(&ext_devt_ida, blk_mangle_minor(MINOR(devt))); + ida_free(&ext_devt_ida, blk_mangle_minor(minor)); } static char *bdevt_str(dev_t devt, char *buf) @@ -499,8 +469,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups, bool register_queue) { - dev_t devt; - int retval; + int ret; /* * The disk queue should now be all set with enough information about @@ -511,23 +480,29 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, if (register_queue) elevator_init_mq(disk->queue); - /* minors == 0 indicates to use ext devt from part0 and should - * be accompanied with EXT_DEVT flag. Make sure all - * parameters make sense. + /* + * If the driver provides an explicit major number it also must provide + * the number of minors numbers supported, and those will be used to + * setup the gendisk. + * Otherwise just allocate the device numbers for both the whole device + * and all partitions from the extended dev_t space. 
*/ - WARN_ON(disk->minors && !(disk->major || disk->first_minor)); - WARN_ON(!disk->minors && - !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); - - disk->flags |= GENHD_FL_UP; + if (disk->major) { + WARN_ON(!disk->minors); + } else { + WARN_ON(disk->minors); + WARN_ON(!(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); - retval = blk_alloc_devt(disk->part0, &devt); - if (retval) { - WARN_ON(1); - return; + ret = blk_alloc_ext_minor(); + if (ret < 0) { + WARN_ON(1); + return; + } + disk->major = BLOCK_EXT_MAJOR; + disk->first_minor = MINOR(ret); } - disk->major = MAJOR(devt); - disk->first_minor = MINOR(devt); + + disk->flags |= GENHD_FL_UP; disk_alloc_events(disk); @@ -541,14 +516,14 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, } else { struct backing_dev_info *bdi = disk->queue->backing_dev_info; struct device *dev = disk_to_dev(disk); - int ret; /* Register BDI before referencing it from bdev */ - dev->devt = devt; - ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); + dev->devt = MKDEV(disk->major, disk->first_minor); + ret = bdi_register(bdi, "%u:%u", + disk->major, disk->first_minor); WARN_ON(ret); bdi_set_owner(bdi, dev); - bdev_add(disk->part0, devt); + bdev_add(disk->part0, dev->devt); } register_disk(parent, disk, groups); if (register_queue) @@ -1120,7 +1095,8 @@ static void disk_release(struct device *dev) might_sleep(); - blk_free_devt(dev->devt); + if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) + blk_free_ext_minor(MINOR(dev->devt)); disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); diff --git a/block/partitions/core.c b/block/partitions/core.c index dc60ecf46fe6..504297bdc8bf 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -260,7 +260,8 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { - blk_free_devt(dev->devt); + if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) + blk_free_ext_minor(MINOR(dev->devt)); bdput(dev_to_bdev(dev)); } @@ -379,9 +380,15 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, pdev->type = &part_type; pdev->parent = ddev; - err = blk_alloc_devt(bdev, &devt); - if (err) - goto out_put; + /* in consecutive minor range? */ + if (bdev->bd_partno < disk->minors) { + devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno); + } else { + err = blk_alloc_ext_minor(); + if (err < 0) + goto out_put; + devt = MKDEV(BLOCK_EXT_MAJOR, err); + } pdev->devt = devt; /* delay uevent until 'holders' subdir is created */ -- cgit v1.2.3 From 2e3c73fa0c419f62fd588731be30fb0d1bca9ad6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:50:52 +0200 Subject: block: move the DISK_MAX_PARTS sanity check into __device_add_disk Keep this together with the first place that actually looks at ->minors and prepare for not passing a minors argument to alloc_disk. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Luis Chamberlain Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-3-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 3daab80201df..8c1816d2929e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -489,6 +489,12 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, */ if (disk->major) { WARN_ON(!disk->minors); + + if (disk->minors > DISK_MAX_PARTS) { + pr_err("block: can't allocate more than %d partitions\n", + DISK_MAX_PARTS); + disk->minors = DISK_MAX_PARTS; + } } else { WARN_ON(disk->minors); WARN_ON(!(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); @@ -1255,13 +1261,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) { struct gendisk *disk; - if (minors > DISK_MAX_PARTS) { - printk(KERN_ERR - "block: can't allocate more than %d partitions\n", - DISK_MAX_PARTS); - minors = DISK_MAX_PARTS; - } - disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (!disk) return NULL; -- cgit v1.2.3 From 0d1feb72ffd8578f6f167ca15b2096c276c1f6df Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:50:53 +0200 Subject: block: automatically enable GENHD_FL_EXT_DEVT Automatically set the GENHD_FL_EXT_DEVT flag for all disks allocated without an explicit number of minors. This is what all new block drivers should do, so make sure it is the default without boilerplate code. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Luis Chamberlain Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- block/partitions/core.c | 4 ---- drivers/block/n64cart.c | 2 +- drivers/lightnvm/core.c | 1 - drivers/memstick/core/ms_block.c | 1 - drivers/nvdimm/blk.c | 1 - drivers/nvdimm/btt.c | 1 - drivers/nvdimm/pmem.c | 1 - drivers/nvme/host/core.c | 1 - drivers/nvme/host/multipath.c | 1 - 10 files changed, 2 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 8c1816d2929e..9fa734cb9cbd 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -497,7 +497,6 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, } } else { WARN_ON(disk->minors); - WARN_ON(!(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); ret = blk_alloc_ext_minor(); if (ret < 0) { @@ -506,6 +505,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, } disk->major = BLOCK_EXT_MAJOR; disk->first_minor = MINOR(ret); + disk->flags |= GENHD_FL_EXT_DEVT; } disk->flags |= GENHD_FL_UP; diff --git a/block/partitions/core.c b/block/partitions/core.c index 504297bdc8bf..ada3e1e66989 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -326,10 +326,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, const char *dname; int err; - /* - * disk_max_parts() won't be zero, either GENHD_FL_EXT_DEVT is set - * or 'minors' is passed to alloc_disk(). 
- */ if (partno >= disk_max_parts(disk)) return ERR_PTR(-EINVAL); diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 47bdf324e962..3dae4b631dea 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -141,7 +141,7 @@ static int __init n64cart_probe(struct platform_device *pdev) return -ENOMEM; disk->first_minor = 0; - disk->flags = GENHD_FL_NO_PART_SCAN | GENHD_FL_EXT_DEVT; + disk->flags = GENHD_FL_NO_PART_SCAN; disk->fops = &n64cart_fops; disk->private_data = &pdev->dev; strcpy(disk->disk_name, "n64cart"); diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 40a948c08a0b..e7dc539fc0ac 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -383,7 +383,6 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) } strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name)); - tdisk->flags = GENHD_FL_EXT_DEVT; tdisk->major = 0; tdisk->first_minor = 0; tdisk->fops = tt->bops; diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 8004dd64d09a..0bacf4268f83 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -2136,7 +2136,6 @@ static int msb_init_disk(struct memstick_dev *card) msb->disk->fops = &msb_bdops; msb->disk->private_data = msb; msb->disk->queue = msb->queue; - msb->disk->flags |= GENHD_FL_EXT_DEVT; capacity = msb->pages_in_block * msb->logical_block_count; capacity *= (msb->page_size / 512); diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 7b9556291eb1..7ba446d224fb 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -267,7 +267,6 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk) disk->first_minor = 0; disk->fops = &nd_blk_fops; disk->queue = q; - disk->flags = GENHD_FL_EXT_DEVT; disk->private_data = nsblk; nvdimm_namespace_disk_name(&nsblk->common, disk->disk_name); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 18a267d5073f..1741a7b0b30f 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1537,7 +1537,6 @@ static int btt_blk_init(struct btt *btt) btt->btt_disk->fops = &btt_fops; btt->btt_disk->private_data = btt; btt->btt_disk->queue = btt->btt_queue; - btt->btt_disk->flags = GENHD_FL_EXT_DEVT; blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index ed10a8b66068..968b8483c763 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -477,7 +477,6 @@ static int pmem_attach_disk(struct device *dev, disk->fops = &pmem_fops; disk->queue = q; - disk->flags = GENHD_FL_EXT_DEVT; disk->private_data = pmem; nvdimm_namespace_disk_name(ndns, disk->disk_name); set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 762125f2905f..24bcae88587a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3699,7 +3699,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, disk->fops = &nvme_bdev_ops; disk->private_data = ns; disk->queue = ns->queue; - disk->flags = GENHD_FL_EXT_DEVT; /* * Without the multipath code enabled, multiple controller per * subsystems are visible as devices and thus we cannot use the diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index f81871c7128a..a5d02f236cca 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -462,7 +462,6 @@ int 
nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) head->disk->fops = &nvme_ns_head_ops; head->disk->private_data = head; head->disk->queue = q; - head->disk->flags = GENHD_FL_EXT_DEVT; sprintf(head->disk->disk_name, "nvme%dn%d", ctrl->subsys->instance, head->instance); return 0; -- cgit v1.2.3 From 958229a7c55f219b1cff99f939dabbc1b6ba7161 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:50:54 +0200 Subject: block: add a flag to make put_disk on partially initalized disks safer Add a flag to indicate that __device_add_disk did grab a queue reference so that disk_release only drops it if we actually had it. This sort out one of the major pitfals with partially initialized gendisk that a lot of drivers did get wrong or still do. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Luis Chamberlain Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-5-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 7 +++++-- include/linux/genhd.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 9fa734cb9cbd..c826db33a73e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -539,7 +539,10 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, * Take an extra ref on queue which will be put on disk_release() * so that it sticks around as long as @disk is there. */ - WARN_ON_ONCE(!blk_get_queue(disk->queue)); + if (blk_get_queue(disk->queue)) + set_bit(GD_QUEUE_REF, &disk->state); + else + WARN_ON_ONCE(1); disk_add_events(disk); blk_integrity_add(disk); @@ -1107,7 +1110,7 @@ static void disk_release(struct device *dev) kfree(disk->random); xa_destroy(&disk->part_tbl); bdput(disk->part0); - if (disk->queue) + if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue) blk_put_queue(disk->queue); kfree(disk); } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6fc26f7bdf71..4d3ee8b6b297 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -153,6 +153,7 @@ struct gendisk { unsigned long state; #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 +#define GD_QUEUE_REF 2 struct kobject *slave_dir; struct timer_rand_state *random; -- cgit v1.2.3 From f525464a8000f092c20b00eead3eaa9d849c599e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:50:55 +0200 Subject: block: add blk_alloc_disk and blk_cleanup_disk APIs Add two new APIs to allocate and free a gendisk including the request_queue for use with BIO based drivers. This is to avoid boilerplate code in drivers. 
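
A hedged sketch, not taken from the patch, of how a hypothetical BIO-based driver might use the new pair; the module boilerplate, the "sketch0" name and the no-op submit_bio handler are assumptions made purely for illustration.

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/numa.h>
#include <linux/string.h>

static struct gendisk *sketch_disk;

static blk_qc_t sketch_submit_bio(struct bio *bio)
{
        bio_endio(bio);                 /* complete every bio immediately */
        return BLK_QC_T_NONE;
}

static const struct block_device_operations sketch_fops = {
        .owner          = THIS_MODULE,
        .submit_bio     = sketch_submit_bio,
};

static int __init sketch_init(void)
{
        /* one call allocates the gendisk and its bio-based request_queue */
        sketch_disk = blk_alloc_disk(NUMA_NO_NODE);
        if (!sketch_disk)
                return -ENOMEM;

        strcpy(sketch_disk->disk_name, "sketch0");
        sketch_disk->fops = &sketch_fops;
        set_capacity(sketch_disk, 0);   /* no media yet in this illustration */
        add_disk(sketch_disk);
        return 0;
}

static void __exit sketch_exit(void)
{
        del_gendisk(sketch_disk);
        /* tears down the queue and drops the gendisk reference in one call */
        blk_cleanup_disk(sketch_disk);
}

module_init(sketch_init);
module_exit(sketch_exit);
MODULE_LICENSE("GPL");
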
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-6-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/genhd.h | 22 ++++++++++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index c826db33a73e..efe0db4d62f0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1293,6 +1293,25 @@ out_free_disk: } EXPORT_SYMBOL(__alloc_disk_node); +struct gendisk *__blk_alloc_disk(int node) +{ + struct request_queue *q; + struct gendisk *disk; + + q = blk_alloc_queue(node); + if (!q) + return NULL; + + disk = __alloc_disk_node(0, node); + if (!disk) { + blk_cleanup_queue(q); + return NULL; + } + disk->queue = q; + return disk; +} +EXPORT_SYMBOL(__blk_alloc_disk); + /** * put_disk - decrements the gendisk refcount * @disk: the struct gendisk to decrement the refcount for @@ -1310,6 +1329,22 @@ void put_disk(struct gendisk *disk) } EXPORT_SYMBOL(put_disk); +/** + * blk_cleanup_disk - shutdown a gendisk allocated by blk_alloc_disk + * @disk: gendisk to shutdown + * + * Mark the queue hanging off @disk DYING, drain all pending requests, then mark + * the queue DEAD, destroy and put it and the gendisk structure. + * + * Context: can sleep + */ +void blk_cleanup_disk(struct gendisk *disk) +{ + blk_cleanup_queue(disk->queue); + put_disk(disk); +} +EXPORT_SYMBOL(blk_cleanup_disk); + static void set_disk_ro_uevent(struct gendisk *gd, int ro) { char event[] = "DISK_RO=1"; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 4d3ee8b6b297..782f0171d104 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -278,6 +278,28 @@ extern void put_disk(struct gendisk *disk); #define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) +/** + * blk_alloc_disk - allocate a gendisk structure + * @node_id: numa node to allocate on + * + * Allocate and pre-initialize a gendisk structure for use with BIO based + * drivers. + * + * Context: can sleep + */ +#define blk_alloc_disk(node_id) \ +({ \ + struct gendisk *__disk = __blk_alloc_disk(node_id); \ + static struct lock_class_key __key; \ + \ + if (__disk) \ + lockdep_init_map(&__disk->lockdep_map, \ + "(bio completion)", &__key, 0); \ + __disk; \ +}) +struct gendisk *__blk_alloc_disk(int node); +void blk_cleanup_disk(struct gendisk *disk); + int __register_blkdev(unsigned int major, const char *name, void (*probe)(dev_t devt)); #define register_blkdev(major, name) \ -- cgit v1.2.3 From da7ba72960ca2a9b968e47fcf414d16f3d4c0c42 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:16 +0200 Subject: block: unexport blk_alloc_queue blk_alloc_queue is just an internal helper now, unexport it and remove it from the public header. 
Signed-off-by: Christoph Hellwig Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-27-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - block/blk.h | 2 ++ include/linux/blkdev.h | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 689aac2625d2..3515a66022d7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -599,7 +599,6 @@ fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; } -EXPORT_SYMBOL(blk_alloc_queue); /** * blk_get_queue - increment the request_queue refcount diff --git a/block/blk.h b/block/blk.h index cba3a94aabfa..3440142f029b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -359,4 +359,6 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); +struct request_queue *blk_alloc_queue(int node_id); + #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2c28577b50f4..d66d0da72529 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1213,7 +1213,6 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, extern void blk_dump_rq_flags(struct request *, char *); bool __must_check blk_get_queue(struct request_queue *); -struct request_queue *blk_alloc_queue(int node_id); extern void blk_put_queue(struct request_queue *); extern void blk_set_queue_dying(struct request_queue *); -- cgit v1.2.3 From a8698707a1835be3abd12a3b28079a80999f8dee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 May 2021 08:12:56 +0200 Subject: block: move bd_mutex to struct gendisk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the per-block device bd_mutex with a per-gendisk open_mutex, thus simplifying locking wherever we deal with partitions. 
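
A minimal userspace model of the new locking shape, with a pthread mutex standing in for open_mutex and invented structures: every opener, whether of the whole disk or of a partition, serializes on the single per-gendisk lock, which is what lets the nested bd_mutex acquisitions in the partition code below go away.

#include <pthread.h>
#include <stdio.h>

struct gendisk {
        pthread_mutex_t open_mutex;     /* one lock for the disk and all partitions */
        int openers;
};

struct block_device {
        struct gendisk *bd_disk;
        int bd_openers;
};

static void blkdev_open(struct block_device *part)
{
        /* every opener, whole disk or partition, takes the same lock */
        pthread_mutex_lock(&part->bd_disk->open_mutex);
        part->bd_openers++;
        part->bd_disk->openers++;
        pthread_mutex_unlock(&part->bd_disk->open_mutex);
}

int main(void)
{
        struct gendisk disk = { .open_mutex = PTHREAD_MUTEX_INITIALIZER };
        struct block_device part0 = { .bd_disk = &disk };
        struct block_device part1 = { .bd_disk = &disk };

        blkdev_open(&part0);
        blkdev_open(&part1);
        printf("disk openers: %d\n", disk.openers);
        return 0;
}
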
Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Acked-by: Roger Pau Monné Link: https://lore.kernel.org/r/20210525061301.2242282-4-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/filesystems/locking.rst | 2 +- block/genhd.c | 7 ++++--- block/partitions/core.c | 24 ++++++++++------------- drivers/block/loop.c | 14 ++++++------- drivers/block/xen-blkfront.c | 8 ++++---- drivers/block/zram/zram_drv.c | 18 ++++++++--------- drivers/block/zram/zram_drv.h | 2 +- drivers/md/md.h | 6 +++--- drivers/s390/block/dasd_genhd.c | 8 ++++---- drivers/scsi/sd.c | 4 ++-- fs/block_dev.c | 37 ++++++++++++++--------------------- fs/btrfs/volumes.c | 2 +- fs/super.c | 8 ++++---- include/linux/blk_types.h | 1 - include/linux/genhd.h | 3 +++ 15 files changed, 68 insertions(+), 76 deletions(-) (limited to 'block') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 1e894480115b..2183fd8cc350 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -480,7 +480,7 @@ prototypes:: locking rules: ======================= =================== -ops bd_mutex +ops open_mutex ======================= =================== open: yes release: yes diff --git a/block/genhd.c b/block/genhd.c index efe0db4d62f0..38d136a19484 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -591,10 +591,10 @@ void del_gendisk(struct gendisk *disk) blk_integrity_del(disk); disk_del_events(disk); - mutex_lock(&disk->part0->bd_mutex); + mutex_lock(&disk->open_mutex); disk->flags &= ~GENHD_FL_UP; blk_drop_partitions(disk); - mutex_unlock(&disk->part0->bd_mutex); + mutex_unlock(&disk->open_mutex); fsync_bdev(disk->part0); __invalidate_device(disk->part0, true); @@ -1273,6 +1273,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) goto out_free_disk; disk->node_id = node_id; + mutex_init(&disk->open_mutex); xa_init(&disk->part_tbl); if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; @@ -1525,7 +1526,7 @@ void disk_unblock_events(struct gendisk *disk) * doesn't clear the events from @disk->ev. * * CONTEXT: - * If @mask is non-zero must be called with bdev->bd_mutex held. + * If @mask is non-zero must be called with disk->open_mutex held. */ void disk_flush_events(struct gendisk *disk, unsigned int mask) { diff --git a/block/partitions/core.c b/block/partitions/core.c index ada3e1e66989..4fde8e0dd7cd 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -283,7 +283,7 @@ struct device_type part_type = { }; /* - * Must be called either with bd_mutex held, before a disk can be opened or + * Must be called either with open_mutex held, before a disk can be opened or * after all disk users are gone. */ static void delete_partition(struct block_device *part) @@ -312,7 +312,7 @@ static ssize_t whole_disk_show(struct device *dev, static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); /* - * Must be called either with bd_mutex held, before a disk can be opened or + * Must be called either with open_mutex held, before a disk can be opened or * after all disk users are gone. 
*/ static struct block_device *add_partition(struct gendisk *disk, int partno, @@ -453,15 +453,15 @@ int bdev_add_partition(struct block_device *bdev, int partno, { struct block_device *part; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); if (partition_overlaps(bdev->bd_disk, start, length, -1)) { - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); return -EBUSY; } part = add_partition(bdev->bd_disk, partno, start, length, ADDPART_FLAG_NONE, NULL); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); return PTR_ERR_OR_ZERO(part); } @@ -474,8 +474,7 @@ int bdev_del_partition(struct block_device *bdev, int partno) if (!part) return -ENXIO; - mutex_lock(&part->bd_mutex); - mutex_lock_nested(&bdev->bd_mutex, 1); + mutex_lock(&bdev->bd_disk->open_mutex); ret = -EBUSY; if (part->bd_openers) @@ -484,8 +483,7 @@ int bdev_del_partition(struct block_device *bdev, int partno) delete_partition(part); ret = 0; out_unlock: - mutex_unlock(&bdev->bd_mutex); - mutex_unlock(&part->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); bdput(part); return ret; } @@ -500,8 +498,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno, if (!part) return -ENXIO; - mutex_lock(&part->bd_mutex); - mutex_lock_nested(&bdev->bd_mutex, 1); + mutex_lock(&bdev->bd_disk->open_mutex); ret = -EINVAL; if (start != part->bd_start_sect) goto out_unlock; @@ -514,8 +511,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno, ret = 0; out_unlock: - mutex_unlock(&part->bd_mutex); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); bdput(part); return ret; } @@ -541,7 +537,7 @@ void blk_drop_partitions(struct gendisk *disk) struct block_device *part; unsigned long idx; - lockdep_assert_held(&disk->part0->bd_mutex); + lockdep_assert_held(&disk->open_mutex); xa_for_each_start(&disk->part_tbl, idx, part, 1) { if (!bdgrab(part)) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d58d68f3c7cd..95c570f5923f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -652,9 +652,9 @@ static void loop_reread_partitions(struct loop_device *lo, { int rc; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); rc = bdev_disk_changed(bdev, false); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); if (rc) pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", __func__, lo->lo_number, lo->lo_file_name, rc); @@ -747,7 +747,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, mutex_unlock(&lo->lo_mutex); /* * We must drop file reference outside of lo_mutex as dropping - * the file ref can take bd_mutex which creates circular locking + * the file ref can take open_mutex which creates circular locking * dependency. */ fput(old_file); @@ -1260,7 +1260,7 @@ out_unlock: mutex_unlock(&lo->lo_mutex); if (partscan) { /* - * bd_mutex has been held already in release path, so don't + * open_mutex has been held already in release path, so don't * acquire it if this function is called in such case. * * If the reread partition isn't from release path, lo_refcnt @@ -1268,10 +1268,10 @@ out_unlock: * current holder is released. 
*/ if (!release) - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); err = bdev_disk_changed(bdev, false); if (!release) - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", __func__, lo_number, err); @@ -1298,7 +1298,7 @@ out_unlock: /* * Need not hold lo_mutex to fput backing file. Calling fput holding * lo_mutex triggers a circular lock dependency possibility warning as - * fput can take bd_mutex which is usually taken before lo_mutex. + * fput can take open_mutex which is usually taken before lo_mutex. */ if (filp) fput(filp); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 10df39a8b18d..f2c1aedcdf5a 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2163,7 +2163,7 @@ static void blkfront_closing(struct blkfront_info *info) return; } - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); if (bdev->bd_openers) { xenbus_dev_error(xbdev, -EBUSY, @@ -2174,7 +2174,7 @@ static void blkfront_closing(struct blkfront_info *info) xenbus_frontend_closed(xbdev); } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); bdput(bdev); } @@ -2531,7 +2531,7 @@ static int blkfront_remove(struct xenbus_device *xbdev) * isn't closed yet, we let release take care of it. */ - mutex_lock(&bdev->bd_mutex); + mutex_lock(&disk->open_mutex); info = disk->private_data; dev_warn(disk_to_dev(disk), @@ -2546,7 +2546,7 @@ static int blkfront_remove(struct xenbus_device *xbdev) mutex_unlock(&blkfront_mutex); } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&disk->open_mutex); bdput(bdev); return 0; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 006416cc4969..fcaf2750f68f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1781,24 +1781,24 @@ static ssize_t reset_store(struct device *dev, zram = dev_to_zram(dev); bdev = zram->disk->part0; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); /* Do not reset an active device or claimed device */ if (bdev->bd_openers || zram->claim) { - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); return -EBUSY; } /* From now on, anyone can't open /dev/zram[0-9] */ zram->claim = true; - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); zram->claim = false; - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); return len; } @@ -1808,7 +1808,7 @@ static int zram_open(struct block_device *bdev, fmode_t mode) int ret = 0; struct zram *zram; - WARN_ON(!mutex_is_locked(&bdev->bd_mutex)); + WARN_ON(!mutex_is_locked(&bdev->bd_disk->open_mutex)); zram = bdev->bd_disk->private_data; /* zram was claimed to reset so open request fails */ @@ -1972,14 +1972,14 @@ static int zram_remove(struct zram *zram) { struct block_device *bdev = zram->disk->part0; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); if (bdev->bd_openers || zram->claim) { - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); return -EBUSY; } zram->claim = true; - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); zram_debugfs_unregister(zram); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 
419a7e8281ee..74c411911b6e 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -112,7 +112,7 @@ struct zram { /* * zram is claimed so open request will be failed */ - bool claim; /* Protected by bdev->bd_mutex */ + bool claim; /* Protected by disk->open_mutex */ struct file *backing_dev; #ifdef CONFIG_ZRAM_WRITEBACK spinlock_t wb_limit_lock; diff --git a/drivers/md/md.h b/drivers/md/md.h index fb7eab58cfd5..a88086d4110c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -395,10 +395,10 @@ struct mddev { * that we are never stopping an array while it is open. * 'reconfig_mutex' protects all other reconfiguration. * These locks are separate due to conflicting interactions - * with bdev->bd_mutex. + * with disk->open_mutex. * Lock ordering is: - * reconfig_mutex -> bd_mutex - * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open + * reconfig_mutex -> disk->open_mutex + * disk->open_mutex -> open_mutex: e.g. __blkdev_get -> md_open */ struct mutex open_mutex; struct mutex reconfig_mutex; diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 8d6587ec73e2..bf2082d461c7 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -109,9 +109,9 @@ int dasd_scan_partitions(struct dasd_block *block) return -ENODEV; } - mutex_lock(&bdev->bd_mutex); + mutex_lock(&block->gdp->open_mutex); rc = bdev_disk_changed(bdev, false); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&block->gdp->open_mutex); if (rc) DBF_DEV_EVENT(DBF_ERR, block->base, "scan partitions error, rc %d", rc); @@ -145,9 +145,9 @@ void dasd_destroy_partitions(struct dasd_block *block) bdev = block->bdev; block->bdev = NULL; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); bdev_disk_changed(bdev, true); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); /* Matching blkdev_put to the blkdev_get in dasd_scan_partitions. */ blkdev_put(bdev, FMODE_READ); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index cb3c37d1e009..d3ff723af879 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1400,7 +1400,7 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt) * In the latter case @inode and @filp carry an abridged amount * of information as noted above. * - * Locking: called with bdev->bd_mutex held. + * Locking: called with bdev->bd_disk->open_mutex held. **/ static int sd_open(struct block_device *bdev, fmode_t mode) { @@ -1476,7 +1476,7 @@ error_out: * Note: may block (uninterruptible) if error recovery is underway * on this disk. * - * Locking: called with bdev->bd_mutex held. + * Locking: called with bdev->bd_disk->open_mutex held. 
**/ static void sd_release(struct gendisk *disk, fmode_t mode) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 41d2d9708bf8..e094806c3a0c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -895,7 +895,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) mapping_set_gfp_mask(&inode->i_data, GFP_USER); bdev = I_BDEV(inode); - mutex_init(&bdev->bd_mutex); mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); bdev->bd_disk = disk; @@ -1154,7 +1153,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) struct bd_holder_disk *holder; int ret = 0; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); WARN_ON_ONCE(!bdev->bd_holder); @@ -1199,7 +1198,7 @@ out_del: out_free: kfree(holder); out_unlock: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); @@ -1218,7 +1217,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); holder = bd_find_holder_disk(bdev, disk); @@ -1230,7 +1229,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) kfree(holder); } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif @@ -1242,7 +1241,7 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) struct gendisk *disk = bdev->bd_disk; int ret = 0; - lockdep_assert_held(&bdev->bd_mutex); + lockdep_assert_held(&disk->open_mutex); if (!(disk->flags & GENHD_FL_UP)) return -ENXIO; @@ -1327,14 +1326,10 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) goto done; whole = bdgrab(disk->part0); - mutex_lock_nested(&whole->bd_mutex, 1); ret = blkdev_get_whole(whole, mode); - if (ret) { - mutex_unlock(&whole->bd_mutex); + if (ret) goto out_put_whole; - } whole->bd_part_count++; - mutex_unlock(&whole->bd_mutex); ret = -ENXIO; if (!bdev_nr_sectors(part)) @@ -1437,7 +1432,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) disk_block_events(disk); - mutex_lock(&bdev->bd_mutex); + mutex_lock(&disk->open_mutex); ret = -ENXIO; if (!(disk->flags & GENHD_FL_UP)) goto abort_claiming; @@ -1463,7 +1458,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) unblock_events = false; } } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&disk->open_mutex); if (unblock_events) disk_unblock_events(disk); @@ -1472,7 +1467,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) abort_claiming: if (mode & FMODE_EXCL) bd_abort_claiming(bdev, holder); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&disk->open_mutex); disk_unblock_events(disk); put_blkdev: blkdev_put_no_open(bdev); @@ -1552,7 +1547,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) struct gendisk *disk = bdev->bd_disk; struct block_device *victim = NULL; - mutex_lock_nested(&bdev->bd_mutex, for_part); if (for_part) bdev->bd_part_count--; @@ -1567,7 +1561,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) if (!bdev_is_partition(bdev) && disk->fops->release) disk->fops->release(disk, mode); - mutex_unlock(&bdev->bd_mutex); if (victim) { __blkdev_put(victim, mode, 1); bdput(victim); @@ -1588,15 +1581,14 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) if (bdev->bd_openers == 1) sync_blockdev(bdev); - 
mutex_lock(&bdev->bd_mutex); - + mutex_lock(&disk->open_mutex); if (mode & FMODE_EXCL) { struct block_device *whole = bdev_whole(bdev); bool bdev_free; /* * Release a claim on the device. The holder fields - * are protected with bdev_lock. bd_mutex is to + * are protected with bdev_lock. open_mutex is to * synchronize disk_holder unlinking. */ spin_lock(&bdev_lock); @@ -1627,9 +1619,10 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) * from userland - e.g. eject(1). */ disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); - mutex_unlock(&bdev->bd_mutex); __blkdev_put(bdev, mode, 0); + mutex_unlock(&disk->open_mutex); + blkdev_put_no_open(bdev); } EXPORT_SYMBOL(blkdev_put); @@ -1936,10 +1929,10 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) old_inode = inode; bdev = I_BDEV(inode); - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); if (bdev->bd_openers) func(bdev, arg); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 47d27059d064..f246eb2772e9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1247,7 +1247,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, lockdep_assert_held(&uuid_mutex); /* * The device_list_mutex cannot be taken here in case opening the - * underlying device takes further locks like bd_mutex. + * underlying device takes further locks like open_mutex. * * We also don't need the lock here as this is called during mount and * exclusion is provided by uuid_mutex diff --git a/fs/super.c b/fs/super.c index 11b7e7213fd1..91b7f156735b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1277,9 +1277,9 @@ int get_tree_bdev(struct fs_context *fc, } /* - * s_umount nests inside bd_mutex during + * s_umount nests inside open_mutex during * __invalidate_device(). blkdev_put() acquires - * bd_mutex and can't be called under s_umount. Drop + * open_mutex and can't be called under s_umount. Drop * s_umount temporarily. This is safe as we're * holding an active reference. */ @@ -1352,9 +1352,9 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, } /* - * s_umount nests inside bd_mutex during + * s_umount nests inside open_mutex during * __invalidate_device(). blkdev_put() acquires - * bd_mutex and can't be called under s_umount. Drop + * open_mutex and can't be called under s_umount. Drop * s_umount temporarily. This is safe as we're * holding an active reference. 
*/ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index db026b6ec15a..a09660671fa4 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -29,7 +29,6 @@ struct block_device { int bd_openers; struct inode * bd_inode; /* will die */ struct super_block * bd_super; - struct mutex bd_mutex; /* open/close mutex */ void * bd_claiming; struct device bd_device; void * bd_holder; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 782f0171d104..1fabb1559110 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -154,6 +154,9 @@ struct gendisk { #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 #define GD_QUEUE_REF 2 + + struct mutex open_mutex; /* open/close mutex */ + struct kobject *slave_dir; struct timer_rand_state *random; -- cgit v1.2.3 From ab4b57057d744861f670b47b163209727b26418b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 May 2021 08:12:59 +0200 Subject: block: move bd_part_count to struct gendisk The bd_part_count value only makes sense for whole devices, so move it to struct gendisk and give it a more descriptive name. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210525061301.2242282-7-hch@lst.de Signed-off-by: Jens Axboe --- block/ioctl.c | 2 +- fs/block_dev.c | 6 +++--- include/linux/blk_types.h | 3 --- include/linux/genhd.h | 1 + 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 8ba1ed8defd0..24beec9ca9c9 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -89,7 +89,7 @@ static int blkdev_reread_part(struct block_device *bdev, fmode_t mode) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (bdev->bd_part_count) + if (bdev->bd_disk->open_partitions) return -EBUSY; /* diff --git a/fs/block_dev.c b/fs/block_dev.c index cd45b54e86b4..ac9b3c158a77 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1253,7 +1253,7 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) return -ENXIO; rescan: - if (bdev->bd_part_count) + if (disk->open_partitions) return -EBUSY; sync_blockdev(bdev); invalidate_bdev(bdev); @@ -1348,7 +1348,7 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) if (!bdev_nr_sectors(part)) goto out_blkdev_put; - whole->bd_part_count++; + disk->open_partitions++; set_init_blocksize(part); if (part->bd_bdi == &noop_backing_dev_info) part->bd_bdi = bdi_get(disk->queue->backing_dev_info); @@ -1370,7 +1370,7 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode) if (--part->bd_openers) return; blkdev_flush_mapping(part); - whole->bd_part_count--; + whole->bd_disk->open_partitions--; blkdev_put_whole(whole, mode); bdput(whole); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a09660671fa4..fd3860d18d7e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -39,9 +39,6 @@ struct block_device { #endif struct kobject *bd_holder_dir; u8 bd_partno; - /* number of times partitions within this device have been opened. 
*/ - unsigned bd_part_count; - spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ struct gendisk * bd_disk; struct backing_dev_info *bd_bdi; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 1fabb1559110..47d4605c0e7e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -156,6 +156,7 @@ struct gendisk { #define GD_QUEUE_REF 2 struct mutex open_mutex; /* open/close mutex */ + unsigned open_partitions; /* number of open partitions */ struct kobject *slave_dir; -- cgit v1.2.3 From c97d93c31e5734a16bfe663085ec91b8c9fb20f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 May 2021 08:13:00 +0200 Subject: block: factor out a part_devt helper Add a helper to find the dev_t for a disk + partno tuple. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210525061301.2242282-8-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 25 +++++++++++++++++-------- include/linux/genhd.h | 1 + init/do_mounts.c | 10 ++-------- 3 files changed, 20 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 38d136a19484..3f7b1c92c7f3 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1227,6 +1227,19 @@ static int __init proc_genhd_init(void) module_init(proc_genhd_init); #endif /* CONFIG_PROC_FS */ +dev_t part_devt(struct gendisk *disk, u8 partno) +{ + struct block_device *part = bdget_disk(disk, partno); + dev_t devt = 0; + + if (part) { + devt = part->bd_dev; + bdput(part); + } + + return devt; +} + dev_t blk_lookup_devt(const char *name, int partno) { dev_t devt = MKDEV(0, 0); @@ -1236,7 +1249,6 @@ dev_t blk_lookup_devt(const char *name, int partno) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); - struct block_device *part; if (strcmp(dev_name(dev), name)) continue; @@ -1247,13 +1259,10 @@ dev_t blk_lookup_devt(const char *name, int partno) */ devt = MKDEV(MAJOR(dev->devt), MINOR(dev->devt) + partno); - break; - } - part = bdget_disk(disk, partno); - if (part) { - devt = part->bd_dev; - bdput(part); - break; + } else { + devt = part_devt(disk, partno); + if (devt) + break; } } class_dev_iter_exit(&iter); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 47d4605c0e7e..64a8431202b7 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -333,6 +333,7 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, } #endif /* CONFIG_SYSFS */ +dev_t part_devt(struct gendisk *disk, u8 partno); dev_t blk_lookup_devt(const char *name, int partno); void blk_request_module(dev_t devt); #ifdef CONFIG_BLOCK diff --git a/init/do_mounts.c b/init/do_mounts.c index a78e44ee6adb..74aede860de7 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -133,14 +133,8 @@ static dev_t devt_from_partuuid(const char *uuid_str) * Attempt to find the requested partition by adding an offset * to the partition number found by UUID. */ - struct block_device *part; - - part = bdget_disk(dev_to_disk(dev), - dev_to_bdev(dev)->bd_partno + offset); - if (part) { - devt = part->bd_dev; - bdput(part); - } + devt = part_devt(dev_to_disk(dev), + dev_to_bdev(dev)->bd_partno + offset); } else { devt = dev->devt; } -- cgit v1.2.3 From 0e0ccdecb3cff95a350b4364e7ebbaa754d0e47d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 May 2021 08:13:01 +0200 Subject: block: remove bdget_disk Just opencode the xa_load in the callers, as none of them actually needs a reference to the bdev. 
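As an illustration of the caller-side pattern this enables (hypothetical function, not part of the patch): with disk->open_mutex held the partition cannot be freed, so a plain xa_load() lookup is sufficient and no bdgrab()/bdput() pair is needed:

static bool example_partition_in_use(struct gendisk *disk, int partno)
{
	struct block_device *part;
	bool in_use = false;

	mutex_lock(&disk->open_mutex);
	part = xa_load(&disk->part_tbl, partno);
	if (part)
		in_use = part->bd_openers > 0;
	mutex_unlock(&disk->open_mutex);
	return in_use;
}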
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210525061301.2242282-9-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 35 +++++------------------------------ block/partitions/core.c | 25 ++++++++++++------------- include/linux/genhd.h | 1 - 3 files changed, 17 insertions(+), 44 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 3f7b1c92c7f3..5f5628216295 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -676,32 +676,6 @@ void blk_request_module(dev_t devt) request_module("block-major-%d", MAJOR(devt)); } -/** - * bdget_disk - do bdget() by gendisk and partition number - * @disk: gendisk of interest - * @partno: partition number - * - * Find partition @partno from @disk, do bdget() on it. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * Resulting block_device on success, NULL on failure. - */ -struct block_device *bdget_disk(struct gendisk *disk, int partno) -{ - struct block_device *bdev = NULL; - - rcu_read_lock(); - bdev = xa_load(&disk->part_tbl, partno); - if (bdev && !bdgrab(bdev)) - bdev = NULL; - rcu_read_unlock(); - - return bdev; -} - /* * print a full list of all partitions - intended for places where the root * filesystem can't be mounted and thus to give the victim some idea of what @@ -1229,13 +1203,14 @@ module_init(proc_genhd_init); dev_t part_devt(struct gendisk *disk, u8 partno) { - struct block_device *part = bdget_disk(disk, partno); + struct block_device *part; dev_t devt = 0; - if (part) { + rcu_read_lock(); + part = xa_load(&disk->part_tbl, partno); + if (part) devt = part->bd_dev; - bdput(part); - } + rcu_read_unlock(); return devt; } diff --git a/block/partitions/core.c b/block/partitions/core.c index 4fde8e0dd7cd..186d4fbd9f09 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -326,6 +326,8 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, const char *dname; int err; + lockdep_assert_held(&disk->open_mutex); + if (partno >= disk_max_parts(disk)) return ERR_PTR(-EINVAL); @@ -467,14 +469,13 @@ int bdev_add_partition(struct block_device *bdev, int partno, int bdev_del_partition(struct block_device *bdev, int partno) { - struct block_device *part; - int ret; - - part = bdget_disk(bdev->bd_disk, partno); - if (!part) - return -ENXIO; + struct block_device *part = NULL; + int ret = -ENXIO; mutex_lock(&bdev->bd_disk->open_mutex); + part = xa_load(&bdev->bd_disk->part_tbl, partno); + if (!part) + goto out_unlock; ret = -EBUSY; if (part->bd_openers) @@ -484,21 +485,20 @@ int bdev_del_partition(struct block_device *bdev, int partno) ret = 0; out_unlock: mutex_unlock(&bdev->bd_disk->open_mutex); - bdput(part); return ret; } int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length) { - struct block_device *part; - int ret = 0; + struct block_device *part = NULL; + int ret = -ENXIO; - part = bdget_disk(bdev->bd_disk, partno); + mutex_lock(&bdev->bd_disk->open_mutex); + part = xa_load(&bdev->bd_disk->part_tbl, partno); if (!part) - return -ENXIO; + goto out_unlock; - mutex_lock(&bdev->bd_disk->open_mutex); ret = -EINVAL; if (start != part->bd_start_sect) goto out_unlock; @@ -512,7 +512,6 @@ int bdev_resize_partition(struct block_device *bdev, int partno, ret = 0; out_unlock: mutex_unlock(&bdev->bd_disk->open_mutex); - bdput(part); return ret; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 64a8431202b7..03d684f0498f 100644 --- a/include/linux/genhd.h +++ 
b/include/linux/genhd.h @@ -223,7 +223,6 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk) } extern void del_gendisk(struct gendisk *gp); -extern struct block_device *bdget_disk(struct gendisk *disk, int partno); void set_disk_ro(struct gendisk *disk, bool read_only); -- cgit v1.2.3 From 613471549f366cdf4170b81ce0f99f3867ec4d16 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Jun 2021 12:47:21 +0200 Subject: block: Do not pull requests from the scheduler when we cannot dispatch them Provided the device driver does not implement dispatch budget accounting (which only SCSI does) the loop in __blk_mq_do_dispatch_sched() pulls requests from the IO scheduler as long as it is willing to give out any. That defeats scheduling heuristics inside the scheduler by creating false impression that the device can take more IO when it in fact cannot. For example with BFQ IO scheduler on top of virtio-blk device setting blkio cgroup weight has barely any impact on observed throughput of async IO because __blk_mq_do_dispatch_sched() always sucks out all the IO queued in BFQ. BFQ first submits IO from higher weight cgroups but when that is all dispatched, it will give out IO of lower weight cgroups as well. And then we have to wait for all this IO to be dispatched to the disk (which means lot of it actually has to complete) before the IO scheduler is queried again for dispatching more requests. This completely destroys any service differentiation. So grab request tag for a request pulled out of the IO scheduler already in __blk_mq_do_dispatch_sched() and do not pull any more requests if we cannot get it because we are unlikely to be able to dispatch it. That way only single request is going to wait in the dispatch list for some tag to free. Reviewed-by: Ming Lei Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210603104721.6309-1-jack@suse.cz Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 12 +++++++++++- block/blk-mq.c | 2 +- block/blk-mq.h | 2 ++ 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 045b6878b8c5..a9182d2f8ad3 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -168,9 +168,19 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) * in blk_mq_dispatch_rq_list(). */ list_add_tail(&rq->queuelist, &rq_list); + count++; if (rq->mq_hctx != hctx) multi_hctxs = true; - } while (++count < max_dispatch); + + /* + * If we cannot get tag for the request, stop dequeueing + * requests from the IO scheduler. We are unlikely to be able + * to submit them anyway and it creates false impression for + * scheduling heuristics that the device can take more IO. 
+ */ + if (!blk_mq_get_driver_tag(rq)) + break; + } while (count < max_dispatch); if (!count) { if (run_queue) diff --git a/block/blk-mq.c b/block/blk-mq.c index f11d4018ce2e..4261adee9964 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1104,7 +1104,7 @@ static bool __blk_mq_get_driver_tag(struct request *rq) return true; } -static bool blk_mq_get_driver_tag(struct request *rq) +bool blk_mq_get_driver_tag(struct request *rq) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; diff --git a/block/blk-mq.h b/block/blk-mq.h index 556368d2c5b6..4b1ca7b7bbeb 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -260,6 +260,8 @@ static inline void blk_mq_put_driver_tag(struct request *rq) __blk_mq_put_driver_tag(rq->mq_hctx, rq); } +bool blk_mq_get_driver_tag(struct request *rq); + static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; -- cgit v1.2.3 From 7cc2623d1c84935f06fbdf727f41d70f4c779ef6 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 19 May 2021 10:52:26 -0700 Subject: block: Update blk_update_request() documentation Although the original intent was to use blk_update_request() in stacking block drivers only, it is used much more widely today. Reflect this in the documentation block above this function. See also: * commit 32fab448e5e8 ("block: add request update interface"). * commit 2e60e02297cf ("block: clean up request completion API"). * commit ed6565e73424 ("block: handle partial completions for special payload requests"). Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210519175226.8853-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 3515a66022d7..514838ccab2d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1384,26 +1384,22 @@ void blk_steal_bios(struct bio_list *list, struct request *rq) EXPORT_SYMBOL_GPL(blk_steal_bios); /** - * blk_update_request - Special helper function for request stacking drivers + * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed * @error: block status code - * @nr_bytes: number of bytes to complete @req + * @nr_bytes: number of bytes to complete for @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * - * This special helper function is only for request stacking drivers - * (e.g. request-based dm) so that they can handle partial completion. - * Actual device drivers should use blk_mq_end_request instead. - * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Note: - * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both - * blk_rq_bytes() and in blk_update_request(). + * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function + * except in the consistency check at the end of this function. 
* * Return: * %false - this request doesn't have any more data -- cgit v1.2.3 From 11c7aa0ddea8611007768d3e6b58d45dc60a19e1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 7 Jun 2021 13:26:13 +0200 Subject: rq-qos: fix missed wake-ups in rq_qos_throttle try two Commit 545fbd0775ba ("rq-qos: fix missed wake-ups in rq_qos_throttle") tried to fix a problem that a process could be sleeping in rq_qos_wait() without anyone to wake it up. However the fix is not complete and the following can still happen:

CPU1 (waiter1)			CPU2 (waiter2)			CPU3 (waker)
rq_qos_wait()			rq_qos_wait()
  acquire_inflight_cb() -> fails
				acquire_inflight_cb() -> fails
							completes IOs, inflight decreased
  prepare_to_wait_exclusive()
				prepare_to_wait_exclusive()
  has_sleeper = !wq_has_single_sleeper() -> true as there are two sleepers
				has_sleeper = !wq_has_single_sleeper() -> true
  io_schedule()			io_schedule()

Deadlock as now there's nobody to wakeup the two waiters. The logic automatically blocking when there are already sleepers is really subtle and the only way to make it work reliably is that we check whether there are some waiters in the queue when adding ourselves there. That way, we are guaranteed that at least the first process to enter the wait queue will recheck the waiting condition before going to sleep and thus guarantee forward progress. Fixes: 545fbd0775ba ("rq-qos: fix missed wake-ups in rq_qos_throttle") CC: stable@vger.kernel.org Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210607112613.25344-1-jack@suse.cz Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 4 ++-- include/linux/wait.h | 2 +- kernel/sched/wait.c | 9 +++++++-- 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 656460636ad3..e83af7bc7591 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -266,8 +266,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) return; - prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); - has_sleeper = !wq_has_single_sleeper(&rqw->wait); + has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq, + TASK_UNINTERRUPTIBLE); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) diff --git a/include/linux/wait.h b/include/linux/wait.h index fe10e8570a52..6598ae35e1b5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -1136,7 +1136,7 @@ do { \ * Waitqueues which are removed from the waitqueue_head at wakeup time */ void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); -void prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); +bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 183cc6ae68a6..76577d1642a5 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -264,17 +264,22 @@ prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_ent } EXPORT_SYMBOL(prepare_to_wait); -void +/* Returns true if we are the first waiter in the queue, false otherwise.
*/ +bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; + bool was_empty = false; wq_entry->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&wq_head->lock, flags); - if (list_empty(&wq_entry->entry)) + if (list_empty(&wq_entry->entry)) { + was_empty = list_empty(&wq_head->head); __add_wait_queue_entry_tail(wq_head, wq_entry); + } set_current_state(state); spin_unlock_irqrestore(&wq_head->lock, flags); + return was_empty; } EXPORT_SYMBOL(prepare_to_wait_exclusive); -- cgit v1.2.3 From cdb14e0f7775e767484843e8ecd736bb21754c58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:16 +0300 Subject: blk-mq: factor out a blk_mq_alloc_sq_tag_set helper Factour out a helper to initialize a simple single hw queue tag_set from blk_mq_init_sq_queue. This will allow to phase out blk_mq_init_sq_queue in favor of a more symmetric and general API. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 32 ++++++++++++++++++-------------- include/linux/blk-mq.h | 3 +++ 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4261adee9964..867e5faf4f5b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3152,24 +3152,12 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - memset(set, 0, sizeof(*set)); - set->ops = ops; - set->nr_hw_queues = 1; - set->nr_maps = 1; - set->queue_depth = queue_depth; - set->numa_node = NUMA_NO_NODE; - set->flags = set_flags; - - ret = blk_mq_alloc_tag_set(set); + ret = blk_mq_alloc_sq_tag_set(set, ops, queue_depth, set_flags); if (ret) return ERR_PTR(ret); - q = blk_mq_init_queue(set); - if (IS_ERR(q)) { + if (IS_ERR(q)) blk_mq_free_tag_set(set); - return q; - } - return q; } EXPORT_SYMBOL(blk_mq_init_sq_queue); @@ -3589,6 +3577,22 @@ out_free_mq_map: } EXPORT_SYMBOL(blk_mq_alloc_tag_set); +/* allocate and initialize a tagset for a simple single-queue device */ +int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, unsigned int queue_depth, + unsigned int set_flags) +{ + memset(set, 0, sizeof(*set)); + set->ops = ops; + set->nr_hw_queues = 1; + set->nr_maps = 1; + set->queue_depth = queue_depth; + set->numa_node = NUMA_NO_NODE; + set->flags = set_flags; + return blk_mq_alloc_tag_set(set); +} +EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); + void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 359486940fa0..bb950fc669ef 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -439,6 +439,9 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, void blk_mq_unregister_dev(struct device *, struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); +int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, unsigned int queue_depth, + unsigned int set_flags); void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); -- cgit v1.2.3 From 26a9750aa875126e4b7fc5ee6de652a529c5b7ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:17 +0300 Subject: blk-mq: improve the blk_mq_init_allocated_queue interface Don't return the passed in request_queue but a normal error 
code, and drop the elevator_init argument in favor of just calling elevator_init_mq directly from dm-rq. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 36 ++++++++++++++---------------------- block/blk.h | 1 - block/elevator.c | 2 +- drivers/md/dm-rq.c | 9 +++------ include/linux/blk-mq.h | 5 ++--- include/linux/elevator.h | 1 + 6 files changed, 21 insertions(+), 33 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 867e5faf4f5b..8550ad64982f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3115,21 +3115,18 @@ void blk_mq_release(struct request_queue *q) struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata) { - struct request_queue *uninit_q, *q; + struct request_queue *q; + int ret; - uninit_q = blk_alloc_queue(set->numa_node); - if (!uninit_q) + q = blk_alloc_queue(set->numa_node); + if (!q) return ERR_PTR(-ENOMEM); - uninit_q->queuedata = queuedata; - - /* - * Initialize the queue without an elevator. device_add_disk() will do - * the initialization. - */ - q = blk_mq_init_allocated_queue(set, uninit_q, false); - if (IS_ERR(q)) - blk_cleanup_queue(uninit_q); - + q->queuedata = queuedata; + ret = blk_mq_init_allocated_queue(set, q); + if (ret) { + blk_cleanup_queue(q); + return ERR_PTR(ret); + } return q; } EXPORT_SYMBOL_GPL(blk_mq_init_queue_data); @@ -3273,9 +3270,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, mutex_unlock(&q->sysfs_lock); } -struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q, - bool elevator_init) +int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q) { /* mark the queue as mq asap */ q->mq_ops = set->ops; @@ -3325,11 +3321,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); - - if (elevator_init) - elevator_init_mq(q); - - return q; + return 0; err_hctxs: kfree(q->queue_hw_ctx); @@ -3340,7 +3332,7 @@ err_poll: q->poll_cb = NULL; err_exit: q->mq_ops = NULL; - return ERR_PTR(-ENOMEM); + return -ENOMEM; } EXPORT_SYMBOL(blk_mq_init_allocated_queue); diff --git a/block/blk.h b/block/blk.h index 3440142f029b..d3fa47af3607 100644 --- a/block/blk.h +++ b/block/blk.h @@ -192,7 +192,6 @@ void blk_account_io_done(struct request *req, u64 now); void blk_insert_flush(struct request *rq); -void elevator_init_mq(struct request_queue *q); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); void __elevator_exit(struct request_queue *, struct elevator_queue *); diff --git a/block/elevator.c b/block/elevator.c index 440699c28119..06e203426410 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -693,7 +693,7 @@ void elevator_init_mq(struct request_queue *q) elevator_put(e); } } - +EXPORT_SYMBOL_GPL(elevator_init_mq); /* only for dm-rq */ /* * switch to new_e io scheduler. 
be careful not to introduce deadlocks - diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 9c3bc3711b33..0dbd48cbdff9 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -530,7 +530,6 @@ static const struct blk_mq_ops dm_mq_ops = { int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) { - struct request_queue *q; struct dm_target *immutable_tgt; int err; @@ -557,12 +556,10 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) if (err) goto out_kfree_tag_set; - q = blk_mq_init_allocated_queue(md->tag_set, md->queue, true); - if (IS_ERR(q)) { - err = PTR_ERR(q); + err = blk_mq_init_allocated_queue(md->tag_set, md->queue); + if (err) goto out_tag_set; - } - + elevator_init_mq(md->queue); return 0; out_tag_set: diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index bb950fc669ef..73750b2838d2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -429,9 +429,8 @@ enum { struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata); -struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q, - bool elevator_init); +int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q); struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, diff --git a/include/linux/elevator.h b/include/linux/elevator.h index dcb2f9022c1d..783ecb3cb77a 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -120,6 +120,7 @@ extern void elv_merged_request(struct request_queue *, struct request *, extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); +void elevator_init_mq(struct request_queue *q); /* * io scheduler registration -- cgit v1.2.3 From b461dfc49eb6fbabc60b9dad476e787ada56b7b4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:18 +0300 Subject: blk-mq: add the blk_mq_alloc_disk APIs Add a new API to allocate a gendisk including the request_queue for use with blk-mq based drivers. This is to avoid boilerplate code in drivers. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 +++++++++++++++++++ include/linux/blk-mq.h | 12 ++++++++++++ 2 files changed, 31 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8550ad64982f..b123077a0dc4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3137,6 +3137,25 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata) +{ + struct request_queue *q; + struct gendisk *disk; + + q = blk_mq_init_queue_data(set, queuedata); + if (IS_ERR(q)) + return ERR_CAST(q); + + disk = __alloc_disk_node(0, set->numa_node); + if (!disk) { + blk_cleanup_queue(q); + return ERR_PTR(-ENOMEM); + } + disk->queue = q; + return disk; +} +EXPORT_SYMBOL(__blk_mq_alloc_disk); + /* * Helper for setting up a queue with mq ops, given queue depth, and * the passed in mq ops flags. 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 73750b2838d2..f496c6c5b5d2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -426,6 +426,18 @@ enum { ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT) +#define blk_mq_alloc_disk(set, queuedata) \ +({ \ + static struct lock_class_key __key; \ + struct gendisk *__disk = __blk_mq_alloc_disk(set, queuedata); \ + \ + if (__disk) \ + lockdep_init_map(&__disk->lockdep_map, \ + "(bio completion)", &__key, 0); \ + __disk; \ +}) +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, + void *queuedata); struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata); -- cgit v1.2.3 From 08c1d480ed38995690a7d83f2c6a505f6cbbed9f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:30 +0300 Subject: blk-mq: remove blk_mq_init_sq_queue All users are gone now. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-16-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 22 ---------------------- include/linux/blk-mq.h | 4 ---- 2 files changed, 26 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index b123077a0dc4..3115ea2d0990 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3156,28 +3156,6 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata) } EXPORT_SYMBOL(__blk_mq_alloc_disk); -/* - * Helper for setting up a queue with mq ops, given queue depth, and - * the passed in mq ops flags. - */ -struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, - unsigned int queue_depth, - unsigned int set_flags) -{ - struct request_queue *q; - int ret; - - ret = blk_mq_alloc_sq_tag_set(set, ops, queue_depth, set_flags); - if (ret) - return ERR_PTR(ret); - q = blk_mq_init_queue(set); - if (IS_ERR(q)) - blk_mq_free_tag_set(set); - return q; -} -EXPORT_SYMBOL(blk_mq_init_sq_queue); - static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f496c6c5b5d2..02a4aab0aeac 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -443,10 +443,6 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); -struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, - unsigned int queue_depth, - unsigned int set_flags); void blk_mq_unregister_dev(struct device *, struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); -- cgit v1.2.3 From 2cafe29a8d03f02a3d16193bdaae2f3e82a423f9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 9 Jun 2021 09:58:21 +0800 Subject: block: fix race between adding/removing rq qos and normal IO Yi reported several kernel panics on: [16687.001777] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008 ... [16687.163549] pc : __rq_qos_track+0x38/0x60 or [ 997.690455] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020 ... 
[ 997.850347] pc : __rq_qos_done+0x2c/0x50 It turns out to be caused by a race between adding an rq qos (wbt) and normal IO, because rq_qos_add() can run while IO is being submitted. Fix this issue by freezing the queue before adding/deleting an rq qos to/from the queue. rq_qos_exit() does not need to freeze the queue because it is called after the queue has already been frozen. iolatency calls rq_qos_add() while allocating the queue, so freezing adds no delay there because the queue usage refcount still works in atomic mode at that time. iocost calls rq_qos_add() when a cgroup attribute file is written; freezing the queue at that point is fine since we usually freeze the queue when storing to a queue sysfs attribute, and iocost only exists on the root cgroup anyway. wbt_init() calls it from blk_register_queue() and from the queue sysfs attribute store path (queue_wb_lat_store(), on its first write in the !BLK_WBT_MQ case); the following patch will speed up the queue freezing in wbt_init(). Reported-by: Yi Zhang Cc: Bart Van Assche Signed-off-by: Ming Lei Reviewed-by: Bart Van Assche Tested-by: Yi Zhang Link: https://lore.kernel.org/r/20210609015822.103433-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-rq-qos.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'block') diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 2bc43e94f4c4..2bcb3495e376 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "blk-mq-debugfs.h" @@ -99,8 +100,21 @@ static inline void rq_wait_init(struct rq_wait *rq_wait) static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) { + /* + * No IO can be in-flight when adding rqos, so freeze queue, which + * is fine since we only support rq_qos for blk-mq queue. + * + * Reuse ->queue_lock for protecting against other concurrent + * rq_qos adding/deleting + */ + blk_mq_freeze_queue(q); + + spin_lock_irq(&q->queue_lock); rqos->next = q->rq_qos; q->rq_qos = rqos; + spin_unlock_irq(&q->queue_lock); + + blk_mq_unfreeze_queue(q); if (rqos->ops->debugfs_attrs) blk_mq_debugfs_register_rqos(rqos); @@ -110,12 +124,22 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) { struct rq_qos **cur; + /* + * See comment in rq_qos_add() about freezing queue & using + * ->queue_lock. + */ + blk_mq_freeze_queue(q); + + spin_lock_irq(&q->queue_lock); for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { if (*cur == rqos) { *cur = rqos->next; break; } } + spin_unlock_irq(&q->queue_lock); + + blk_mq_unfreeze_queue(q); blk_mq_debugfs_unregister_rqos(rqos); } -- cgit v1.2.3 From a72c374f97a4c7b2f9dde5144c867fec4bdcd798 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 9 Jun 2021 09:58:22 +0800 Subject: block: mark queue init done at the end of blk_register_queue Mark queue init as done only once everything has succeeded in blk_register_queue(), so that wbt_enable_default() can run quickly without any RCU grace period involved, since adding an rq qos requires freezing the queue. Delaying the point at which queue init is marked done has no side effects.
Reported-by: Yi Zhang Cc: Bart Van Assche Signed-off-by: Ming Lei Reviewed-by: Bart Van Assche Tested-by: Yi Zhang Link: https://lore.kernel.org/r/20210609015822.103433-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f89e2fc3963b..370d83c18057 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -866,20 +866,6 @@ int blk_register_queue(struct gendisk *disk) "%s is registering an already registered queue\n", kobject_name(&dev->kobj)); - /* - * SCSI probing may synchronously create and destroy a lot of - * request_queues for non-existent devices. Shutting down a fully - * functional queue takes measureable wallclock time as RCU grace - * periods are involved. To avoid excessive latency in these - * cases, a request_queue starts out in a degraded mode which is - * faster to shut down and is made fully functional here as - * request_queues for non-existent devices never get registered. - */ - if (!blk_queue_init_done(q)) { - blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); - percpu_ref_switch_to_percpu(&q->q_usage_counter); - } - blk_queue_update_readahead(q); ret = blk_trace_init_sysfs(dev); @@ -938,6 +924,21 @@ int blk_register_queue(struct gendisk *disk) ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); + + /* + * SCSI probing may synchronously create and destroy a lot of + * request_queues for non-existent devices. Shutting down a fully + * functional queue takes measureable wallclock time as RCU grace + * periods are involved. To avoid excessive latency in these + * cases, a request_queue starts out in a degraded mode which is + * faster to shut down and is made fully functional here as + * request_queues for non-existent devices never get registered. + */ + if (!blk_queue_init_done(q)) { + blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); + percpu_ref_switch_to_percpu(&q->q_usage_counter); + } + return ret; } EXPORT_SYMBOL_GPL(blk_register_queue); -- cgit v1.2.3 From f0c1c4d2864ed614f90d2da1bab1a1c42907b940 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 9 Jun 2021 14:30:46 +0800 Subject: blk-mq: fix use-after-free in blk_mq_exit_sched tagset can't be used after blk_cleanup_queue() is returned because freeing tagset usually follows blk_clenup_queue(). Commit d97e594c5166 ("blk-mq: Use request queue-wide tags for tagset-wide sbitmap") adds check on q->tag_set->flags in blk_mq_exit_sched(), and causes use-after-free. Fixes it by using hctx->flags. 
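To make the lifetime problem concrete, here is a sketch of the usual driver teardown ordering (hypothetical driver structure, assumed for illustration rather than taken from this patch):

struct example_device {
	struct request_queue *queue;
	struct blk_mq_tag_set tag_set;
};

static void example_driver_remove(struct example_device *dev)
{
	/*
	 * The final queue release (which runs the elevator exit path,
	 * including blk_mq_exit_sched()) may happen after this call
	 * returns if other references are still held.
	 */
	blk_cleanup_queue(dev->queue);

	/*
	 * From here on q->tag_set points at freed memory, which is why
	 * blk_mq_exit_sched() must read hctx->flags instead.
	 */
	blk_mq_free_tag_set(&dev->tag_set);
}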
Reported-by: syzbot+77ba3d171a25c56756ea@syzkaller.appspotmail.com Fixes: d97e594c5166 ("blk-mq: Use request queue-wide tags for tagset-wide sbitmap") Cc: John Garry Signed-off-by: Ming Lei Tested-by: John Garry Reviewed-by: John Garry Link: https://lore.kernel.org/r/20210609063046.122843-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index a9182d2f8ad3..80273245d11a 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -680,6 +680,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; unsigned int i; + unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { blk_mq_debugfs_unregister_sched_hctx(hctx); @@ -687,12 +688,13 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } + flags = hctx->flags; } blk_mq_debugfs_unregister_sched(q); if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q); - if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) + if (blk_mq_is_sbitmap_shared(flags)) blk_mq_exit_sched_shared_sbitmap(q); q->elevator = NULL; } -- cgit v1.2.3 From e42cfb1da0bf33c313318da201730324c423351d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 18 Jun 2021 10:59:22 +0900 Subject: block: Remove unnecessary elevator operation checks The insert_requests and dispatch_request elevator operations are mandatory for the correct execution of an elevator, and all implemented elevators (bfq, kyber and mq-deadline) implement them. As a result, there is no need to check for these operations before calling them when a queue has an elevator set. This simplifies the code in __blk_mq_sched_dispatch_requests() and blk_mq_sched_insert_request(). To avoid out-of-tree elevators to crash the kernel in case of bad implementation, add a check in elv_register() to verify that these operations are implemented. A small, probably not significant, IOPS improvement of 0.1% is observed with this patch applied (4.117 MIOPS to 4.123 MIOPS, average of 20 fio runs doing 4K random direct reads with psync and 32 jobs). 
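For reference, a minimal skeleton (hypothetical scheduler, shown only to illustrate the registration requirement) of the two operations elv_register() now treats as mandatory:

static void example_insert_requests(struct blk_mq_hw_ctx *hctx,
				    struct list_head *list, bool at_head)
{
	/* A real scheduler would queue the requests on internal lists here. */
}

static struct request *example_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	/* A real scheduler would hand back the next request to run. */
	return NULL;
}

static struct elevator_type example_sched = {
	.ops = {
		/* Both ops must be non-NULL to pass elv_register(). */
		.insert_requests	= example_insert_requests,
		.dispatch_request	= example_dispatch_request,
	},
	.elevator_name	= "example",
	.elevator_owner	= THIS_MODULE,
};

A module init would then call elv_register(&example_sched), which after this change warns and returns -EINVAL if either operation is left NULL.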
Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210618015922.713999-1-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 13 ++++++------- block/elevator.c | 4 ++++ 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 80273245d11a..2403a5c2b053 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -294,8 +294,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; - struct elevator_queue *e = q->elevator; - const bool has_sched_dispatch = e && e->type->ops.dispatch_request; + const bool has_sched = q->elevator; int ret = 0; LIST_HEAD(rq_list); @@ -326,12 +325,12 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) { - if (has_sched_dispatch) + if (has_sched) ret = blk_mq_do_dispatch_sched(hctx); else ret = blk_mq_do_dispatch_ctx(hctx); } - } else if (has_sched_dispatch) { + } else if (has_sched) { ret = blk_mq_do_dispatch_sched(hctx); } else if (hctx->dispatch_busy) { /* dequeue request one by one from sw queue if queue is busy */ @@ -463,7 +462,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, goto run; } - if (e && e->type->ops.insert_requests) { + if (e) { LIST_HEAD(list); list_add(&rq->queuelist, &list); @@ -494,9 +493,9 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, percpu_ref_get(&q->q_usage_counter); e = hctx->queue->elevator; - if (e && e->type->ops.insert_requests) + if (e) { e->type->ops.insert_requests(hctx, list, false); - else { + } else { /* * try to issue requests directly if the hw queue isn't * busy in case of 'none' scheduler, and this way may save diff --git a/block/elevator.c b/block/elevator.c index 06e203426410..85d0d4adbb64 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -522,6 +522,10 @@ void elv_unregister_queue(struct request_queue *q) int elv_register(struct elevator_type *e) { + /* insert_requests and dispatch_request are mandatory */ + if (WARN_ON_ONCE(!e->ops.insert_requests || !e->ops.dispatch_request)) + return -EINVAL; + /* create icq_cache if requested */ if (e->icq_size) { if (WARN_ON(e->icq_size < sizeof(struct io_cq)) || -- cgit v1.2.3 From a79da21b48cc5f81b047ae4e70b4d9cb49c93a6a Mon Sep 17 00:00:00 2001 From: lijiazi Date: Fri, 18 Jun 2021 11:17:20 +0800 Subject: blk-wbt: remove outdated comment Now wbt_wait() returns void, so remove now outdated comment. Signed-off-by: lijiazi Link: https://lore.kernel.org/r/1623986240-13878-1-git-send-email-lijiazi@xiaomi.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 42aed0160f86..b363b0532704 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -563,7 +563,6 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) } /* - * Returns true if the IO request should be accounted, false if not. * May sleep, if we have exceeded the writeback limits. Caller can pass * in an irq held spinlock, if it holds one when calling this function. * If we do sleep, we'll release and re-grab it. 
-- cgit v1.2.3 From 5f6776ba413ce273f7cb211f1cf8771f0cde7c81 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:41 -0700 Subject: block/Kconfig: Make the BLK_WBT and BLK_WBT_MQ entries consecutive These entries were consecutive at the time of their introduction but are no longer consecutive. Make these again consecutive. Additionally, modify the help text since it refers to blk-mq and since the legacy block layer has been removed. Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20210618004456.7280-2-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/Kconfig | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index a2297edfdde8..6685578b2a20 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -133,6 +133,13 @@ config BLK_WBT dynamically on an algorithm loosely based on CoDel, factoring in the realtime performance of the disk. +config BLK_WBT_MQ + bool "Enable writeback throttling by default" + default y + depends on BLK_WBT + help + Enable writeback throttling by default for request-based block devices. + config BLK_CGROUP_IOLATENCY bool "Enable support for latency based cgroup IO protection" depends on BLK_CGROUP=y @@ -155,13 +162,6 @@ config BLK_CGROUP_IOCOST distributes IO capacity between different groups based on their share of the overall weight distribution. -config BLK_WBT_MQ - bool "Multiqueue writeback throttling" - default y - depends on BLK_WBT - help - Enable writeback throttling by default on multiqueue devices. - config BLK_DEBUG_FS bool "Block layer debugging information in debugfs" default y -- cgit v1.2.3 From 19688d7f9592b8222f530037d9328fdc90fff14c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:42 -0700 Subject: block/blk-cgroup: Swap the blk_throtl_init() and blk_iolatency_init() calls Before adding more calls in this function, simplify the error path. Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Cc: Tejun Heo Cc: Christoph Hellwig Cc: Ming Lei Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Reviewed-by: Himanshu Madhani Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210618004456.7280-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d169e2055158..3b0f6efaa2b6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1183,15 +1183,14 @@ int blkcg_init_queue(struct request_queue *q) if (preloaded) radix_tree_preload_end(); - ret = blk_throtl_init(q); + ret = blk_iolatency_init(q); if (ret) goto err_destroy_all; - ret = blk_iolatency_init(q); - if (ret) { - blk_throtl_exit(q); + ret = blk_throtl_init(q); + if (ret) goto err_destroy_all; - } + return 0; err_destroy_all: -- cgit v1.2.3 From fb44023e70224c3bd9eb949bd3ab66876bd14c56 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:43 -0700 Subject: block/blk-rq-qos: Move a function from a header file into a C file rq_qos_id_to_name() is only used in blk-mq-debugfs.c so move that function into in blk-mq-debugfs.c. 
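For context, a rough sketch of the kind of debugfs code that is now the function's only consumer (illustrative only: the helper name register_rqos_dir() and the choice of parent directory are assumptions, not the actual kernel code):

static void register_rqos_dir(struct rq_qos *rqos)
{
	/* Turn the numeric policy id into a directory name: "wbt", "latency", "cost". */
	const char *name = rq_qos_id_to_name(rqos->id);

	rqos->debugfs_dir = debugfs_create_dir(name, rqos->q->debugfs_dir);
}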
Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20210618004456.7280-4-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 13 +++++++++++++ block/blk-rq-qos.h | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 2a75bc7401df..6ac1c86f62ef 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -937,6 +937,19 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q) q->sched_debugfs_dir = NULL; } +static const char *rq_qos_id_to_name(enum rq_qos_id id) +{ + switch (id) { + case RQ_QOS_WBT: + return "wbt"; + case RQ_QOS_LATENCY: + return "latency"; + case RQ_QOS_COST: + return "cost"; + } + return "unknown"; +} + void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) { debugfs_remove_recursive(rqos->debugfs_dir); diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 2bcb3495e376..a77afbdd472c 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -79,19 +79,6 @@ static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) return rq_qos_id(q, RQ_QOS_LATENCY); } -static inline const char *rq_qos_id_to_name(enum rq_qos_id id) -{ - switch (id) { - case RQ_QOS_WBT: - return "wbt"; - case RQ_QOS_LATENCY: - return "latency"; - case RQ_QOS_COST: - return "cost"; - } - return "unknown"; -} - static inline void rq_wait_init(struct rq_wait *rq_wait) { atomic_set(&rq_wait->inflight, 0); -- cgit v1.2.3 From 556910e39249d55e23deaec479f49e7d85bc0d24 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:44 -0700 Subject: block: Introduce the ioprio rq-qos policy Introduce an rq-qos policy that assigns an I/O priority to requests based on blk-cgroup configuration settings. This policy has the following advantages over the ioprio_set() system call: - This policy is cgroup based so it has all the advantages of cgroups. - While ioprio_set() does not affect page cache writeback I/O, this rq-qos controller affects page cache writeback I/O for filesystems that support assiociating a cgroup with writeback I/O. See also Documentation/admin-guide/cgroup-v2.rst. Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-5-bvanassche@acm.org Signed-off-by: Jens Axboe --- Documentation/admin-guide/cgroup-v2.rst | 55 +++++++ block/Kconfig | 9 ++ block/Makefile | 1 + block/blk-cgroup.c | 5 + block/blk-ioprio.c | 262 ++++++++++++++++++++++++++++++++ block/blk-ioprio.h | 19 +++ block/blk-mq-debugfs.c | 2 + block/blk-rq-qos.h | 1 + 8 files changed, 354 insertions(+) create mode 100644 block/blk-ioprio.c create mode 100644 block/blk-ioprio.h (limited to 'block') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index b1e81aa8598a..4e59925e6583 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -56,6 +56,7 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst #include #include "blk.h" +#include "blk-ioprio.h" /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. 
@@ -1187,6 +1188,10 @@ int blkcg_init_queue(struct request_queue *q) if (ret) goto err_destroy_all; + ret = blk_ioprio_init(q); + if (ret) + goto err_destroy_all; + ret = blk_throtl_init(q); if (ret) goto err_destroy_all; diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c new file mode 100644 index 000000000000..332a07761bf8 --- /dev/null +++ b/block/blk-ioprio.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Block rq-qos policy for assigning an I/O priority class to requests. + * + * Using an rq-qos policy for assigning I/O priority class has two advantages + * over using the ioprio_set() system call: + * + * - This policy is cgroup based so it has all the advantages of cgroups. + * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos + * controller affects page cache writeback I/O for filesystems that support + * assiociating a cgroup with writeback I/O. See also + * Documentation/admin-guide/cgroup-v2.rst. + */ + +#include +#include +#include +#include +#include +#include "blk-ioprio.h" +#include "blk-rq-qos.h" + +/** + * enum prio_policy - I/O priority class policy. + * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class. + * @POLICY_NONE_TO_RT: modify IOPRIO_CLASS_NONE into IOPRIO_CLASS_RT. + * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into + * IOPRIO_CLASS_BE. + * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE. + * + * See also . + */ +enum prio_policy { + POLICY_NO_CHANGE = 0, + POLICY_NONE_TO_RT = 1, + POLICY_RESTRICT_TO_BE = 2, + POLICY_ALL_TO_IDLE = 3, +}; + +static const char *policy_name[] = { + [POLICY_NO_CHANGE] = "no-change", + [POLICY_NONE_TO_RT] = "none-to-rt", + [POLICY_RESTRICT_TO_BE] = "restrict-to-be", + [POLICY_ALL_TO_IDLE] = "idle", +}; + +static struct blkcg_policy ioprio_policy; + +/** + * struct ioprio_blkg - Per (cgroup, request queue) data. + * @pd: blkg_policy_data structure. + */ +struct ioprio_blkg { + struct blkg_policy_data pd; +}; + +/** + * struct ioprio_blkcg - Per cgroup data. + * @cpd: blkcg_policy_data structure. + * @prio_policy: One of the IOPRIO_CLASS_* values. See also . + */ +struct ioprio_blkcg { + struct blkcg_policy_data cpd; + enum prio_policy prio_policy; +}; + +static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL; +} + +static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg) +{ + return container_of(blkcg_to_cpd(blkcg, &ioprio_policy), + struct ioprio_blkcg, cpd); +} + +static struct ioprio_blkcg * +ioprio_blkcg_from_css(struct cgroup_subsys_state *css) +{ + return blkcg_to_ioprio_blkcg(css_to_blkcg(css)); +} + +static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio) +{ + struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy); + + if (!pd) + return NULL; + + return blkcg_to_ioprio_blkcg(pd->blkg->blkcg); +} + +static int ioprio_show_prio_policy(struct seq_file *sf, void *v) +{ + struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf)); + + seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]); + return 0; +} + +static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of)); + int ret; + + if (off != 0) + return -EIO; + /* kernfs_fop_write_iter() terminates 'buf' with '\0'. 
*/ + ret = sysfs_match_string(policy_name, buf); + if (ret < 0) + return ret; + blkcg->prio_policy = ret; + + return nbytes; +} + +static struct blkg_policy_data * +ioprio_alloc_pd(gfp_t gfp, struct request_queue *q, struct blkcg *blkcg) +{ + struct ioprio_blkg *ioprio_blkg; + + ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp); + if (!ioprio_blkg) + return NULL; + + return &ioprio_blkg->pd; +} + +static void ioprio_free_pd(struct blkg_policy_data *pd) +{ + struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd); + + kfree(ioprio_blkg); +} + +static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp) +{ + struct ioprio_blkcg *blkcg; + + blkcg = kzalloc(sizeof(*blkcg), gfp); + if (!blkcg) + return NULL; + blkcg->prio_policy = POLICY_NO_CHANGE; + return &blkcg->cpd; +} + +static void ioprio_free_cpd(struct blkcg_policy_data *cpd) +{ + struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd); + + kfree(blkcg); +} + +#define IOPRIO_ATTRS \ + { \ + .name = "prio.class", \ + .seq_show = ioprio_show_prio_policy, \ + .write = ioprio_set_prio_policy, \ + }, \ + { } /* sentinel */ + +/* cgroup v2 attributes */ +static struct cftype ioprio_files[] = { + IOPRIO_ATTRS +}; + +/* cgroup v1 attributes */ +static struct cftype ioprio_legacy_files[] = { + IOPRIO_ATTRS +}; + +static struct blkcg_policy ioprio_policy = { + .dfl_cftypes = ioprio_files, + .legacy_cftypes = ioprio_legacy_files, + + .cpd_alloc_fn = ioprio_alloc_cpd, + .cpd_free_fn = ioprio_free_cpd, + + .pd_alloc_fn = ioprio_alloc_pd, + .pd_free_fn = ioprio_free_pd, +}; + +struct blk_ioprio { + struct rq_qos rqos; +}; + +static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, + struct bio *bio) +{ + struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); + + /* + * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers + * correspond to a lower priority. Hence, the max_t() below selects + * the lower priority of bi_ioprio and the cgroup I/O priority class. + * If the cgroup policy has been set to POLICY_NO_CHANGE == 0, the + * bio I/O priority is not modified. If the bio I/O priority equals + * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio. + */ + bio->bi_ioprio = max_t(u16, bio->bi_ioprio, + IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); +} + +static void blkcg_ioprio_exit(struct rq_qos *rqos) +{ + struct blk_ioprio *blkioprio_blkg = + container_of(rqos, typeof(*blkioprio_blkg), rqos); + + blkcg_deactivate_policy(rqos->q, &ioprio_policy); + kfree(blkioprio_blkg); +} + +static struct rq_qos_ops blkcg_ioprio_ops = { + .track = blkcg_ioprio_track, + .exit = blkcg_ioprio_exit, +}; + +int blk_ioprio_init(struct request_queue *q) +{ + struct blk_ioprio *blkioprio_blkg; + struct rq_qos *rqos; + int ret; + + blkioprio_blkg = kzalloc(sizeof(*blkioprio_blkg), GFP_KERNEL); + if (!blkioprio_blkg) + return -ENOMEM; + + ret = blkcg_activate_policy(q, &ioprio_policy); + if (ret) { + kfree(blkioprio_blkg); + return ret; + } + + rqos = &blkioprio_blkg->rqos; + rqos->id = RQ_QOS_IOPRIO; + rqos->ops = &blkcg_ioprio_ops; + rqos->q = q; + + /* + * Registering the rq-qos policy after activating the blk-cgroup + * policy guarantees that ioprio_blkcg_from_bio(bio) != NULL in the + * rq-qos callbacks. 
+ */ + rq_qos_add(q, rqos); + + return 0; +} + +static int __init ioprio_init(void) +{ + return blkcg_policy_register(&ioprio_policy); +} + +static void __exit ioprio_exit(void) +{ + blkcg_policy_unregister(&ioprio_policy); +} + +module_init(ioprio_init); +module_exit(ioprio_exit); diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h new file mode 100644 index 000000000000..a7785c2f1aea --- /dev/null +++ b/block/blk-ioprio.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BLK_IOPRIO_H_ +#define _BLK_IOPRIO_H_ + +#include + +struct request_queue; + +#ifdef CONFIG_BLK_CGROUP_IOPRIO +int blk_ioprio_init(struct request_queue *q); +#else +static inline int blk_ioprio_init(struct request_queue *q) +{ + return 0; +} +#endif + +#endif /* _BLK_IOPRIO_H_ */ diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 6ac1c86f62ef..4b66d2776eda 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -946,6 +946,8 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) return "latency"; case RQ_QOS_COST: return "cost"; + case RQ_QOS_IOPRIO: + return "ioprio"; } return "unknown"; } diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index a77afbdd472c..f000f83e0621 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -17,6 +17,7 @@ enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_LATENCY, RQ_QOS_COST, + RQ_QOS_IOPRIO, }; struct rq_wait { -- cgit v1.2.3 From 46eae2e32a6adc368230b4df0501082c5233e99c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:45 -0700 Subject: block/mq-deadline: Add several comments Make the code easier to read by adding more comments. Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-6-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 8eea2cbf2bf4..31418e9ce9e2 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -139,6 +139,9 @@ static void dd_request_merged(struct request_queue *q, struct request *req, } } +/* + * Callback function that is invoked after @next has been merged into @req. + */ static void dd_merged_requests(struct request_queue *q, struct request *req, struct request *next) { @@ -375,6 +378,8 @@ done: } /* + * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). + * * One confusing aspect here is that we get called for a specific * hardware queue, but we may return a request that is for a * different hardware queue. This is because mq-deadline has shared @@ -438,6 +443,10 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e) return 0; } +/* + * Try to merge @bio into an existing request. If @bio has been merged into + * an existing request, store the pointer to that request into *@rq. + */ static int dd_request_merge(struct request_queue *q, struct request **rq, struct bio *bio) { @@ -461,6 +470,10 @@ static int dd_request_merge(struct request_queue *q, struct request **rq, return ELEVATOR_NO_MERGE; } +/* + * Attempt to merge a bio into an existing request. This function is called + * before @bio is associated with a request. 
+ */ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { @@ -518,6 +531,9 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } } +/* + * Called from blk_mq_sched_insert_request() or blk_mq_sched_insert_requests(). + */ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool at_head) { @@ -544,6 +560,8 @@ static void dd_prepare_request(struct request *rq) } /* + * Callback from inside blk_mq_free_request(). + * * For zoned block devices, write unlock the target zone of * completed write requests. Do this while holding the zone lock * spinlock so that the zone is never unlocked while deadline_fifo_request() -- cgit v1.2.3 From 3bd473f41ae990815d6f75d285b161eebf361278 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:46 -0700 Subject: block/mq-deadline: Add two lockdep_assert_held() statements Document the locking strategy by adding two lockdep_assert_held() statements. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Cc: Damien Le Moal Cc: Christoph Hellwig Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-7-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 31418e9ce9e2..191ff5ce629c 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -279,6 +279,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) bool reads, writes; int data_dir; + lockdep_assert_held(&dd->lock); + if (!list_empty(&dd->dispatch)) { rq = list_first_entry(&dd->dispatch, struct request, queuelist); list_del_init(&rq->queuelist); @@ -501,6 +503,8 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); + lockdep_assert_held(&dd->lock); + /* * This may be a requeue of a write request that has locked its * target zone. If it is the case, this releases the zone lock. -- cgit v1.2.3 From 2f295beab40f13ab93c004d45372238f2066a5ee Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:47 -0700 Subject: block/mq-deadline: Remove two local variables Make __dd_dispatch_request() easier to read by removing two local variables. 
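Shown schematically (a simplified sketch, not the exact hunk): a boolean snapshot of the FIFO state taken at the top of the function is dropped in favour of testing the list where the answer is needed, which costs nothing because list_empty() is an O(1) check:

/* before: cached at the top of __dd_dispatch_request() */
bool reads = !list_empty(&dd->fifo_list[READ]);

if (reads)
	data_dir = READ;

/* after: test directly at the point of use */
if (!list_empty(&dd->fifo_list[READ]))
	data_dir = READ;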
Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Cc: Damien Le Moal Cc: Christoph Hellwig Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-8-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 191ff5ce629c..caa438f62a4d 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -276,7 +276,6 @@ deadline_next_request(struct deadline_data *dd, int data_dir) static struct request *__dd_dispatch_request(struct deadline_data *dd) { struct request *rq, *next_rq; - bool reads, writes; int data_dir; lockdep_assert_held(&dd->lock); @@ -287,9 +286,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) goto done; } - reads = !list_empty(&dd->fifo_list[READ]); - writes = !list_empty(&dd->fifo_list[WRITE]); - /* * batches are currently reads XOR writes */ @@ -306,7 +302,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) * data direction (read / write) */ - if (reads) { + if (!list_empty(&dd->fifo_list[READ])) { BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); if (deadline_fifo_request(dd, WRITE) && @@ -322,7 +318,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) * there are either no reads or writes have been starved */ - if (writes) { + if (!list_empty(&dd->fifo_list[WRITE])) { dispatch_writes: BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); -- cgit v1.2.3 From 3e9a99eba058f79736dccaf25934f8d6ca380fb3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:48 -0700 Subject: block/mq-deadline: Rename dd_init_queue() and dd_exit_queue() Change "queue" into "sched" to make the function names reflect better the purpose of these functions. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Cc: Damien Le Moal Cc: Christoph Hellwig Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-9-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index caa438f62a4d..d823ba7cb084 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -395,7 +395,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) return rq; } -static void dd_exit_queue(struct elevator_queue *e) +static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; @@ -408,7 +408,7 @@ static void dd_exit_queue(struct elevator_queue *e) /* * initialize elevator private data (deadline_data). 
*/ -static int dd_init_queue(struct request_queue *q, struct elevator_type *e) +static int dd_init_sched(struct request_queue *q, struct elevator_type *e) { struct deadline_data *dd; struct elevator_queue *eq; @@ -800,8 +800,8 @@ static struct elevator_type mq_deadline = { .requests_merged = dd_merged_requests, .request_merged = dd_request_merged, .has_work = dd_has_work, - .init_sched = dd_init_queue, - .exit_sched = dd_exit_queue, + .init_sched = dd_init_sched, + .exit_sched = dd_exit_sched, }, #ifdef CONFIG_BLK_DEBUG_FS -- cgit v1.2.3 From 004a26b327c2e1ea88b2638cf16c0e30e82f297e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:49 -0700 Subject: block/mq-deadline: Improve compile-time argument checking Modern compilers complain if an out-of-range value is passed to a function argument that has an enumeration type. Let the compiler detect out-of-range data direction arguments instead of verifying the data_dir argument at runtime. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Cc: Damien Le Moal Cc: Christoph Hellwig Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-10-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 96 +++++++++++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 47 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index d823ba7cb084..69126beff77d 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -35,6 +35,13 @@ static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ +enum dd_data_dir { + DD_READ = READ, + DD_WRITE = WRITE, +}; + +enum { DD_DIR_COUNT = 2 }; + struct deadline_data { /* * run time data @@ -43,20 +50,20 @@ struct deadline_data { /* * requests (deadline_rq s) are present on both sort_list and fifo_list */ - struct rb_root sort_list[2]; - struct list_head fifo_list[2]; + struct rb_root sort_list[DD_DIR_COUNT]; + struct list_head fifo_list[DD_DIR_COUNT]; /* * next in sort order. 
read, write or both are NULL */ - struct request *next_rq[2]; + struct request *next_rq[DD_DIR_COUNT]; unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ /* * settings that change how the i/o scheduler behaves */ - int fifo_expire[2]; + int fifo_expire[DD_DIR_COUNT]; int fifo_batch; int writes_starved; int front_merges; @@ -97,7 +104,7 @@ deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) static inline void deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) { - const int data_dir = rq_data_dir(rq); + const enum dd_data_dir data_dir = rq_data_dir(rq); if (dd->next_rq[data_dir] == rq) dd->next_rq[data_dir] = deadline_latter_request(rq); @@ -169,10 +176,10 @@ static void dd_merged_requests(struct request_queue *q, struct request *req, static void deadline_move_request(struct deadline_data *dd, struct request *rq) { - const int data_dir = rq_data_dir(rq); + const enum dd_data_dir data_dir = rq_data_dir(rq); - dd->next_rq[READ] = NULL; - dd->next_rq[WRITE] = NULL; + dd->next_rq[DD_READ] = NULL; + dd->next_rq[DD_WRITE] = NULL; dd->next_rq[data_dir] = deadline_latter_request(rq); /* @@ -185,9 +192,10 @@ deadline_move_request(struct deadline_data *dd, struct request *rq) * deadline_check_fifo returns 0 if there are no expired requests on the fifo, * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) */ -static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) +static inline int deadline_check_fifo(struct deadline_data *dd, + enum dd_data_dir data_dir) { - struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); + struct request *rq = rq_entry_fifo(dd->fifo_list[data_dir].next); /* * rq is expired! @@ -203,19 +211,16 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) * dispatch using arrival ordered lists. */ static struct request * -deadline_fifo_request(struct deadline_data *dd, int data_dir) +deadline_fifo_request(struct deadline_data *dd, enum dd_data_dir data_dir) { struct request *rq; unsigned long flags; - if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) - return NULL; - if (list_empty(&dd->fifo_list[data_dir])) return NULL; rq = rq_entry_fifo(dd->fifo_list[data_dir].next); - if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) return rq; /* @@ -223,7 +228,7 @@ deadline_fifo_request(struct deadline_data *dd, int data_dir) * an unlocked target zone. */ spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { + list_for_each_entry(rq, &dd->fifo_list[DD_WRITE], queuelist) { if (blk_req_can_dispatch_to_zone(rq)) goto out; } @@ -239,19 +244,16 @@ out: * dispatch using sector position sorted lists. 
*/ static struct request * -deadline_next_request(struct deadline_data *dd, int data_dir) +deadline_next_request(struct deadline_data *dd, enum dd_data_dir data_dir) { struct request *rq; unsigned long flags; - if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) - return NULL; - rq = dd->next_rq[data_dir]; if (!rq) return NULL; - if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) return rq; /* @@ -276,7 +278,7 @@ deadline_next_request(struct deadline_data *dd, int data_dir) static struct request *__dd_dispatch_request(struct deadline_data *dd) { struct request *rq, *next_rq; - int data_dir; + enum dd_data_dir data_dir; lockdep_assert_held(&dd->lock); @@ -289,9 +291,9 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) /* * batches are currently reads XOR writes */ - rq = deadline_next_request(dd, WRITE); + rq = deadline_next_request(dd, DD_WRITE); if (!rq) - rq = deadline_next_request(dd, READ); + rq = deadline_next_request(dd, DD_READ); if (rq && dd->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ @@ -302,14 +304,14 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) * data direction (read / write) */ - if (!list_empty(&dd->fifo_list[READ])) { - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); + if (!list_empty(&dd->fifo_list[DD_READ])) { + BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[DD_READ])); - if (deadline_fifo_request(dd, WRITE) && + if (deadline_fifo_request(dd, DD_WRITE) && (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; - data_dir = READ; + data_dir = DD_READ; goto dispatch_find_request; } @@ -318,13 +320,13 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) * there are either no reads or writes have been starved */ - if (!list_empty(&dd->fifo_list[WRITE])) { + if (!list_empty(&dd->fifo_list[DD_WRITE])) { dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); + BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[DD_WRITE])); dd->starved = 0; - data_dir = WRITE; + data_dir = DD_WRITE; goto dispatch_find_request; } @@ -399,8 +401,8 @@ static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; - BUG_ON(!list_empty(&dd->fifo_list[READ])); - BUG_ON(!list_empty(&dd->fifo_list[WRITE])); + BUG_ON(!list_empty(&dd->fifo_list[DD_READ])); + BUG_ON(!list_empty(&dd->fifo_list[DD_WRITE])); kfree(dd); } @@ -424,12 +426,12 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) } eq->elevator_data = dd; - INIT_LIST_HEAD(&dd->fifo_list[READ]); - INIT_LIST_HEAD(&dd->fifo_list[WRITE]); - dd->sort_list[READ] = RB_ROOT; - dd->sort_list[WRITE] = RB_ROOT; - dd->fifo_expire[READ] = read_expire; - dd->fifo_expire[WRITE] = write_expire; + INIT_LIST_HEAD(&dd->fifo_list[DD_READ]); + INIT_LIST_HEAD(&dd->fifo_list[DD_WRITE]); + dd->sort_list[DD_READ] = RB_ROOT; + dd->sort_list[DD_WRITE] = RB_ROOT; + dd->fifo_expire[DD_READ] = read_expire; + dd->fifo_expire[DD_WRITE] = write_expire; dd->writes_starved = writes_starved; dd->front_merges = 1; dd->fifo_batch = fifo_batch; @@ -497,7 +499,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; - const int data_dir = rq_data_dir(rq); + const enum dd_data_dir data_dir = rq_data_dir(rq); lockdep_assert_held(&dd->lock); @@ -585,7 +587,7 @@ static void dd_finish_request(struct request *rq) spin_lock_irqsave(&dd->zone_lock, flags); 
blk_req_zone_write_unlock(rq); - if (!list_empty(&dd->fifo_list[WRITE])) + if (!list_empty(&dd->fifo_list[DD_WRITE])) blk_mq_sched_mark_restart_hctx(rq->mq_hctx); spin_unlock_irqrestore(&dd->zone_lock, flags); } @@ -626,8 +628,8 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ __data = jiffies_to_msecs(__data); \ return deadline_var_show(__data, (page)); \ } -SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); -SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); +SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[DD_READ], 1); +SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[DD_WRITE], 1); SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); @@ -649,8 +651,8 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) *(__PTR) = __data; \ return count; \ } -STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX, 1); STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); @@ -717,8 +719,8 @@ static int deadline_##name##_next_rq_show(void *data, \ __blk_mq_debugfs_rq_show(m, rq); \ return 0; \ } -DEADLINE_DEBUGFS_DDIR_ATTRS(READ, read) -DEADLINE_DEBUGFS_DDIR_ATTRS(WRITE, write) +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_READ, read) +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_WRITE, write) #undef DEADLINE_DEBUGFS_DDIR_ATTRS static int deadline_batching_show(void *data, struct seq_file *m) -- cgit v1.2.3 From d6d7f013d65491eaff477b9bd83b80111f5be9e4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:50 -0700 Subject: block/mq-deadline: Improve the sysfs show and store macros Define separate macros for integers and jiffies to improve readability. Use sysfs_emit() and kstrtoint() instead of sprintf() and simple_strtol(). The former functions are the recommended functions. 
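Sketched outside the macros for readability (a generic illustration, not part of the patch; "example" and the use of front_merges here are placeholders), the show/store pattern the new macros expand to looks roughly like this:

static ssize_t example_show(struct elevator_queue *e, char *page)
{
	struct deadline_data *dd = e->elevator_data;

	/* sysfs_emit() is bounded to PAGE_SIZE, unlike a bare sprintf(). */
	return sysfs_emit(page, "%d\n", dd->front_merges);
}

static ssize_t example_store(struct elevator_queue *e, const char *page,
			     size_t count)
{
	struct deadline_data *dd = e->elevator_data;
	int val, ret;

	/* kstrtoint() rejects overflow and trailing garbage, which
	 * simple_strtol() silently accepts. */
	ret = kstrtoint(page, 0, &val);
	if (ret < 0)
		return ret;
	dd->front_merges = clamp(val, 0, 1);
	return count;
}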
Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-11-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 66 +++++++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 69126beff77d..f92224ff0256 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -605,58 +605,50 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) /* * sysfs parts below */ -static ssize_t -deadline_var_show(int var, char *page) -{ - return sprintf(page, "%d\n", var); -} - -static void -deadline_var_store(int *var, const char *page) -{ - char *p = (char *) page; - - *var = simple_strtol(p, &p, 10); -} - -#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +#define SHOW_INT(__FUNC, __VAR) \ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct deadline_data *dd = e->elevator_data; \ - int __data = __VAR; \ - if (__CONV) \ - __data = jiffies_to_msecs(__data); \ - return deadline_var_show(__data, (page)); \ -} -SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[DD_READ], 1); -SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[DD_WRITE], 1); -SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); -SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); -SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); -#undef SHOW_FUNCTION + \ + return sysfs_emit(page, "%d\n", __VAR); \ +} +#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) +SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); +SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); +SHOW_INT(deadline_writes_starved_show, dd->writes_starved); +SHOW_INT(deadline_front_merges_show, dd->front_merges); +SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); +#undef SHOW_INT +#undef SHOW_JIFFIES #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct deadline_data *dd = e->elevator_data; \ - int __data; \ - deadline_var_store(&__data, (page)); \ + int __data, __ret; \ + \ + __ret = kstrtoint(page, 0, &__data); \ + if (__ret < 0) \ + return __ret; \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ - if (__CONV) \ - *(__PTR) = msecs_to_jiffies(__data); \ - else \ - *(__PTR) = __data; \ + *(__PTR) = __CONV(__data); \ return count; \ } -STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); -STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); -STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); +#define STORE_INT(__FUNC, __PTR, MIN, MAX) \ + STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, ) +#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \ + STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) +STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); +STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); +STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); +STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 
1); +STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION +#undef STORE_INT +#undef STORE_JIFFIES #define DD_ATTR(name) \ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) -- cgit v1.2.3 From 07757588e5076748308dd95ee2e3cd0b82ebb8c4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:51 -0700 Subject: block/mq-deadline: Reserve 25% of scheduler tags for synchronous requests For interactive workloads it is important that synchronous requests are not delayed. Hence reserve 25% of scheduler tags for synchronous requests. This patch still allows asynchronous requests to fill the hardware queues since blk_mq_init_sched() makes sure that the number of scheduler requests is the double of the hardware queue depth. From blk_mq_init_sched(): q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, BLKDEV_MAX_RQ); Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-12-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index f92224ff0256..44da481c3fea 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -67,6 +67,7 @@ struct deadline_data { int fifo_batch; int writes_starved; int front_merges; + u32 async_depth; spinlock_t lock; spinlock_t zone_lock; @@ -397,6 +398,44 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) return rq; } +/* + * Called by __blk_mq_alloc_request(). The shallow_depth value set by this + * function is used by __blk_mq_get_tag(). + */ +static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +{ + struct deadline_data *dd = data->q->elevator->elevator_data; + + /* Do not throttle synchronous reads. */ + if (op_is_sync(op) && !op_is_write(op)) + return; + + /* + * Throttle asynchronous requests and writes such that these requests + * do not block the allocation of synchronous requests. + */ + data->shallow_depth = dd->async_depth; +} + +/* Called by blk_mq_update_nr_requests(). */ +static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + struct blk_mq_tags *tags = hctx->sched_tags; + + dd->async_depth = max(1UL, 3 * q->nr_requests / 4); + + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth); +} + +/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). 
*/ +static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + dd_depth_updated(hctx); + return 0; +} + static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; @@ -617,6 +656,7 @@ SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); +SHOW_INT(deadline_async_depth_show, dd->front_merges); SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); #undef SHOW_INT #undef SHOW_JIFFIES @@ -645,6 +685,7 @@ STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX) STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); +STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX); STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION #undef STORE_INT @@ -658,6 +699,7 @@ static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(write_expire), DD_ATTR(writes_starved), DD_ATTR(front_merges), + DD_ATTR(async_depth), DD_ATTR(fifo_batch), __ATTR_NULL }; @@ -733,6 +775,15 @@ static int deadline_starved_show(void *data, struct seq_file *m) return 0; } +static int dd_async_depth_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u\n", dd->async_depth); + return 0; +} + static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos) __acquires(&dd->lock) { @@ -775,6 +826,7 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { DEADLINE_QUEUE_DDIR_ATTRS(write), {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, + {"async_depth", 0400, dd_async_depth_show}, {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops}, {}, }; @@ -783,6 +835,8 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { static struct elevator_type mq_deadline = { .ops = { + .depth_updated = dd_depth_updated, + .limit_depth = dd_limit_depth, .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, .prepare_request = dd_prepare_request, @@ -796,6 +850,7 @@ static struct elevator_type mq_deadline = { .has_work = dd_has_work, .init_sched = dd_init_sched, .exit_sched = dd_exit_sched, + .init_hctx = dd_init_hctx, }, #ifdef CONFIG_BLK_DEBUG_FS -- cgit v1.2.3 From d672d325b1492f5b0e54b7226f01e2d57b58bfb4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:52 -0700 Subject: block/mq-deadline: Micro-optimize the batching algorithm When dispatching the first request of a batch, the deadline_move_request() call clears .next_rq[] for the opposite data direction. .next_rq[] is not restored when changing data direction. Fix this by not clearing .next_rq[] and by keeping track of the data direction of a batch in a variable instead. This patch is a micro-optimization because: - The number of deadline_next_request() calls for the read direction is halved. - The number of times that deadline_next_request() returns NULL is reduced. 
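The essence of the change, condensed from the hunks below: the dispatch path first tries to continue the batch in the remembered direction, and only records a new direction when a fresh batch starts.

/* __dd_dispatch_request(), simplified */
rq = deadline_next_request(dd, dd->last_dir);
if (rq && dd->batching < dd->fifo_batch)
	goto dispatch_request;	/* still inside the current batch */

/* ... choose data_dir from the FIFO lists as before ... */

dd->last_dir = data_dir;	/* remember the direction of the new batch */
dd->batching = 0;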
Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-13-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 44da481c3fea..b09ae1f332a2 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -53,6 +53,8 @@ struct deadline_data { struct rb_root sort_list[DD_DIR_COUNT]; struct list_head fifo_list[DD_DIR_COUNT]; + /* Data direction of latest dispatched request. */ + enum dd_data_dir last_dir; /* * next in sort order. read, write or both are NULL */ @@ -179,8 +181,6 @@ deadline_move_request(struct deadline_data *dd, struct request *rq) { const enum dd_data_dir data_dir = rq_data_dir(rq); - dd->next_rq[DD_READ] = NULL; - dd->next_rq[DD_WRITE] = NULL; dd->next_rq[data_dir] = deadline_latter_request(rq); /* @@ -292,10 +292,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) /* * batches are currently reads XOR writes */ - rq = deadline_next_request(dd, DD_WRITE); - if (!rq) - rq = deadline_next_request(dd, DD_READ); - + rq = deadline_next_request(dd, dd->last_dir); if (rq && dd->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ goto dispatch_request; @@ -361,6 +358,7 @@ dispatch_find_request: if (!rq) return NULL; + dd->last_dir = data_dir; dd->batching = 0; dispatch_request: @@ -473,6 +471,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->fifo_expire[DD_WRITE] = write_expire; dd->writes_starved = writes_starved; dd->front_merges = 1; + dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); -- cgit v1.2.3 From c807ab520fc3fd056c47c74ced63f9d3991a171b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:53 -0700 Subject: block/mq-deadline: Add I/O priority support Maintain one dispatch list and one FIFO list per I/O priority class: RT, BE and IDLE. Maintain statistics for each priority level. Split the debugfs attributes per priority level as follows: $ ls /sys/kernel/debug/block/.../sched/ async_depth dispatch2 read_next_rq write2_fifo_list batching read0_fifo_list starved write_next_rq dispatch0 read1_fifo_list write0_fifo_list dispatch1 read2_fifo_list write1_fifo_list Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-14-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 342 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 228 insertions(+), 114 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index b09ae1f332a2..aba672a5be1e 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -42,23 +42,36 @@ enum dd_data_dir { enum { DD_DIR_COUNT = 2 }; +enum dd_prio { + DD_RT_PRIO = 0, + DD_BE_PRIO = 1, + DD_IDLE_PRIO = 2, + DD_PRIO_MAX = 2, +}; + +enum { DD_PRIO_COUNT = 3 }; + +/* + * Deadline scheduler data per I/O priority (enum dd_prio). Requests are + * present on both sort_list[] and fifo_list[]. + */ +struct dd_per_prio { + struct list_head dispatch; + struct rb_root sort_list[DD_DIR_COUNT]; + struct list_head fifo_list[DD_DIR_COUNT]; + /* Next request in FIFO order. 
Read, write or both are NULL. */ + struct request *next_rq[DD_DIR_COUNT]; +}; + struct deadline_data { /* * run time data */ - /* - * requests (deadline_rq s) are present on both sort_list and fifo_list - */ - struct rb_root sort_list[DD_DIR_COUNT]; - struct list_head fifo_list[DD_DIR_COUNT]; + struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. */ enum dd_data_dir last_dir; - /* - * next in sort order. read, write or both are NULL - */ - struct request *next_rq[DD_DIR_COUNT]; unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ @@ -73,13 +86,29 @@ struct deadline_data { spinlock_t lock; spinlock_t zone_lock; - struct list_head dispatch; +}; + +/* Maps an I/O priority class to a deadline scheduler priority. */ +static const enum dd_prio ioprio_class_to_prio[] = { + [IOPRIO_CLASS_NONE] = DD_BE_PRIO, + [IOPRIO_CLASS_RT] = DD_RT_PRIO, + [IOPRIO_CLASS_BE] = DD_BE_PRIO, + [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, }; static inline struct rb_root * -deadline_rb_root(struct deadline_data *dd, struct request *rq) +deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) { - return &dd->sort_list[rq_data_dir(rq)]; + return &per_prio->sort_list[rq_data_dir(rq)]; +} + +/* + * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a + * request. + */ +static u8 dd_rq_ioclass(struct request *rq) +{ + return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); } /* @@ -97,38 +126,38 @@ deadline_latter_request(struct request *rq) } static void -deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) +deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { - struct rb_root *root = deadline_rb_root(dd, rq); + struct rb_root *root = deadline_rb_root(per_prio, rq); elv_rb_add(root, rq); } static inline void -deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) +deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { const enum dd_data_dir data_dir = rq_data_dir(rq); - if (dd->next_rq[data_dir] == rq) - dd->next_rq[data_dir] = deadline_latter_request(rq); + if (per_prio->next_rq[data_dir] == rq) + per_prio->next_rq[data_dir] = deadline_latter_request(rq); - elv_rb_del(deadline_rb_root(dd, rq), rq); + elv_rb_del(deadline_rb_root(per_prio, rq), rq); } /* * remove rq from rbtree and fifo. 
*/ -static void deadline_remove_request(struct request_queue *q, struct request *rq) +static void deadline_remove_request(struct request_queue *q, + struct dd_per_prio *per_prio, + struct request *rq) { - struct deadline_data *dd = q->elevator->elevator_data; - list_del_init(&rq->queuelist); /* * We might not be on the rbtree, if we are doing an insert merge */ if (!RB_EMPTY_NODE(&rq->rb_node)) - deadline_del_rq_rb(dd, rq); + deadline_del_rq_rb(per_prio, rq); elv_rqhash_del(q, rq); if (q->last_merge == rq) @@ -139,13 +168,16 @@ static void dd_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = dd_rq_ioclass(req); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; /* * if the merge was a front merge, we need to reposition request */ if (type == ELEVATOR_FRONT_MERGE) { - elv_rb_del(deadline_rb_root(dd, req), req); - deadline_add_rq_rb(dd, req); + elv_rb_del(deadline_rb_root(per_prio, req), req); + deadline_add_rq_rb(per_prio, req); } } @@ -155,6 +187,9 @@ static void dd_request_merged(struct request_queue *q, struct request *req, static void dd_merged_requests(struct request_queue *q, struct request *req, struct request *next) { + const u8 ioprio_class = dd_rq_ioclass(next); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + /* * if next expires before rq, assign its expire time to rq * and move into next position (next will be deleted) in fifo @@ -170,33 +205,34 @@ static void dd_merged_requests(struct request_queue *q, struct request *req, /* * kill knowledge of next, this one is a goner */ - deadline_remove_request(q, next); + deadline_remove_request(q, &dd->per_prio[prio], next); } /* * move an entry to dispatch queue */ static void -deadline_move_request(struct deadline_data *dd, struct request *rq) +deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + struct request *rq) { const enum dd_data_dir data_dir = rq_data_dir(rq); - dd->next_rq[data_dir] = deadline_latter_request(rq); + per_prio->next_rq[data_dir] = deadline_latter_request(rq); /* * take it off the sort and fifo list */ - deadline_remove_request(rq->q, rq); + deadline_remove_request(rq->q, per_prio, rq); } /* * deadline_check_fifo returns 0 if there are no expired requests on the fifo, * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) */ -static inline int deadline_check_fifo(struct deadline_data *dd, +static inline int deadline_check_fifo(struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { - struct request *rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); /* * rq is expired! @@ -212,15 +248,16 @@ static inline int deadline_check_fifo(struct deadline_data *dd, * dispatch using arrival ordered lists. 
*/ static struct request * -deadline_fifo_request(struct deadline_data *dd, enum dd_data_dir data_dir) +deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) { struct request *rq; unsigned long flags; - if (list_empty(&dd->fifo_list[data_dir])) + if (list_empty(&per_prio->fifo_list[data_dir])) return NULL; - rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) return rq; @@ -229,7 +266,7 @@ deadline_fifo_request(struct deadline_data *dd, enum dd_data_dir data_dir) * an unlocked target zone. */ spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry(rq, &dd->fifo_list[DD_WRITE], queuelist) { + list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) { if (blk_req_can_dispatch_to_zone(rq)) goto out; } @@ -245,12 +282,13 @@ out: * dispatch using sector position sorted lists. */ static struct request * -deadline_next_request(struct deadline_data *dd, enum dd_data_dir data_dir) +deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) { struct request *rq; unsigned long flags; - rq = dd->next_rq[data_dir]; + rq = per_prio->next_rq[data_dir]; if (!rq) return NULL; @@ -276,15 +314,17 @@ deadline_next_request(struct deadline_data *dd, enum dd_data_dir data_dir) * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc */ -static struct request *__dd_dispatch_request(struct deadline_data *dd) +static struct request *__dd_dispatch_request(struct deadline_data *dd, + struct dd_per_prio *per_prio) { struct request *rq, *next_rq; enum dd_data_dir data_dir; lockdep_assert_held(&dd->lock); - if (!list_empty(&dd->dispatch)) { - rq = list_first_entry(&dd->dispatch, struct request, queuelist); + if (!list_empty(&per_prio->dispatch)) { + rq = list_first_entry(&per_prio->dispatch, struct request, + queuelist); list_del_init(&rq->queuelist); goto done; } @@ -292,7 +332,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) /* * batches are currently reads XOR writes */ - rq = deadline_next_request(dd, dd->last_dir); + rq = deadline_next_request(dd, per_prio, dd->last_dir); if (rq && dd->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ goto dispatch_request; @@ -302,10 +342,10 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) * data direction (read / write) */ - if (!list_empty(&dd->fifo_list[DD_READ])) { - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[DD_READ])); + if (!list_empty(&per_prio->fifo_list[DD_READ])) { + BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ])); - if (deadline_fifo_request(dd, DD_WRITE) && + if (deadline_fifo_request(dd, per_prio, DD_WRITE) && (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; @@ -318,9 +358,9 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) * there are either no reads or writes have been starved */ - if (!list_empty(&dd->fifo_list[DD_WRITE])) { + if (!list_empty(&per_prio->fifo_list[DD_WRITE])) { dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[DD_WRITE])); + BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE])); dd->starved = 0; @@ -335,14 +375,14 @@ dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ - next_rq = deadline_next_request(dd, data_dir); - if (deadline_check_fifo(dd, data_dir) || !next_rq) { + next_rq = deadline_next_request(dd, 
per_prio, data_dir); + if (deadline_check_fifo(per_prio, data_dir) || !next_rq) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ - rq = deadline_fifo_request(dd, data_dir); + rq = deadline_fifo_request(dd, per_prio, data_dir); } else { /* * The last req was the same dir and we have a next request in @@ -366,7 +406,7 @@ dispatch_request: * rq is the selected appropriate request. */ dd->batching++; - deadline_move_request(dd, rq); + deadline_move_request(dd, per_prio, rq); done: /* * If the request needs its target zone locked, do it. @@ -388,9 +428,14 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct request *rq; + enum dd_prio prio; spin_lock(&dd->lock); - rq = __dd_dispatch_request(dd); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); + if (rq) + break; + } spin_unlock(&dd->lock); return rq; @@ -437,9 +482,14 @@ static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; + enum dd_prio prio; - BUG_ON(!list_empty(&dd->fifo_list[DD_READ])); - BUG_ON(!list_empty(&dd->fifo_list[DD_WRITE])); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); + WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); + } kfree(dd); } @@ -451,22 +501,28 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) { struct deadline_data *dd; struct elevator_queue *eq; + enum dd_prio prio; + int ret = -ENOMEM; eq = elevator_alloc(q, e); if (!eq) - return -ENOMEM; + return ret; dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); - if (!dd) { - kobject_put(&eq->kobj); - return -ENOMEM; - } + if (!dd) + goto put_eq; + eq->elevator_data = dd; - INIT_LIST_HEAD(&dd->fifo_list[DD_READ]); - INIT_LIST_HEAD(&dd->fifo_list[DD_WRITE]); - dd->sort_list[DD_READ] = RB_ROOT; - dd->sort_list[DD_WRITE] = RB_ROOT; + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + INIT_LIST_HEAD(&per_prio->dispatch); + INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); + INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); + per_prio->sort_list[DD_READ] = RB_ROOT; + per_prio->sort_list[DD_WRITE] = RB_ROOT; + } dd->fifo_expire[DD_READ] = read_expire; dd->fifo_expire[DD_WRITE] = write_expire; dd->writes_starved = writes_starved; @@ -475,10 +531,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->fifo_batch = fifo_batch; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); - INIT_LIST_HEAD(&dd->dispatch); q->elevator = eq; return 0; + +put_eq: + kobject_put(&eq->kobj); + return ret; } /* @@ -489,13 +548,16 @@ static int dd_request_merge(struct request_queue *q, struct request **rq, struct bio *bio) { struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; sector_t sector = bio_end_sector(bio); struct request *__rq; if (!dd->front_merges) return ELEVATOR_NO_MERGE; - __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); + __rq = 
elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector); if (__rq) { BUG_ON(sector != blk_rq_pos(__rq)); @@ -538,6 +600,10 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; const enum dd_data_dir data_dir = rq_data_dir(rq); + u16 ioprio = req_get_ioprio(rq); + u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); + struct dd_per_prio *per_prio; + enum dd_prio prio; lockdep_assert_held(&dd->lock); @@ -547,15 +613,18 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ blk_req_zone_write_unlock(rq); + prio = ioprio_class_to_prio[ioprio_class]; + if (blk_mq_sched_try_insert_merge(q, rq)) return; trace_block_rq_insert(rq); + per_prio = &dd->per_prio[prio]; if (at_head) { - list_add(&rq->queuelist, &dd->dispatch); + list_add(&rq->queuelist, &per_prio->dispatch); } else { - deadline_add_rq_rb(dd, rq); + deadline_add_rq_rb(per_prio, rq); if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -567,7 +636,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * set expire time and add to fifo list */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; - list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); + list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]); } } @@ -618,26 +687,39 @@ static void dd_prepare_request(struct request *rq) static void dd_finish_request(struct request *rq) { struct request_queue *q = rq->q; + struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = dd_rq_ioclass(rq); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; if (blk_queue_is_zoned(q)) { - struct deadline_data *dd = q->elevator->elevator_data; unsigned long flags; spin_lock_irqsave(&dd->zone_lock, flags); blk_req_zone_write_unlock(rq); - if (!list_empty(&dd->fifo_list[DD_WRITE])) + if (!list_empty(&per_prio->fifo_list[DD_WRITE])) blk_mq_sched_mark_restart_hctx(rq->mq_hctx); spin_unlock_irqrestore(&dd->zone_lock, flags); } } +static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) +{ + return !list_empty_careful(&per_prio->dispatch) || + !list_empty_careful(&per_prio->fifo_list[DD_READ]) || + !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); +} + static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; + enum dd_prio prio; + + for (prio = 0; prio <= DD_PRIO_MAX; prio++) + if (dd_has_work_for_prio(&dd->per_prio[prio])) + return true; - return !list_empty_careful(&dd->dispatch) || - !list_empty_careful(&dd->fifo_list[0]) || - !list_empty_careful(&dd->fifo_list[1]); + return false; } /* @@ -704,16 +786,17 @@ static struct elv_fs_entry deadline_attrs[] = { }; #ifdef CONFIG_BLK_DEBUG_FS -#define DEADLINE_DEBUGFS_DDIR_ATTRS(ddir, name) \ +#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \ static void *deadline_##name##_fifo_start(struct seq_file *m, \ loff_t *pos) \ __acquires(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ spin_lock(&dd->lock); \ - return seq_list_start(&dd->fifo_list[ddir], *pos); \ + return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \ } \ \ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ @@ -721,8 +804,9 @@ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ { \ struct 
request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ - return seq_list_next(v, &dd->fifo_list[ddir], pos); \ + return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \ } \ \ static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ @@ -746,14 +830,20 @@ static int deadline_##name##_next_rq_show(void *data, \ { \ struct request_queue *q = data; \ struct deadline_data *dd = q->elevator->elevator_data; \ - struct request *rq = dd->next_rq[ddir]; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + struct request *rq = per_prio->next_rq[data_dir]; \ \ if (rq) \ __blk_mq_debugfs_rq_show(m, rq); \ return 0; \ } -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_READ, read) -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_WRITE, write) + +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2); #undef DEADLINE_DEBUGFS_DDIR_ATTRS static int deadline_batching_show(void *data, struct seq_file *m) @@ -783,50 +873,74 @@ static int dd_async_depth_show(void *data, struct seq_file *m) return 0; } -static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos) - __acquires(&dd->lock) -{ - struct request_queue *q = m->private; - struct deadline_data *dd = q->elevator->elevator_data; - - spin_lock(&dd->lock); - return seq_list_start(&dd->dispatch, *pos); -} - -static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct request_queue *q = m->private; - struct deadline_data *dd = q->elevator->elevator_data; - - return seq_list_next(v, &dd->dispatch, pos); -} - -static void deadline_dispatch_stop(struct seq_file *m, void *v) - __releases(&dd->lock) -{ - struct request_queue *q = m->private; - struct deadline_data *dd = q->elevator->elevator_data; - - spin_unlock(&dd->lock); +#define DEADLINE_DISPATCH_ATTR(prio) \ +static void *deadline_dispatch##prio##_start(struct seq_file *m, \ + loff_t *pos) \ + __acquires(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + spin_lock(&dd->lock); \ + return seq_list_start(&per_prio->dispatch, *pos); \ +} \ + \ +static void *deadline_dispatch##prio##_next(struct seq_file *m, \ + void *v, loff_t *pos) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + return seq_list_next(v, &per_prio->dispatch, pos); \ +} \ + \ +static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \ + __releases(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + \ + spin_unlock(&dd->lock); \ +} \ + \ +static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \ + .start = deadline_dispatch##prio##_start, \ + .next = deadline_dispatch##prio##_next, \ + .stop = deadline_dispatch##prio##_stop, \ + .show = blk_mq_debugfs_rq_show, \ } -static const struct seq_operations deadline_dispatch_seq_ops = { - .start = deadline_dispatch_start, - .next = deadline_dispatch_next, - .stop = deadline_dispatch_stop, - .show = blk_mq_debugfs_rq_show, -}; 
+DEADLINE_DISPATCH_ATTR(0); +DEADLINE_DISPATCH_ATTR(1); +DEADLINE_DISPATCH_ATTR(2); +#undef DEADLINE_DISPATCH_ATTR -#define DEADLINE_QUEUE_DDIR_ATTRS(name) \ - {#name "_fifo_list", 0400, .seq_ops = &deadline_##name##_fifo_seq_ops}, \ +#define DEADLINE_QUEUE_DDIR_ATTRS(name) \ + {#name "_fifo_list", 0400, \ + .seq_ops = &deadline_##name##_fifo_seq_ops} +#define DEADLINE_NEXT_RQ_ATTR(name) \ {#name "_next_rq", 0400, deadline_##name##_next_rq_show} static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { - DEADLINE_QUEUE_DDIR_ATTRS(read), - DEADLINE_QUEUE_DDIR_ATTRS(write), + DEADLINE_QUEUE_DDIR_ATTRS(read0), + DEADLINE_QUEUE_DDIR_ATTRS(write0), + DEADLINE_QUEUE_DDIR_ATTRS(read1), + DEADLINE_QUEUE_DDIR_ATTRS(write1), + DEADLINE_QUEUE_DDIR_ATTRS(read2), + DEADLINE_QUEUE_DDIR_ATTRS(write2), + DEADLINE_NEXT_RQ_ATTR(read0), + DEADLINE_NEXT_RQ_ATTR(write0), + DEADLINE_NEXT_RQ_ATTR(read1), + DEADLINE_NEXT_RQ_ATTR(write1), + DEADLINE_NEXT_RQ_ATTR(read2), + DEADLINE_NEXT_RQ_ATTR(write2), {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, {"async_depth", 0400, dd_async_depth_show}, - {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops}, + {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, + {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, + {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, {}, }; #undef DEADLINE_QUEUE_DDIR_ATTRS @@ -876,6 +990,6 @@ static void __exit deadline_exit(void) module_init(deadline_init); module_exit(deadline_exit); -MODULE_AUTHOR("Jens Axboe"); +MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MQ deadline IO scheduler"); -- cgit v1.2.3 From 38ba64d12d4cf9fa260c45d7398e2a24afaceefa Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:54 -0700 Subject: block/mq-deadline: Track I/O statistics Track I/O statistics per I/O priority and export these statistics to debugfs. These statistics help developers of the deadline scheduler. Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-15-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index aba672a5be1e..04d9d6b3745b 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -51,6 +51,19 @@ enum dd_prio { enum { DD_PRIO_COUNT = 3 }; +/* I/O statistics per I/O priority. */ +struct io_stats_per_prio { + local_t inserted; + local_t merged; + local_t dispatched; + local_t completed; +}; + +/* I/O statistics for all I/O priorities (enum dd_prio). */ +struct io_stats { + struct io_stats_per_prio stats[DD_PRIO_COUNT]; +}; + /* * Deadline scheduler data per I/O priority (enum dd_prio). Requests are * present on both sort_list[] and fifo_list[]. 
@@ -75,6 +88,8 @@ struct deadline_data { unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ + struct io_stats __percpu *stats; + /* * settings that change how the i/o scheduler behaves */ @@ -88,6 +103,33 @@ struct deadline_data { spinlock_t zone_lock; }; +/* Count one event of type 'event_type' and with I/O priority 'prio' */ +#define dd_count(dd, event_type, prio) do { \ + struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \ + \ + BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ + BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ + local_inc(&io_stats->stats[(prio)].event_type); \ + put_cpu_ptr(io_stats); \ +} while (0) + +/* + * Returns the total number of dd_count(dd, event_type, prio) calls across all + * CPUs. No locking or barriers since it is fine if the returned sum is slightly + * outdated. + */ +#define dd_sum(dd, event_type, prio) ({ \ + unsigned int cpu; \ + u32 sum = 0; \ + \ + BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ + BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ + for_each_present_cpu(cpu) \ + sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \ + stats[(prio)].event_type); \ + sum; \ +}) + /* Maps an I/O priority class to a deadline scheduler priority. */ static const enum dd_prio ioprio_class_to_prio[] = { [IOPRIO_CLASS_NONE] = DD_BE_PRIO, @@ -187,9 +229,12 @@ static void dd_request_merged(struct request_queue *q, struct request *req, static void dd_merged_requests(struct request_queue *q, struct request *req, struct request *next) { + struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(next); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + dd_count(dd, merged, prio); + /* * if next expires before rq, assign its expire time to rq * and move into next position (next will be deleted) in fifo @@ -225,6 +270,12 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, deadline_remove_request(rq->q, per_prio, rq); } +/* Number of requests queued for a given priority level. */ +static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) +{ + return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio); +} + /* * deadline_check_fifo returns 0 if there are no expired requests on the fifo, * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) @@ -319,6 +370,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, { struct request *rq, *next_rq; enum dd_data_dir data_dir; + enum dd_prio prio; + u8 ioprio_class; lockdep_assert_held(&dd->lock); @@ -408,6 +461,9 @@ dispatch_request: dd->batching++; deadline_move_request(dd, per_prio, rq); done: + ioprio_class = dd_rq_ioclass(rq); + prio = ioprio_class_to_prio[ioprio_class]; + dd_count(dd, dispatched, prio); /* * If the request needs its target zone locked, do it. 
*/ @@ -491,6 +547,8 @@ static void dd_exit_sched(struct elevator_queue *e) WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); } + free_percpu(dd->stats); + kfree(dd); } @@ -514,6 +572,11 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) eq->elevator_data = dd; + dd->stats = alloc_percpu_gfp(typeof(*dd->stats), + GFP_KERNEL | __GFP_ZERO); + if (!dd->stats) + goto free_dd; + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; @@ -535,6 +598,9 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) q->elevator = eq; return 0; +free_dd: + kfree(dd); + put_eq: kobject_put(&eq->kobj); return ret; @@ -614,6 +680,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_req_zone_write_unlock(rq); prio = ioprio_class_to_prio[ioprio_class]; + dd_count(dd, inserted, prio); if (blk_mq_sched_try_insert_merge(q, rq)) return; @@ -692,6 +759,8 @@ static void dd_finish_request(struct request *rq) const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; + dd_count(dd, completed, prio); + if (blk_queue_is_zoned(q)) { unsigned long flags; @@ -873,6 +942,35 @@ static int dd_async_depth_show(void *data, struct seq_file *m) return 0; } +static int dd_queued_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO), + dd_queued(dd, DD_BE_PRIO), + dd_queued(dd, DD_IDLE_PRIO)); + return 0; +} + +/* Number of requests owned by the block driver for a given priority. */ +static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) +{ + return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio) + - dd_sum(dd, completed, prio); +} + +static int dd_owned_by_driver_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO), + dd_owned_by_driver(dd, DD_BE_PRIO), + dd_owned_by_driver(dd, DD_IDLE_PRIO)); + return 0; +} + #define DEADLINE_DISPATCH_ATTR(prio) \ static void *deadline_dispatch##prio##_start(struct seq_file *m, \ loff_t *pos) \ @@ -941,6 +1039,8 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, + {"owned_by_driver", 0400, dd_owned_by_driver_show}, + {"queued", 0400, dd_queued_show}, {}, }; #undef DEADLINE_QUEUE_DDIR_ATTRS -- cgit v1.2.3 From 08a9ad8bf607388d768a341957d53eae64250c2d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:55 -0700 Subject: block/mq-deadline: Add cgroup support Maintain statistics per cgroup and export these to user space. These statistics are essential for verifying whether the proper I/O priorities have been assigned to requests. 
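For reference, a minimal user-space sketch (not part of this patch) of how the exported counters could be consumed: it reads /sys/fs/cgroup/io.stat and prints the per-priority deadline fields of each device line. The mount point and the "[CLASS] dispatched=... inserted=... merged=..." layout are assumptions taken from the example output quoted right after this sketch; the program itself is purely illustrative.

/* Illustrative only: dump the mq-deadline per-priority counters per device. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Assumed cgroup2 location; point this at the cgroup of interest. */
	FILE *f = fopen("/sys/fs/cgroup/io.stat", "r");
	char line[1024];

	if (!f) {
		perror("io.stat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* The deadline counters start at the first "[CLASS]" tag, if present. */
		char *p = strchr(line, '[');

		if (!p)
			continue;
		/* Print the "maj:min" device token followed by the per-priority counters. */
		printf("%.*s -> %s", (int)strcspn(line, " "), line, p);
	}
	fclose(f);
	return 0;
}
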
An example of the statistics data with this patch applied: $ cat /sys/fs/cgroup/io.stat 11:2 rbytes=0 wbytes=0 rios=3 wios=0 dbytes=0 dios=0 [NONE] dispatched=0 inserted=0 merged=171 [RT] dispatched=0 inserted=0 merged=0 [BE] dispatched=0 inserted=0 merged=0 [IDLE] dispatched=0 inserted=0 merged=0 8:32 rbytes=2142720 wbytes=0 rios=105 wios=0 dbytes=0 dios=0 [NONE] dispatched=0 inserted=0 merged=171 [RT] dispatched=0 inserted=0 merged=0 [BE] dispatched=0 inserted=0 merged=0 [IDLE] dispatched=0 inserted=0 merged=0 Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-16-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/Kconfig.iosched | 6 + block/Makefile | 2 + block/mq-deadline-cgroup.c | 126 +++++ block/mq-deadline-cgroup.h | 114 +++++ block/mq-deadline-main.c | 1141 ++++++++++++++++++++++++++++++++++++++++++++ block/mq-deadline.c | 1095 ------------------------------------------ 6 files changed, 1389 insertions(+), 1095 deletions(-) create mode 100644 block/mq-deadline-cgroup.c create mode 100644 block/mq-deadline-cgroup.h create mode 100644 block/mq-deadline-main.c delete mode 100644 block/mq-deadline.c (limited to 'block') diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 2f2158e05a91..64053d67a97b 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -9,6 +9,12 @@ config MQ_IOSCHED_DEADLINE help MQ version of the deadline IO scheduler. +config MQ_IOSCHED_DEADLINE_CGROUP + tristate + default y + depends on MQ_IOSCHED_DEADLINE + depends on BLK_CGROUP + config MQ_IOSCHED_KYBER tristate "Kyber I/O scheduler" default y diff --git a/block/Makefile b/block/Makefile index af3d044abaf1..b9db5d4edfc8 100644 --- a/block/Makefile +++ b/block/Makefile @@ -21,6 +21,8 @@ obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o +mq-deadline-y += mq-deadline-main.o +mq-deadline-$(CONFIG_MQ_IOSCHED_DEADLINE_CGROUP)+= mq-deadline-cgroup.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o diff --git a/block/mq-deadline-cgroup.c b/block/mq-deadline-cgroup.c new file mode 100644 index 000000000000..3b4bfddec39f --- /dev/null +++ b/block/mq-deadline-cgroup.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include "mq-deadline-cgroup.h" + +static struct blkcg_policy dd_blkcg_policy; + +static struct blkcg_policy_data *dd_cpd_alloc(gfp_t gfp) +{ + struct dd_blkcg *pd; + + pd = kzalloc(sizeof(*pd), gfp); + if (!pd) + return NULL; + pd->stats = alloc_percpu_gfp(typeof(*pd->stats), + GFP_KERNEL | __GFP_ZERO); + if (!pd->stats) { + kfree(pd); + return NULL; + } + return &pd->cpd; +} + +static void dd_cpd_free(struct blkcg_policy_data *cpd) +{ + struct dd_blkcg *dd_blkcg = container_of(cpd, typeof(*dd_blkcg), cpd); + + free_percpu(dd_blkcg->stats); + kfree(dd_blkcg); +} + +static struct dd_blkcg *dd_blkcg_from_pd(struct blkg_policy_data *pd) +{ + return container_of(blkcg_to_cpd(pd->blkg->blkcg, &dd_blkcg_policy), + struct dd_blkcg, cpd); +} + +/* + * Convert an association between a block cgroup and a request queue into a + * pointer to the mq-deadline information associated with a (blkcg, queue) pair. 
+ */ +struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio) +{ + struct blkg_policy_data *pd; + + pd = blkg_to_pd(bio->bi_blkg, &dd_blkcg_policy); + if (!pd) + return NULL; + + return dd_blkcg_from_pd(pd); +} + +static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +{ + static const char *const prio_class_name[] = { + [IOPRIO_CLASS_NONE] = "NONE", + [IOPRIO_CLASS_RT] = "RT", + [IOPRIO_CLASS_BE] = "BE", + [IOPRIO_CLASS_IDLE] = "IDLE", + }; + struct dd_blkcg *blkcg = dd_blkcg_from_pd(pd); + int res = 0; + u8 prio; + + for (prio = 0; prio < ARRAY_SIZE(blkcg->stats->stats); prio++) + res += scnprintf(buf + res, size - res, + " [%s] dispatched=%u inserted=%u merged=%u", + prio_class_name[prio], + ddcg_sum(blkcg, dispatched, prio) + + ddcg_sum(blkcg, merged, prio) - + ddcg_sum(blkcg, completed, prio), + ddcg_sum(blkcg, inserted, prio) - + ddcg_sum(blkcg, completed, prio), + ddcg_sum(blkcg, merged, prio)); + + return res; +} + +static struct blkg_policy_data *dd_pd_alloc(gfp_t gfp, struct request_queue *q, + struct blkcg *blkcg) +{ + struct dd_blkg *pd; + + pd = kzalloc(sizeof(*pd), gfp); + if (!pd) + return NULL; + return &pd->pd; +} + +static void dd_pd_free(struct blkg_policy_data *pd) +{ + struct dd_blkg *dd_blkg = container_of(pd, typeof(*dd_blkg), pd); + + kfree(dd_blkg); +} + +static struct blkcg_policy dd_blkcg_policy = { + .cpd_alloc_fn = dd_cpd_alloc, + .cpd_free_fn = dd_cpd_free, + + .pd_alloc_fn = dd_pd_alloc, + .pd_free_fn = dd_pd_free, + .pd_stat_fn = dd_pd_stat, +}; + +int dd_activate_policy(struct request_queue *q) +{ + return blkcg_activate_policy(q, &dd_blkcg_policy); +} + +void dd_deactivate_policy(struct request_queue *q) +{ + blkcg_deactivate_policy(q, &dd_blkcg_policy); +} + +int __init dd_blkcg_init(void) +{ + return blkcg_policy_register(&dd_blkcg_policy); +} + +void __exit dd_blkcg_exit(void) +{ + blkcg_policy_unregister(&dd_blkcg_policy); +} diff --git a/block/mq-deadline-cgroup.h b/block/mq-deadline-cgroup.h new file mode 100644 index 000000000000..0143fd74f3ce --- /dev/null +++ b/block/mq-deadline-cgroup.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#if !defined(_MQ_DEADLINE_CGROUP_H_) +#define _MQ_DEADLINE_CGROUP_H_ + +#include + +struct request_queue; + +/** + * struct io_stats_per_prio - I/O statistics per I/O priority class. + * @inserted: Number of inserted requests. + * @merged: Number of merged requests. + * @dispatched: Number of dispatched requests. + * @completed: Number of I/O completions. + */ +struct io_stats_per_prio { + local_t inserted; + local_t merged; + local_t dispatched; + local_t completed; +}; + +/* I/O statistics per I/O cgroup per I/O priority class (IOPRIO_CLASS_*). */ +struct blkcg_io_stats { + struct io_stats_per_prio stats[4]; +}; + +/** + * struct dd_blkcg - Per cgroup data. + * @cpd: blkcg_policy_data structure. + * @stats: I/O statistics. + */ +struct dd_blkcg { + struct blkcg_policy_data cpd; /* must be the first member */ + struct blkcg_io_stats __percpu *stats; +}; + +/* + * Count one event of type 'event_type' and with I/O priority class + * 'prio_class'. 
+ */ +#define ddcg_count(ddcg, event_type, prio_class) do { \ +if (ddcg) { \ + struct blkcg_io_stats *io_stats = get_cpu_ptr((ddcg)->stats); \ + \ + BUILD_BUG_ON(!__same_type((ddcg), struct dd_blkcg *)); \ + BUILD_BUG_ON(!__same_type((prio_class), u8)); \ + local_inc(&io_stats->stats[(prio_class)].event_type); \ + put_cpu_ptr(io_stats); \ +} \ +} while (0) + +/* + * Returns the total number of ddcg_count(ddcg, event_type, prio_class) calls + * across all CPUs. No locking or barriers since it is fine if the returned + * sum is slightly outdated. + */ +#define ddcg_sum(ddcg, event_type, prio) ({ \ + unsigned int cpu; \ + u32 sum = 0; \ + \ + BUILD_BUG_ON(!__same_type((ddcg), struct dd_blkcg *)); \ + BUILD_BUG_ON(!__same_type((prio), u8)); \ + for_each_present_cpu(cpu) \ + sum += local_read(&per_cpu_ptr((ddcg)->stats, cpu)-> \ + stats[(prio)].event_type); \ + sum; \ +}) + +#ifdef CONFIG_BLK_CGROUP + +/** + * struct dd_blkg - Per (cgroup, request queue) data. + * @pd: blkg_policy_data structure. + */ +struct dd_blkg { + struct blkg_policy_data pd; /* must be the first member */ +}; + +struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio); +int dd_activate_policy(struct request_queue *q); +void dd_deactivate_policy(struct request_queue *q); +int __init dd_blkcg_init(void); +void __exit dd_blkcg_exit(void); + +#else /* CONFIG_BLK_CGROUP */ + +static inline struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio) +{ + return NULL; +} + +static inline int dd_activate_policy(struct request_queue *q) +{ + return 0; +} + +static inline void dd_deactivate_policy(struct request_queue *q) +{ +} + +static inline int dd_blkcg_init(void) +{ + return 0; +} + +static inline void dd_blkcg_exit(void) +{ +} + +#endif /* CONFIG_BLK_CGROUP */ + +#endif /* _MQ_DEADLINE_CGROUP_H_ */ diff --git a/block/mq-deadline-main.c b/block/mq-deadline-main.c new file mode 100644 index 000000000000..58a401ea8f56 --- /dev/null +++ b/block/mq-deadline-main.c @@ -0,0 +1,1141 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, + * for the blk-mq scheduling framework + * + * Copyright (C) 2016 Jens Axboe + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-debugfs.h" +#include "blk-mq-tag.h" +#include "blk-mq-sched.h" +#include "mq-deadline-cgroup.h" + +/* + * See Documentation/block/deadline-iosched.rst + */ +static const int read_expire = HZ / 2; /* max time before a read is submitted. */ +static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ +static const int writes_starved = 2; /* max times reads can starve a write */ +static const int fifo_batch = 16; /* # of sequential requests treated as one + by the above parameters. For throughput. */ + +enum dd_data_dir { + DD_READ = READ, + DD_WRITE = WRITE, +}; + +enum { DD_DIR_COUNT = 2 }; + +enum dd_prio { + DD_RT_PRIO = 0, + DD_BE_PRIO = 1, + DD_IDLE_PRIO = 2, + DD_PRIO_MAX = 2, +}; + +enum { DD_PRIO_COUNT = 3 }; + +/* I/O statistics for all I/O priorities (enum dd_prio). */ +struct io_stats { + struct io_stats_per_prio stats[DD_PRIO_COUNT]; +}; + +/* + * Deadline scheduler data per I/O priority (enum dd_prio). Requests are + * present on both sort_list[] and fifo_list[]. + */ +struct dd_per_prio { + struct list_head dispatch; + struct rb_root sort_list[DD_DIR_COUNT]; + struct list_head fifo_list[DD_DIR_COUNT]; + /* Next request in FIFO order. 
Read, write or both are NULL. */ + struct request *next_rq[DD_DIR_COUNT]; +}; + +struct deadline_data { + /* + * run time data + */ + + /* Request queue that owns this data structure. */ + struct request_queue *queue; + + struct dd_per_prio per_prio[DD_PRIO_COUNT]; + + /* Data direction of latest dispatched request. */ + enum dd_data_dir last_dir; + unsigned int batching; /* number of sequential requests made */ + unsigned int starved; /* times reads have starved writes */ + + struct io_stats __percpu *stats; + + /* + * settings that change how the i/o scheduler behaves + */ + int fifo_expire[DD_DIR_COUNT]; + int fifo_batch; + int writes_starved; + int front_merges; + u32 async_depth; + + spinlock_t lock; + spinlock_t zone_lock; +}; + +/* Count one event of type 'event_type' and with I/O priority 'prio' */ +#define dd_count(dd, event_type, prio) do { \ + struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \ + \ + BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ + BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ + local_inc(&io_stats->stats[(prio)].event_type); \ + put_cpu_ptr(io_stats); \ +} while (0) + +/* + * Returns the total number of dd_count(dd, event_type, prio) calls across all + * CPUs. No locking or barriers since it is fine if the returned sum is slightly + * outdated. + */ +#define dd_sum(dd, event_type, prio) ({ \ + unsigned int cpu; \ + u32 sum = 0; \ + \ + BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ + BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ + for_each_present_cpu(cpu) \ + sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \ + stats[(prio)].event_type); \ + sum; \ +}) + +/* Maps an I/O priority class to a deadline scheduler priority. */ +static const enum dd_prio ioprio_class_to_prio[] = { + [IOPRIO_CLASS_NONE] = DD_BE_PRIO, + [IOPRIO_CLASS_RT] = DD_RT_PRIO, + [IOPRIO_CLASS_BE] = DD_BE_PRIO, + [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, +}; + +static inline struct rb_root * +deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) +{ + return &per_prio->sort_list[rq_data_dir(rq)]; +} + +/* + * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a + * request. + */ +static u8 dd_rq_ioclass(struct request *rq) +{ + return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); +} + +/* + * get the request after `rq' in sector-sorted order + */ +static inline struct request * +deadline_latter_request(struct request *rq) +{ + struct rb_node *node = rb_next(&rq->rb_node); + + if (node) + return rb_entry_rq(node); + + return NULL; +} + +static void +deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) +{ + struct rb_root *root = deadline_rb_root(per_prio, rq); + + elv_rb_add(root, rq); +} + +static inline void +deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) +{ + const enum dd_data_dir data_dir = rq_data_dir(rq); + + if (per_prio->next_rq[data_dir] == rq) + per_prio->next_rq[data_dir] = deadline_latter_request(rq); + + elv_rb_del(deadline_rb_root(per_prio, rq), rq); +} + +/* + * remove rq from rbtree and fifo. 
+ */ +static void deadline_remove_request(struct request_queue *q, + struct dd_per_prio *per_prio, + struct request *rq) +{ + list_del_init(&rq->queuelist); + + /* + * We might not be on the rbtree, if we are doing an insert merge + */ + if (!RB_EMPTY_NODE(&rq->rb_node)) + deadline_del_rq_rb(per_prio, rq); + + elv_rqhash_del(q, rq); + if (q->last_merge == rq) + q->last_merge = NULL; +} + +static void dd_request_merged(struct request_queue *q, struct request *req, + enum elv_merge type) +{ + struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = dd_rq_ioclass(req); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + /* + * if the merge was a front merge, we need to reposition request + */ + if (type == ELEVATOR_FRONT_MERGE) { + elv_rb_del(deadline_rb_root(per_prio, req), req); + deadline_add_rq_rb(per_prio, req); + } +} + +/* + * Callback function that is invoked after @next has been merged into @req. + */ +static void dd_merged_requests(struct request_queue *q, struct request *req, + struct request *next) +{ + struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = dd_rq_ioclass(next); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_blkcg *blkcg = next->elv.priv[0]; + + dd_count(dd, merged, prio); + ddcg_count(blkcg, merged, ioprio_class); + + /* + * if next expires before rq, assign its expire time to rq + * and move into next position (next will be deleted) in fifo + */ + if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { + if (time_before((unsigned long)next->fifo_time, + (unsigned long)req->fifo_time)) { + list_move(&req->queuelist, &next->queuelist); + req->fifo_time = next->fifo_time; + } + } + + /* + * kill knowledge of next, this one is a goner + */ + deadline_remove_request(q, &dd->per_prio[prio], next); +} + +/* + * move an entry to dispatch queue + */ +static void +deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + struct request *rq) +{ + const enum dd_data_dir data_dir = rq_data_dir(rq); + + per_prio->next_rq[data_dir] = deadline_latter_request(rq); + + /* + * take it off the sort and fifo list + */ + deadline_remove_request(rq->q, per_prio, rq); +} + +/* Number of requests queued for a given priority level. */ +static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) +{ + return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio); +} + +/* + * deadline_check_fifo returns 0 if there are no expired requests on the fifo, + * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) + */ +static inline int deadline_check_fifo(struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) +{ + struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); + + /* + * rq is expired! + */ + if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) + return 1; + + return 0; +} + +/* + * For the specified data direction, return the next request to + * dispatch using arrival ordered lists. + */ +static struct request * +deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) +{ + struct request *rq; + unsigned long flags; + + if (list_empty(&per_prio->fifo_list[data_dir])) + return NULL; + + rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); + if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. 
+ */ + spin_lock_irqsave(&dd->zone_lock, flags); + list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) { + if (blk_req_can_dispatch_to_zone(rq)) + goto out; + } + rq = NULL; +out: + spin_unlock_irqrestore(&dd->zone_lock, flags); + + return rq; +} + +/* + * For the specified data direction, return the next request to + * dispatch using sector position sorted lists. + */ +static struct request * +deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) +{ + struct request *rq; + unsigned long flags; + + rq = per_prio->next_rq[data_dir]; + if (!rq) + return NULL; + + if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + spin_lock_irqsave(&dd->zone_lock, flags); + while (rq) { + if (blk_req_can_dispatch_to_zone(rq)) + break; + rq = deadline_latter_request(rq); + } + spin_unlock_irqrestore(&dd->zone_lock, flags); + + return rq; +} + +/* + * deadline_dispatch_requests selects the best request according to + * read/write expire, fifo_batch, etc + */ +static struct request *__dd_dispatch_request(struct deadline_data *dd, + struct dd_per_prio *per_prio) +{ + struct request *rq, *next_rq; + enum dd_data_dir data_dir; + struct dd_blkcg *blkcg; + enum dd_prio prio; + u8 ioprio_class; + + lockdep_assert_held(&dd->lock); + + if (!list_empty(&per_prio->dispatch)) { + rq = list_first_entry(&per_prio->dispatch, struct request, + queuelist); + list_del_init(&rq->queuelist); + goto done; + } + + /* + * batches are currently reads XOR writes + */ + rq = deadline_next_request(dd, per_prio, dd->last_dir); + if (rq && dd->batching < dd->fifo_batch) + /* we have a next request are still entitled to batch */ + goto dispatch_request; + + /* + * at this point we are not running a batch. select the appropriate + * data direction (read / write) + */ + + if (!list_empty(&per_prio->fifo_list[DD_READ])) { + BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ])); + + if (deadline_fifo_request(dd, per_prio, DD_WRITE) && + (dd->starved++ >= dd->writes_starved)) + goto dispatch_writes; + + data_dir = DD_READ; + + goto dispatch_find_request; + } + + /* + * there are either no reads or writes have been starved + */ + + if (!list_empty(&per_prio->fifo_list[DD_WRITE])) { +dispatch_writes: + BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE])); + + dd->starved = 0; + + data_dir = DD_WRITE; + + goto dispatch_find_request; + } + + return NULL; + +dispatch_find_request: + /* + * we are not running a batch, find best request for selected data_dir + */ + next_rq = deadline_next_request(dd, per_prio, data_dir); + if (deadline_check_fifo(per_prio, data_dir) || !next_rq) { + /* + * A deadline has expired, the last request was in the other + * direction, or we have run out of higher-sectored requests. + * Start again from the request with the earliest expiry time. + */ + rq = deadline_fifo_request(dd, per_prio, data_dir); + } else { + /* + * The last req was the same dir and we have a next request in + * sort order. No expired requests so continue on from here. + */ + rq = next_rq; + } + + /* + * For a zoned block device, if we only have writes queued and none of + * them can be dispatched, rq will be NULL. + */ + if (!rq) + return NULL; + + dd->last_dir = data_dir; + dd->batching = 0; + +dispatch_request: + /* + * rq is the selected appropriate request. 
+ */ + dd->batching++; + deadline_move_request(dd, per_prio, rq); +done: + ioprio_class = dd_rq_ioclass(rq); + prio = ioprio_class_to_prio[ioprio_class]; + dd_count(dd, dispatched, prio); + blkcg = rq->elv.priv[0]; + ddcg_count(blkcg, dispatched, ioprio_class); + /* + * If the request needs its target zone locked, do it. + */ + blk_req_zone_write_lock(rq); + rq->rq_flags |= RQF_STARTED; + return rq; +} + +/* + * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). + * + * One confusing aspect here is that we get called for a specific + * hardware queue, but we may return a request that is for a + * different hardware queue. This is because mq-deadline has shared + * state for all hardware queues, in terms of sorting, FIFOs, etc. + */ +static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + struct request *rq; + enum dd_prio prio; + + spin_lock(&dd->lock); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); + if (rq) + break; + } + spin_unlock(&dd->lock); + + return rq; +} + +/* + * Called by __blk_mq_alloc_request(). The shallow_depth value set by this + * function is used by __blk_mq_get_tag(). + */ +static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +{ + struct deadline_data *dd = data->q->elevator->elevator_data; + + /* Do not throttle synchronous reads. */ + if (op_is_sync(op) && !op_is_write(op)) + return; + + /* + * Throttle asynchronous requests and writes such that these requests + * do not block the allocation of synchronous requests. + */ + data->shallow_depth = dd->async_depth; +} + +/* Called by blk_mq_update_nr_requests(). */ +static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + struct blk_mq_tags *tags = hctx->sched_tags; + + dd->async_depth = max(1UL, 3 * q->nr_requests / 4); + + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth); +} + +/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ +static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + dd_depth_updated(hctx); + return 0; +} + +static void dd_exit_sched(struct elevator_queue *e) +{ + struct deadline_data *dd = e->elevator_data; + enum dd_prio prio; + + dd_deactivate_policy(dd->queue); + + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); + WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); + } + + free_percpu(dd->stats); + + kfree(dd); +} + +/* + * Initialize elevator private data (deadline_data) and associate with blkcg. + */ +static int dd_init_sched(struct request_queue *q, struct elevator_type *e) +{ + struct deadline_data *dd; + struct elevator_queue *eq; + enum dd_prio prio; + int ret = -ENOMEM; + + /* + * Initialization would be very tricky if the queue is not frozen, + * hence the warning statement below. 
+ */ + WARN_ON_ONCE(!percpu_ref_is_zero(&q->q_usage_counter)); + + eq = elevator_alloc(q, e); + if (!eq) + return ret; + + dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); + if (!dd) + goto put_eq; + + eq->elevator_data = dd; + + dd->stats = alloc_percpu_gfp(typeof(*dd->stats), + GFP_KERNEL | __GFP_ZERO); + if (!dd->stats) + goto free_dd; + + dd->queue = q; + + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + INIT_LIST_HEAD(&per_prio->dispatch); + INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); + INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); + per_prio->sort_list[DD_READ] = RB_ROOT; + per_prio->sort_list[DD_WRITE] = RB_ROOT; + } + dd->fifo_expire[DD_READ] = read_expire; + dd->fifo_expire[DD_WRITE] = write_expire; + dd->writes_starved = writes_starved; + dd->front_merges = 1; + dd->last_dir = DD_WRITE; + dd->fifo_batch = fifo_batch; + spin_lock_init(&dd->lock); + spin_lock_init(&dd->zone_lock); + + ret = dd_activate_policy(q); + if (ret) + goto free_stats; + + ret = 0; + q->elevator = eq; + return 0; + +free_stats: + free_percpu(dd->stats); + +free_dd: + kfree(dd); + +put_eq: + kobject_put(&eq->kobj); + return ret; +} + +/* + * Try to merge @bio into an existing request. If @bio has been merged into + * an existing request, store the pointer to that request into *@rq. + */ +static int dd_request_merge(struct request_queue *q, struct request **rq, + struct bio *bio) +{ + struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + sector_t sector = bio_end_sector(bio); + struct request *__rq; + + if (!dd->front_merges) + return ELEVATOR_NO_MERGE; + + __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector); + if (__rq) { + BUG_ON(sector != blk_rq_pos(__rq)); + + if (elv_bio_merge_ok(__rq, bio)) { + *rq = __rq; + return ELEVATOR_FRONT_MERGE; + } + } + + return ELEVATOR_NO_MERGE; +} + +/* + * Attempt to merge a bio into an existing request. This function is called + * before @bio is associated with a request. + */ +static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) +{ + struct deadline_data *dd = q->elevator->elevator_data; + struct request *free = NULL; + bool ret; + + spin_lock(&dd->lock); + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock(&dd->lock); + + if (free) + blk_mq_free_request(free); + + return ret; +} + +/* + * add rq to rbtree and fifo + */ +static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + const enum dd_data_dir data_dir = rq_data_dir(rq); + u16 ioprio = req_get_ioprio(rq); + u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); + struct dd_per_prio *per_prio; + enum dd_prio prio; + struct dd_blkcg *blkcg; + + lockdep_assert_held(&dd->lock); + + /* + * This may be a requeue of a write request that has locked its + * target zone. If it is the case, this releases the zone lock. + */ + blk_req_zone_write_unlock(rq); + + /* + * If a block cgroup has been associated with the submitter and if an + * I/O priority has been set in the associated block cgroup, use the + * lowest of the cgroup priority and the request priority for the + * request. If no priority has been set in the request, use the cgroup + * priority. 
+ */ + prio = ioprio_class_to_prio[ioprio_class]; + dd_count(dd, inserted, prio); + blkcg = dd_blkcg_from_bio(rq->bio); + ddcg_count(blkcg, inserted, ioprio_class); + WARN_ON_ONCE(rq->elv.priv[0]); + rq->elv.priv[0] = blkcg; + + if (blk_mq_sched_try_insert_merge(q, rq)) + return; + + trace_block_rq_insert(rq); + + per_prio = &dd->per_prio[prio]; + if (at_head) { + list_add(&rq->queuelist, &per_prio->dispatch); + } else { + deadline_add_rq_rb(per_prio, rq); + + if (rq_mergeable(rq)) { + elv_rqhash_add(q, rq); + if (!q->last_merge) + q->last_merge = rq; + } + + /* + * set expire time and add to fifo list + */ + rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; + list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]); + } +} + +/* + * Called from blk_mq_sched_insert_request() or blk_mq_sched_insert_requests(). + */ +static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, + struct list_head *list, bool at_head) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + + spin_lock(&dd->lock); + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + dd_insert_request(hctx, rq, at_head); + } + spin_unlock(&dd->lock); +} + +/* Callback from inside blk_mq_rq_ctx_init(). */ +static void dd_prepare_request(struct request *rq) +{ + rq->elv.priv[0] = NULL; +} + +/* + * Callback from inside blk_mq_free_request(). + * + * For zoned block devices, write unlock the target zone of + * completed write requests. Do this while holding the zone lock + * spinlock so that the zone is never unlocked while deadline_fifo_request() + * or deadline_next_request() are executing. This function is called for + * all requests, whether or not these requests complete successfully. + * + * For a zoned block device, __dd_dispatch_request() may have stopped + * dispatching requests if all the queued requests are write requests directed + * at zones that are already locked due to on-going write requests. To ensure + * write request dispatch progress in this case, mark the queue as needing a + * restart to ensure that the queue is run again after completion of the + * request and zones being unlocked. 
+ */ +static void dd_finish_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct deadline_data *dd = q->elevator->elevator_data; + struct dd_blkcg *blkcg = rq->elv.priv[0]; + const u8 ioprio_class = dd_rq_ioclass(rq); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + dd_count(dd, completed, prio); + ddcg_count(blkcg, completed, ioprio_class); + + if (blk_queue_is_zoned(q)) { + unsigned long flags; + + spin_lock_irqsave(&dd->zone_lock, flags); + blk_req_zone_write_unlock(rq); + if (!list_empty(&per_prio->fifo_list[DD_WRITE])) + blk_mq_sched_mark_restart_hctx(rq->mq_hctx); + spin_unlock_irqrestore(&dd->zone_lock, flags); + } +} + +static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) +{ + return !list_empty_careful(&per_prio->dispatch) || + !list_empty_careful(&per_prio->fifo_list[DD_READ]) || + !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); +} + +static bool dd_has_work(struct blk_mq_hw_ctx *hctx) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + enum dd_prio prio; + + for (prio = 0; prio <= DD_PRIO_MAX; prio++) + if (dd_has_work_for_prio(&dd->per_prio[prio])) + return true; + + return false; +} + +/* + * sysfs parts below + */ +#define SHOW_INT(__FUNC, __VAR) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct deadline_data *dd = e->elevator_data; \ + \ + return sysfs_emit(page, "%d\n", __VAR); \ +} +#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) +SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); +SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); +SHOW_INT(deadline_writes_starved_show, dd->writes_starved); +SHOW_INT(deadline_front_merges_show, dd->front_merges); +SHOW_INT(deadline_async_depth_show, dd->front_merges); +SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); +#undef SHOW_INT +#undef SHOW_JIFFIES + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct deadline_data *dd = e->elevator_data; \ + int __data, __ret; \ + \ + __ret = kstrtoint(page, 0, &__data); \ + if (__ret < 0) \ + return __ret; \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + *(__PTR) = __CONV(__data); \ + return count; \ +} +#define STORE_INT(__FUNC, __PTR, MIN, MAX) \ + STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, ) +#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \ + STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) +STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); +STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); +STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); +STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); +STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX); +STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); +#undef STORE_FUNCTION +#undef STORE_INT +#undef STORE_JIFFIES + +#define DD_ATTR(name) \ + __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) + +static struct elv_fs_entry deadline_attrs[] = { + DD_ATTR(read_expire), + DD_ATTR(write_expire), + DD_ATTR(writes_starved), + DD_ATTR(front_merges), + DD_ATTR(async_depth), + DD_ATTR(fifo_batch), + __ATTR_NULL +}; + +#ifdef CONFIG_BLK_DEBUG_FS +#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \ +static void 
*deadline_##name##_fifo_start(struct seq_file *m, \ + loff_t *pos) \ + __acquires(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + spin_lock(&dd->lock); \ + return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \ +} \ + \ +static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ + loff_t *pos) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \ +} \ + \ +static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ + __releases(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + \ + spin_unlock(&dd->lock); \ +} \ + \ +static const struct seq_operations deadline_##name##_fifo_seq_ops = { \ + .start = deadline_##name##_fifo_start, \ + .next = deadline_##name##_fifo_next, \ + .stop = deadline_##name##_fifo_stop, \ + .show = blk_mq_debugfs_rq_show, \ +}; \ + \ +static int deadline_##name##_next_rq_show(void *data, \ + struct seq_file *m) \ +{ \ + struct request_queue *q = data; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + struct request *rq = per_prio->next_rq[data_dir]; \ + \ + if (rq) \ + __blk_mq_debugfs_rq_show(m, rq); \ + return 0; \ +} + +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2); +#undef DEADLINE_DEBUGFS_DDIR_ATTRS + +static int deadline_batching_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u\n", dd->batching); + return 0; +} + +static int deadline_starved_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u\n", dd->starved); + return 0; +} + +static int dd_async_depth_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u\n", dd->async_depth); + return 0; +} + +static int dd_queued_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO), + dd_queued(dd, DD_BE_PRIO), + dd_queued(dd, DD_IDLE_PRIO)); + return 0; +} + +/* Number of requests owned by the block driver for a given priority. 
*/ +static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) +{ + return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio) + - dd_sum(dd, completed, prio); +} + +static int dd_owned_by_driver_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO), + dd_owned_by_driver(dd, DD_BE_PRIO), + dd_owned_by_driver(dd, DD_IDLE_PRIO)); + return 0; +} + +#define DEADLINE_DISPATCH_ATTR(prio) \ +static void *deadline_dispatch##prio##_start(struct seq_file *m, \ + loff_t *pos) \ + __acquires(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + spin_lock(&dd->lock); \ + return seq_list_start(&per_prio->dispatch, *pos); \ +} \ + \ +static void *deadline_dispatch##prio##_next(struct seq_file *m, \ + void *v, loff_t *pos) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + return seq_list_next(v, &per_prio->dispatch, pos); \ +} \ + \ +static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \ + __releases(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + \ + spin_unlock(&dd->lock); \ +} \ + \ +static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \ + .start = deadline_dispatch##prio##_start, \ + .next = deadline_dispatch##prio##_next, \ + .stop = deadline_dispatch##prio##_stop, \ + .show = blk_mq_debugfs_rq_show, \ +} + +DEADLINE_DISPATCH_ATTR(0); +DEADLINE_DISPATCH_ATTR(1); +DEADLINE_DISPATCH_ATTR(2); +#undef DEADLINE_DISPATCH_ATTR + +#define DEADLINE_QUEUE_DDIR_ATTRS(name) \ + {#name "_fifo_list", 0400, \ + .seq_ops = &deadline_##name##_fifo_seq_ops} +#define DEADLINE_NEXT_RQ_ATTR(name) \ + {#name "_next_rq", 0400, deadline_##name##_next_rq_show} +static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { + DEADLINE_QUEUE_DDIR_ATTRS(read0), + DEADLINE_QUEUE_DDIR_ATTRS(write0), + DEADLINE_QUEUE_DDIR_ATTRS(read1), + DEADLINE_QUEUE_DDIR_ATTRS(write1), + DEADLINE_QUEUE_DDIR_ATTRS(read2), + DEADLINE_QUEUE_DDIR_ATTRS(write2), + DEADLINE_NEXT_RQ_ATTR(read0), + DEADLINE_NEXT_RQ_ATTR(write0), + DEADLINE_NEXT_RQ_ATTR(read1), + DEADLINE_NEXT_RQ_ATTR(write1), + DEADLINE_NEXT_RQ_ATTR(read2), + DEADLINE_NEXT_RQ_ATTR(write2), + {"batching", 0400, deadline_batching_show}, + {"starved", 0400, deadline_starved_show}, + {"async_depth", 0400, dd_async_depth_show}, + {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, + {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, + {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, + {"owned_by_driver", 0400, dd_owned_by_driver_show}, + {"queued", 0400, dd_queued_show}, + {}, +}; +#undef DEADLINE_QUEUE_DDIR_ATTRS +#endif + +static struct elevator_type mq_deadline = { + .ops = { + .depth_updated = dd_depth_updated, + .limit_depth = dd_limit_depth, + .insert_requests = dd_insert_requests, + .dispatch_request = dd_dispatch_request, + .prepare_request = dd_prepare_request, + .finish_request = dd_finish_request, + .next_request = elv_rb_latter_request, + .former_request = elv_rb_former_request, + .bio_merge = dd_bio_merge, + .request_merge = dd_request_merge, + .requests_merged = dd_merged_requests, + .request_merged = dd_request_merged, + 
.has_work = dd_has_work, + .init_sched = dd_init_sched, + .exit_sched = dd_exit_sched, + .init_hctx = dd_init_hctx, + }, + +#ifdef CONFIG_BLK_DEBUG_FS + .queue_debugfs_attrs = deadline_queue_debugfs_attrs, +#endif + .elevator_attrs = deadline_attrs, + .elevator_name = "mq-deadline", + .elevator_alias = "deadline", + .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, + .elevator_owner = THIS_MODULE, +}; +MODULE_ALIAS("mq-deadline-iosched"); + +static int __init deadline_init(void) +{ + int ret; + + ret = elv_register(&mq_deadline); + if (ret) + goto out; + ret = dd_blkcg_init(); + if (ret) + goto unreg; + +out: + return ret; + +unreg: + elv_unregister(&mq_deadline); + goto out; +} + +static void __exit deadline_exit(void) +{ + dd_blkcg_exit(); + elv_unregister(&mq_deadline); +} + +module_init(deadline_init); +module_exit(deadline_exit); + +MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MQ deadline IO scheduler"); diff --git a/block/mq-deadline.c b/block/mq-deadline.c deleted file mode 100644 index 04d9d6b3745b..000000000000 --- a/block/mq-deadline.c +++ /dev/null @@ -1,1095 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, - * for the blk-mq scheduling framework - * - * Copyright (C) 2016 Jens Axboe - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "blk.h" -#include "blk-mq.h" -#include "blk-mq-debugfs.h" -#include "blk-mq-tag.h" -#include "blk-mq-sched.h" - -/* - * See Documentation/block/deadline-iosched.rst - */ -static const int read_expire = HZ / 2; /* max time before a read is submitted. */ -static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ -static const int writes_starved = 2; /* max times reads can starve a write */ -static const int fifo_batch = 16; /* # of sequential requests treated as one - by the above parameters. For throughput. */ - -enum dd_data_dir { - DD_READ = READ, - DD_WRITE = WRITE, -}; - -enum { DD_DIR_COUNT = 2 }; - -enum dd_prio { - DD_RT_PRIO = 0, - DD_BE_PRIO = 1, - DD_IDLE_PRIO = 2, - DD_PRIO_MAX = 2, -}; - -enum { DD_PRIO_COUNT = 3 }; - -/* I/O statistics per I/O priority. */ -struct io_stats_per_prio { - local_t inserted; - local_t merged; - local_t dispatched; - local_t completed; -}; - -/* I/O statistics for all I/O priorities (enum dd_prio). */ -struct io_stats { - struct io_stats_per_prio stats[DD_PRIO_COUNT]; -}; - -/* - * Deadline scheduler data per I/O priority (enum dd_prio). Requests are - * present on both sort_list[] and fifo_list[]. - */ -struct dd_per_prio { - struct list_head dispatch; - struct rb_root sort_list[DD_DIR_COUNT]; - struct list_head fifo_list[DD_DIR_COUNT]; - /* Next request in FIFO order. Read, write or both are NULL. */ - struct request *next_rq[DD_DIR_COUNT]; -}; - -struct deadline_data { - /* - * run time data - */ - - struct dd_per_prio per_prio[DD_PRIO_COUNT]; - - /* Data direction of latest dispatched request. 
*/ - enum dd_data_dir last_dir; - unsigned int batching; /* number of sequential requests made */ - unsigned int starved; /* times reads have starved writes */ - - struct io_stats __percpu *stats; - - /* - * settings that change how the i/o scheduler behaves - */ - int fifo_expire[DD_DIR_COUNT]; - int fifo_batch; - int writes_starved; - int front_merges; - u32 async_depth; - - spinlock_t lock; - spinlock_t zone_lock; -}; - -/* Count one event of type 'event_type' and with I/O priority 'prio' */ -#define dd_count(dd, event_type, prio) do { \ - struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \ - \ - BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ - BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ - local_inc(&io_stats->stats[(prio)].event_type); \ - put_cpu_ptr(io_stats); \ -} while (0) - -/* - * Returns the total number of dd_count(dd, event_type, prio) calls across all - * CPUs. No locking or barriers since it is fine if the returned sum is slightly - * outdated. - */ -#define dd_sum(dd, event_type, prio) ({ \ - unsigned int cpu; \ - u32 sum = 0; \ - \ - BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ - BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ - for_each_present_cpu(cpu) \ - sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \ - stats[(prio)].event_type); \ - sum; \ -}) - -/* Maps an I/O priority class to a deadline scheduler priority. */ -static const enum dd_prio ioprio_class_to_prio[] = { - [IOPRIO_CLASS_NONE] = DD_BE_PRIO, - [IOPRIO_CLASS_RT] = DD_RT_PRIO, - [IOPRIO_CLASS_BE] = DD_BE_PRIO, - [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, -}; - -static inline struct rb_root * -deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) -{ - return &per_prio->sort_list[rq_data_dir(rq)]; -} - -/* - * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a - * request. - */ -static u8 dd_rq_ioclass(struct request *rq) -{ - return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); -} - -/* - * get the request after `rq' in sector-sorted order - */ -static inline struct request * -deadline_latter_request(struct request *rq) -{ - struct rb_node *node = rb_next(&rq->rb_node); - - if (node) - return rb_entry_rq(node); - - return NULL; -} - -static void -deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) -{ - struct rb_root *root = deadline_rb_root(per_prio, rq); - - elv_rb_add(root, rq); -} - -static inline void -deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) -{ - const enum dd_data_dir data_dir = rq_data_dir(rq); - - if (per_prio->next_rq[data_dir] == rq) - per_prio->next_rq[data_dir] = deadline_latter_request(rq); - - elv_rb_del(deadline_rb_root(per_prio, rq), rq); -} - -/* - * remove rq from rbtree and fifo. 
- */ -static void deadline_remove_request(struct request_queue *q, - struct dd_per_prio *per_prio, - struct request *rq) -{ - list_del_init(&rq->queuelist); - - /* - * We might not be on the rbtree, if we are doing an insert merge - */ - if (!RB_EMPTY_NODE(&rq->rb_node)) - deadline_del_rq_rb(per_prio, rq); - - elv_rqhash_del(q, rq); - if (q->last_merge == rq) - q->last_merge = NULL; -} - -static void dd_request_merged(struct request_queue *q, struct request *req, - enum elv_merge type) -{ - struct deadline_data *dd = q->elevator->elevator_data; - const u8 ioprio_class = dd_rq_ioclass(req); - const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; - struct dd_per_prio *per_prio = &dd->per_prio[prio]; - - /* - * if the merge was a front merge, we need to reposition request - */ - if (type == ELEVATOR_FRONT_MERGE) { - elv_rb_del(deadline_rb_root(per_prio, req), req); - deadline_add_rq_rb(per_prio, req); - } -} - -/* - * Callback function that is invoked after @next has been merged into @req. - */ -static void dd_merged_requests(struct request_queue *q, struct request *req, - struct request *next) -{ - struct deadline_data *dd = q->elevator->elevator_data; - const u8 ioprio_class = dd_rq_ioclass(next); - const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; - - dd_count(dd, merged, prio); - - /* - * if next expires before rq, assign its expire time to rq - * and move into next position (next will be deleted) in fifo - */ - if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { - if (time_before((unsigned long)next->fifo_time, - (unsigned long)req->fifo_time)) { - list_move(&req->queuelist, &next->queuelist); - req->fifo_time = next->fifo_time; - } - } - - /* - * kill knowledge of next, this one is a goner - */ - deadline_remove_request(q, &dd->per_prio[prio], next); -} - -/* - * move an entry to dispatch queue - */ -static void -deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, - struct request *rq) -{ - const enum dd_data_dir data_dir = rq_data_dir(rq); - - per_prio->next_rq[data_dir] = deadline_latter_request(rq); - - /* - * take it off the sort and fifo list - */ - deadline_remove_request(rq->q, per_prio, rq); -} - -/* Number of requests queued for a given priority level. */ -static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) -{ - return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio); -} - -/* - * deadline_check_fifo returns 0 if there are no expired requests on the fifo, - * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) - */ -static inline int deadline_check_fifo(struct dd_per_prio *per_prio, - enum dd_data_dir data_dir) -{ - struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); - - /* - * rq is expired! - */ - if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) - return 1; - - return 0; -} - -/* - * For the specified data direction, return the next request to - * dispatch using arrival ordered lists. - */ -static struct request * -deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, - enum dd_data_dir data_dir) -{ - struct request *rq; - unsigned long flags; - - if (list_empty(&per_prio->fifo_list[data_dir])) - return NULL; - - rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. 
- */ - spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) { - if (blk_req_can_dispatch_to_zone(rq)) - goto out; - } - rq = NULL; -out: - spin_unlock_irqrestore(&dd->zone_lock, flags); - - return rq; -} - -/* - * For the specified data direction, return the next request to - * dispatch using sector position sorted lists. - */ -static struct request * -deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, - enum dd_data_dir data_dir) -{ - struct request *rq; - unsigned long flags; - - rq = per_prio->next_rq[data_dir]; - if (!rq) - return NULL; - - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. - */ - spin_lock_irqsave(&dd->zone_lock, flags); - while (rq) { - if (blk_req_can_dispatch_to_zone(rq)) - break; - rq = deadline_latter_request(rq); - } - spin_unlock_irqrestore(&dd->zone_lock, flags); - - return rq; -} - -/* - * deadline_dispatch_requests selects the best request according to - * read/write expire, fifo_batch, etc - */ -static struct request *__dd_dispatch_request(struct deadline_data *dd, - struct dd_per_prio *per_prio) -{ - struct request *rq, *next_rq; - enum dd_data_dir data_dir; - enum dd_prio prio; - u8 ioprio_class; - - lockdep_assert_held(&dd->lock); - - if (!list_empty(&per_prio->dispatch)) { - rq = list_first_entry(&per_prio->dispatch, struct request, - queuelist); - list_del_init(&rq->queuelist); - goto done; - } - - /* - * batches are currently reads XOR writes - */ - rq = deadline_next_request(dd, per_prio, dd->last_dir); - if (rq && dd->batching < dd->fifo_batch) - /* we have a next request are still entitled to batch */ - goto dispatch_request; - - /* - * at this point we are not running a batch. select the appropriate - * data direction (read / write) - */ - - if (!list_empty(&per_prio->fifo_list[DD_READ])) { - BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ])); - - if (deadline_fifo_request(dd, per_prio, DD_WRITE) && - (dd->starved++ >= dd->writes_starved)) - goto dispatch_writes; - - data_dir = DD_READ; - - goto dispatch_find_request; - } - - /* - * there are either no reads or writes have been starved - */ - - if (!list_empty(&per_prio->fifo_list[DD_WRITE])) { -dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE])); - - dd->starved = 0; - - data_dir = DD_WRITE; - - goto dispatch_find_request; - } - - return NULL; - -dispatch_find_request: - /* - * we are not running a batch, find best request for selected data_dir - */ - next_rq = deadline_next_request(dd, per_prio, data_dir); - if (deadline_check_fifo(per_prio, data_dir) || !next_rq) { - /* - * A deadline has expired, the last request was in the other - * direction, or we have run out of higher-sectored requests. - * Start again from the request with the earliest expiry time. - */ - rq = deadline_fifo_request(dd, per_prio, data_dir); - } else { - /* - * The last req was the same dir and we have a next request in - * sort order. No expired requests so continue on from here. - */ - rq = next_rq; - } - - /* - * For a zoned block device, if we only have writes queued and none of - * them can be dispatched, rq will be NULL. - */ - if (!rq) - return NULL; - - dd->last_dir = data_dir; - dd->batching = 0; - -dispatch_request: - /* - * rq is the selected appropriate request. 
- */ - dd->batching++; - deadline_move_request(dd, per_prio, rq); -done: - ioprio_class = dd_rq_ioclass(rq); - prio = ioprio_class_to_prio[ioprio_class]; - dd_count(dd, dispatched, prio); - /* - * If the request needs its target zone locked, do it. - */ - blk_req_zone_write_lock(rq); - rq->rq_flags |= RQF_STARTED; - return rq; -} - -/* - * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). - * - * One confusing aspect here is that we get called for a specific - * hardware queue, but we may return a request that is for a - * different hardware queue. This is because mq-deadline has shared - * state for all hardware queues, in terms of sorting, FIFOs, etc. - */ -static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) -{ - struct deadline_data *dd = hctx->queue->elevator->elevator_data; - struct request *rq; - enum dd_prio prio; - - spin_lock(&dd->lock); - for (prio = 0; prio <= DD_PRIO_MAX; prio++) { - rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); - if (rq) - break; - } - spin_unlock(&dd->lock); - - return rq; -} - -/* - * Called by __blk_mq_alloc_request(). The shallow_depth value set by this - * function is used by __blk_mq_get_tag(). - */ -static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) -{ - struct deadline_data *dd = data->q->elevator->elevator_data; - - /* Do not throttle synchronous reads. */ - if (op_is_sync(op) && !op_is_write(op)) - return; - - /* - * Throttle asynchronous requests and writes such that these requests - * do not block the allocation of synchronous requests. - */ - data->shallow_depth = dd->async_depth; -} - -/* Called by blk_mq_update_nr_requests(). */ -static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) -{ - struct request_queue *q = hctx->queue; - struct deadline_data *dd = q->elevator->elevator_data; - struct blk_mq_tags *tags = hctx->sched_tags; - - dd->async_depth = max(1UL, 3 * q->nr_requests / 4); - - sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth); -} - -/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ -static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) -{ - dd_depth_updated(hctx); - return 0; -} - -static void dd_exit_sched(struct elevator_queue *e) -{ - struct deadline_data *dd = e->elevator_data; - enum dd_prio prio; - - for (prio = 0; prio <= DD_PRIO_MAX; prio++) { - struct dd_per_prio *per_prio = &dd->per_prio[prio]; - - WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); - WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); - } - - free_percpu(dd->stats); - - kfree(dd); -} - -/* - * initialize elevator private data (deadline_data). 
- */ -static int dd_init_sched(struct request_queue *q, struct elevator_type *e) -{ - struct deadline_data *dd; - struct elevator_queue *eq; - enum dd_prio prio; - int ret = -ENOMEM; - - eq = elevator_alloc(q, e); - if (!eq) - return ret; - - dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); - if (!dd) - goto put_eq; - - eq->elevator_data = dd; - - dd->stats = alloc_percpu_gfp(typeof(*dd->stats), - GFP_KERNEL | __GFP_ZERO); - if (!dd->stats) - goto free_dd; - - for (prio = 0; prio <= DD_PRIO_MAX; prio++) { - struct dd_per_prio *per_prio = &dd->per_prio[prio]; - - INIT_LIST_HEAD(&per_prio->dispatch); - INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); - INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); - per_prio->sort_list[DD_READ] = RB_ROOT; - per_prio->sort_list[DD_WRITE] = RB_ROOT; - } - dd->fifo_expire[DD_READ] = read_expire; - dd->fifo_expire[DD_WRITE] = write_expire; - dd->writes_starved = writes_starved; - dd->front_merges = 1; - dd->last_dir = DD_WRITE; - dd->fifo_batch = fifo_batch; - spin_lock_init(&dd->lock); - spin_lock_init(&dd->zone_lock); - - q->elevator = eq; - return 0; - -free_dd: - kfree(dd); - -put_eq: - kobject_put(&eq->kobj); - return ret; -} - -/* - * Try to merge @bio into an existing request. If @bio has been merged into - * an existing request, store the pointer to that request into *@rq. - */ -static int dd_request_merge(struct request_queue *q, struct request **rq, - struct bio *bio) -{ - struct deadline_data *dd = q->elevator->elevator_data; - const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); - const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; - struct dd_per_prio *per_prio = &dd->per_prio[prio]; - sector_t sector = bio_end_sector(bio); - struct request *__rq; - - if (!dd->front_merges) - return ELEVATOR_NO_MERGE; - - __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector); - if (__rq) { - BUG_ON(sector != blk_rq_pos(__rq)); - - if (elv_bio_merge_ok(__rq, bio)) { - *rq = __rq; - return ELEVATOR_FRONT_MERGE; - } - } - - return ELEVATOR_NO_MERGE; -} - -/* - * Attempt to merge a bio into an existing request. This function is called - * before @bio is associated with a request. - */ -static bool dd_bio_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs) -{ - struct deadline_data *dd = q->elevator->elevator_data; - struct request *free = NULL; - bool ret; - - spin_lock(&dd->lock); - ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); - spin_unlock(&dd->lock); - - if (free) - blk_mq_free_request(free); - - return ret; -} - -/* - * add rq to rbtree and fifo - */ -static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head) -{ - struct request_queue *q = hctx->queue; - struct deadline_data *dd = q->elevator->elevator_data; - const enum dd_data_dir data_dir = rq_data_dir(rq); - u16 ioprio = req_get_ioprio(rq); - u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); - struct dd_per_prio *per_prio; - enum dd_prio prio; - - lockdep_assert_held(&dd->lock); - - /* - * This may be a requeue of a write request that has locked its - * target zone. If it is the case, this releases the zone lock. 
- */ - blk_req_zone_write_unlock(rq); - - prio = ioprio_class_to_prio[ioprio_class]; - dd_count(dd, inserted, prio); - - if (blk_mq_sched_try_insert_merge(q, rq)) - return; - - trace_block_rq_insert(rq); - - per_prio = &dd->per_prio[prio]; - if (at_head) { - list_add(&rq->queuelist, &per_prio->dispatch); - } else { - deadline_add_rq_rb(per_prio, rq); - - if (rq_mergeable(rq)) { - elv_rqhash_add(q, rq); - if (!q->last_merge) - q->last_merge = rq; - } - - /* - * set expire time and add to fifo list - */ - rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; - list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]); - } -} - -/* - * Called from blk_mq_sched_insert_request() or blk_mq_sched_insert_requests(). - */ -static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, - struct list_head *list, bool at_head) -{ - struct request_queue *q = hctx->queue; - struct deadline_data *dd = q->elevator->elevator_data; - - spin_lock(&dd->lock); - while (!list_empty(list)) { - struct request *rq; - - rq = list_first_entry(list, struct request, queuelist); - list_del_init(&rq->queuelist); - dd_insert_request(hctx, rq, at_head); - } - spin_unlock(&dd->lock); -} - -/* - * Nothing to do here. This is defined only to ensure that .finish_request - * method is called upon request completion. - */ -static void dd_prepare_request(struct request *rq) -{ -} - -/* - * Callback from inside blk_mq_free_request(). - * - * For zoned block devices, write unlock the target zone of - * completed write requests. Do this while holding the zone lock - * spinlock so that the zone is never unlocked while deadline_fifo_request() - * or deadline_next_request() are executing. This function is called for - * all requests, whether or not these requests complete successfully. - * - * For a zoned block device, __dd_dispatch_request() may have stopped - * dispatching requests if all the queued requests are write requests directed - * at zones that are already locked due to on-going write requests. To ensure - * write request dispatch progress in this case, mark the queue as needing a - * restart to ensure that the queue is run again after completion of the - * request and zones being unlocked. 
- */ -static void dd_finish_request(struct request *rq) -{ - struct request_queue *q = rq->q; - struct deadline_data *dd = q->elevator->elevator_data; - const u8 ioprio_class = dd_rq_ioclass(rq); - const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; - struct dd_per_prio *per_prio = &dd->per_prio[prio]; - - dd_count(dd, completed, prio); - - if (blk_queue_is_zoned(q)) { - unsigned long flags; - - spin_lock_irqsave(&dd->zone_lock, flags); - blk_req_zone_write_unlock(rq); - if (!list_empty(&per_prio->fifo_list[DD_WRITE])) - blk_mq_sched_mark_restart_hctx(rq->mq_hctx); - spin_unlock_irqrestore(&dd->zone_lock, flags); - } -} - -static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) -{ - return !list_empty_careful(&per_prio->dispatch) || - !list_empty_careful(&per_prio->fifo_list[DD_READ]) || - !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); -} - -static bool dd_has_work(struct blk_mq_hw_ctx *hctx) -{ - struct deadline_data *dd = hctx->queue->elevator->elevator_data; - enum dd_prio prio; - - for (prio = 0; prio <= DD_PRIO_MAX; prio++) - if (dd_has_work_for_prio(&dd->per_prio[prio])) - return true; - - return false; -} - -/* - * sysfs parts below - */ -#define SHOW_INT(__FUNC, __VAR) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct deadline_data *dd = e->elevator_data; \ - \ - return sysfs_emit(page, "%d\n", __VAR); \ -} -#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) -SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); -SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); -SHOW_INT(deadline_writes_starved_show, dd->writes_starved); -SHOW_INT(deadline_front_merges_show, dd->front_merges); -SHOW_INT(deadline_async_depth_show, dd->front_merges); -SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); -#undef SHOW_INT -#undef SHOW_JIFFIES - -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ -{ \ - struct deadline_data *dd = e->elevator_data; \ - int __data, __ret; \ - \ - __ret = kstrtoint(page, 0, &__data); \ - if (__ret < 0) \ - return __ret; \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ - *(__PTR) = __CONV(__data); \ - return count; \ -} -#define STORE_INT(__FUNC, __PTR, MIN, MAX) \ - STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, ) -#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \ - STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) -STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); -STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); -STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); -STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); -STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX); -STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); -#undef STORE_FUNCTION -#undef STORE_INT -#undef STORE_JIFFIES - -#define DD_ATTR(name) \ - __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) - -static struct elv_fs_entry deadline_attrs[] = { - DD_ATTR(read_expire), - DD_ATTR(write_expire), - DD_ATTR(writes_starved), - DD_ATTR(front_merges), - DD_ATTR(async_depth), - DD_ATTR(fifo_batch), - __ATTR_NULL -}; - -#ifdef CONFIG_BLK_DEBUG_FS -#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \ -static void *deadline_##name##_fifo_start(struct seq_file *m, \ - loff_t *pos) \ - __acquires(&dd->lock) \ -{ \ 
- struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - \ - spin_lock(&dd->lock); \ - return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \ -} \ - \ -static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ - loff_t *pos) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - \ - return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \ -} \ - \ -static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ - __releases(&dd->lock) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - \ - spin_unlock(&dd->lock); \ -} \ - \ -static const struct seq_operations deadline_##name##_fifo_seq_ops = { \ - .start = deadline_##name##_fifo_start, \ - .next = deadline_##name##_fifo_next, \ - .stop = deadline_##name##_fifo_stop, \ - .show = blk_mq_debugfs_rq_show, \ -}; \ - \ -static int deadline_##name##_next_rq_show(void *data, \ - struct seq_file *m) \ -{ \ - struct request_queue *q = data; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - struct request *rq = per_prio->next_rq[data_dir]; \ - \ - if (rq) \ - __blk_mq_debugfs_rq_show(m, rq); \ - return 0; \ -} - -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0); -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0); -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1); -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1); -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2); -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2); -#undef DEADLINE_DEBUGFS_DDIR_ATTRS - -static int deadline_batching_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - struct deadline_data *dd = q->elevator->elevator_data; - - seq_printf(m, "%u\n", dd->batching); - return 0; -} - -static int deadline_starved_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - struct deadline_data *dd = q->elevator->elevator_data; - - seq_printf(m, "%u\n", dd->starved); - return 0; -} - -static int dd_async_depth_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - struct deadline_data *dd = q->elevator->elevator_data; - - seq_printf(m, "%u\n", dd->async_depth); - return 0; -} - -static int dd_queued_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - struct deadline_data *dd = q->elevator->elevator_data; - - seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO), - dd_queued(dd, DD_BE_PRIO), - dd_queued(dd, DD_IDLE_PRIO)); - return 0; -} - -/* Number of requests owned by the block driver for a given priority. 
*/ -static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) -{ - return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio) - - dd_sum(dd, completed, prio); -} - -static int dd_owned_by_driver_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - struct deadline_data *dd = q->elevator->elevator_data; - - seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO), - dd_owned_by_driver(dd, DD_BE_PRIO), - dd_owned_by_driver(dd, DD_IDLE_PRIO)); - return 0; -} - -#define DEADLINE_DISPATCH_ATTR(prio) \ -static void *deadline_dispatch##prio##_start(struct seq_file *m, \ - loff_t *pos) \ - __acquires(&dd->lock) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - \ - spin_lock(&dd->lock); \ - return seq_list_start(&per_prio->dispatch, *pos); \ -} \ - \ -static void *deadline_dispatch##prio##_next(struct seq_file *m, \ - void *v, loff_t *pos) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - \ - return seq_list_next(v, &per_prio->dispatch, pos); \ -} \ - \ -static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \ - __releases(&dd->lock) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - \ - spin_unlock(&dd->lock); \ -} \ - \ -static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \ - .start = deadline_dispatch##prio##_start, \ - .next = deadline_dispatch##prio##_next, \ - .stop = deadline_dispatch##prio##_stop, \ - .show = blk_mq_debugfs_rq_show, \ -} - -DEADLINE_DISPATCH_ATTR(0); -DEADLINE_DISPATCH_ATTR(1); -DEADLINE_DISPATCH_ATTR(2); -#undef DEADLINE_DISPATCH_ATTR - -#define DEADLINE_QUEUE_DDIR_ATTRS(name) \ - {#name "_fifo_list", 0400, \ - .seq_ops = &deadline_##name##_fifo_seq_ops} -#define DEADLINE_NEXT_RQ_ATTR(name) \ - {#name "_next_rq", 0400, deadline_##name##_next_rq_show} -static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { - DEADLINE_QUEUE_DDIR_ATTRS(read0), - DEADLINE_QUEUE_DDIR_ATTRS(write0), - DEADLINE_QUEUE_DDIR_ATTRS(read1), - DEADLINE_QUEUE_DDIR_ATTRS(write1), - DEADLINE_QUEUE_DDIR_ATTRS(read2), - DEADLINE_QUEUE_DDIR_ATTRS(write2), - DEADLINE_NEXT_RQ_ATTR(read0), - DEADLINE_NEXT_RQ_ATTR(write0), - DEADLINE_NEXT_RQ_ATTR(read1), - DEADLINE_NEXT_RQ_ATTR(write1), - DEADLINE_NEXT_RQ_ATTR(read2), - DEADLINE_NEXT_RQ_ATTR(write2), - {"batching", 0400, deadline_batching_show}, - {"starved", 0400, deadline_starved_show}, - {"async_depth", 0400, dd_async_depth_show}, - {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, - {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, - {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, - {"owned_by_driver", 0400, dd_owned_by_driver_show}, - {"queued", 0400, dd_queued_show}, - {}, -}; -#undef DEADLINE_QUEUE_DDIR_ATTRS -#endif - -static struct elevator_type mq_deadline = { - .ops = { - .depth_updated = dd_depth_updated, - .limit_depth = dd_limit_depth, - .insert_requests = dd_insert_requests, - .dispatch_request = dd_dispatch_request, - .prepare_request = dd_prepare_request, - .finish_request = dd_finish_request, - .next_request = elv_rb_latter_request, - .former_request = elv_rb_former_request, - .bio_merge = dd_bio_merge, - .request_merge = dd_request_merge, - .requests_merged = dd_merged_requests, - .request_merged = dd_request_merged, - 
.has_work = dd_has_work, - .init_sched = dd_init_sched, - .exit_sched = dd_exit_sched, - .init_hctx = dd_init_hctx, - }, - -#ifdef CONFIG_BLK_DEBUG_FS - .queue_debugfs_attrs = deadline_queue_debugfs_attrs, -#endif - .elevator_attrs = deadline_attrs, - .elevator_name = "mq-deadline", - .elevator_alias = "deadline", - .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, - .elevator_owner = THIS_MODULE, -}; -MODULE_ALIAS("mq-deadline-iosched"); - -static int __init deadline_init(void) -{ - return elv_register(&mq_deadline); -} - -static void __exit deadline_exit(void) -{ - elv_unregister(&mq_deadline); -} - -module_init(deadline_init); -module_exit(deadline_exit); - -MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("MQ deadline IO scheduler"); -- cgit v1.2.3 From fb926032b3209300f9dc454a36b8299582ae545c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:56 -0700 Subject: block/mq-deadline: Prioritize high-priority requests While one or more requests with a certain I/O priority are pending, do not dispatch lower priority requests. Dispatch lower priority requests anyway after the "aging" time has expired. This patch has been tested as follows: modprobe scsi_debug ndelay=1000000 max_queue=16 && sd='' && while [ -z "$sd" ]; do sd=/dev/$(basename /sys/bus/pseudo/drivers/scsi_debug/adapter*/host*/target*/*/block/*) done && echo $((100*1000)) > /sys/block/$sd/queue/iosched/aging_expire && cd /sys/fs/cgroup/blkio/ && echo $$ >cgroup.procs && echo restrict-to-be >blkio.prio.class && mkdir -p hipri && cd hipri && echo none-to-rt >blkio.prio.class && { max-iops -a1 -d32 -j1 -e mq-deadline $sd >& ~/low-pri.txt & } && echo $$ >cgroup.procs && max-iops -a1 -d32 -j1 -e mq-deadline $sd >& ~/hi-pri.txt Result: * 11000 IOPS for the high-priority job * 40 IOPS for the low-priority job If the aging expiry time is changed from 100s into 0, the IOPS results change into 6712 and 6796 IOPS. The max-iops script is a script that runs fio with the following arguments: --bs=4K --gtod_reduce=1 --ioengine=libaio --ioscheduler=${arg_e} --runtime=60 --norandommap --rw=read --thread --buffered=0 --numjobs=${arg_j} --iodepth=${arg_d} --iodepth_batch_submit=${arg_a} --iodepth_batch_complete=$((arg_d / 2)) --name=${positional_argument_1} --filename=${positional_argument_1} Reviewed-by: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-17-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline-main.c | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/mq-deadline-main.c b/block/mq-deadline-main.c index 58a401ea8f56..4815e536091f 100644 --- a/block/mq-deadline-main.c +++ b/block/mq-deadline-main.c @@ -32,6 +32,11 @@ */ static const int read_expire = HZ / 2; /* max time before a read is submitted. */ static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ +/* + * Time after which to dispatch lower priority requests even if higher + * priority requests are pending. + */ +static const int aging_expire = 10 * HZ; static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. 
*/ @@ -94,6 +99,7 @@ struct deadline_data { int writes_starved; int front_merges; u32 async_depth; + int aging_expire; spinlock_t lock; spinlock_t zone_lock; @@ -361,10 +367,11 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, /* * deadline_dispatch_requests selects the best request according to - * read/write expire, fifo_batch, etc + * read/write expire, fifo_batch, etc and with a start time <= @latest. */ static struct request *__dd_dispatch_request(struct deadline_data *dd, - struct dd_per_prio *per_prio) + struct dd_per_prio *per_prio, + u64 latest_start_ns) { struct request *rq, *next_rq; enum dd_data_dir data_dir; @@ -377,6 +384,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, if (!list_empty(&per_prio->dispatch)) { rq = list_first_entry(&per_prio->dispatch, struct request, queuelist); + if (rq->start_time_ns > latest_start_ns) + return NULL; list_del_init(&rq->queuelist); goto done; } @@ -454,6 +463,8 @@ dispatch_find_request: dd->batching = 0; dispatch_request: + if (rq->start_time_ns > latest_start_ns) + return NULL; /* * rq is the selected appropriate request. */ @@ -484,15 +495,32 @@ done: static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; - struct request *rq; + const u64 now_ns = ktime_get_ns(); + struct request *rq = NULL; enum dd_prio prio; spin_lock(&dd->lock); - for (prio = 0; prio <= DD_PRIO_MAX; prio++) { - rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); + /* + * Start with dispatching requests whose deadline expired more than + * aging_expire jiffies ago. + */ + for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) { + rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now_ns - + jiffies_to_nsecs(dd->aging_expire)); if (rq) + goto unlock; + } + /* + * Next, dispatch requests in priority order. Ignore lower priority + * requests if any higher priority requests are pending. 
+ */ + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now_ns); + if (rq || dd_queued(dd, prio)) break; } + +unlock: spin_unlock(&dd->lock); return rq; @@ -603,6 +631,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->front_merges = 1; dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; + dd->aging_expire = aging_expire; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); @@ -835,6 +864,7 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); +SHOW_JIFFIES(deadline_aging_expire_show, dd->aging_expire); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); SHOW_INT(deadline_async_depth_show, dd->front_merges); @@ -864,6 +894,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); +STORE_JIFFIES(deadline_aging_expire_store, &dd->aging_expire, 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX); @@ -882,6 +913,7 @@ static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(front_merges), DD_ATTR(async_depth), DD_ATTR(fifo_batch), + DD_ATTR(aging_expire), __ATTR_NULL }; -- cgit v1.2.3 From 1d0903d61e9645c6330b94247b96dd873dfc11c8 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 19 Jun 2021 17:36:59 +0800 Subject: blk-wbt: introduce a new disable state to prevent false positive by rwb_enabled() Currently we disable wbt by simply zeroing out rwb->wb_normal in wbt_disable_default() when switching the elevator to bfq, but this is not safe because rwb_enabled() can become a false positive again if the queue depth is changed. If that false positive occurs between wbt_wait() and wbt_track() while a write request is submitted, rqw->inflight is dropped to -1 in wbt_done(), which ends up triggering an IO hang. Fix this issue by introducing a new state that indicates that wbt has been disabled.
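For reference, the gist of the change is that rwb_enabled() now also consults the new state, roughly as sketched below; the exact change is in the diff that follows:

	static inline bool rwb_enabled(struct rq_wb *rwb)
	{
		/* stay disabled even if wb_normal is recomputed on a queue depth change */
		return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT &&
		       rwb->wb_normal != 0;
	}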
Fixes: a79050434b45 ("blk-rq-qos: refactor out common elements of blk-wbt") Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20210619093700.920393-2-yi.zhang@huawei.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 5 +++-- block/blk-wbt.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index b363b0532704..0ce0883df3d6 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -77,7 +77,8 @@ enum { static inline bool rwb_enabled(struct rq_wb *rwb) { - return rwb && rwb->wb_normal != 0; + return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT && + rwb->wb_normal != 0; } static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) @@ -701,7 +702,7 @@ void wbt_disable_default(struct request_queue *q) rwb = RQWB(rqos); if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { blk_stat_deactivate(rwb->cb); - rwb->wb_normal = 0; + rwb->enable_state = WBT_STATE_OFF_DEFAULT; } } EXPORT_SYMBOL_GPL(wbt_disable_default); diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 16bdc85b8df9..2eb01becde8c 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -34,6 +34,7 @@ enum { enum { WBT_STATE_ON_DEFAULT = 1, WBT_STATE_ON_MANUAL = 2, + WBT_STATE_OFF_DEFAULT }; struct rq_wb { -- cgit v1.2.3 From 76a8040817b4b9c69b53f9b326987fa891b4082a Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 19 Jun 2021 17:37:00 +0800 Subject: blk-wbt: make sure throttle is enabled properly After commit a79050434b45 ("blk-rq-qos: refactor out common elements of blk-wbt"), if throttling was disabled by wbt_disable_default(), it could not be enabled again. Fix this by setting enable_state back to WBT_STATE_ON_DEFAULT. Fixes: a79050434b45 ("blk-rq-qos: refactor out common elements of blk-wbt") Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20210619093700.920393-3-yi.zhang@huawei.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 0ce0883df3d6..3ed71b8da887 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -636,9 +636,13 @@ void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) void wbt_enable_default(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); + /* Throttling already enabled? */ - if (rqos) + if (rqos) { + if (RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) + RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; return; + } /* Queue not registered? Maybe shutting down... */ if (!blk_queue_registered(q)) -- cgit v1.2.3 From 511a2699237611b062df7798476bf3a1392910b9 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 19 Jun 2021 16:09:42 +0200 Subject: block, bfq: let also stably merged queues enjoy weight raising Merged bfq_queues are kept out of weight-raising (low-latency) mechanisms. The reason is that these queues are usually created for non-interactive and non-soft-real-time tasks. Yet this is not the case for stably-merged queues. These queues are merged just because they are created shortly after each other. So they may easily serve the I/O of an interactive or soft-real time application, if the application happens to spawn multiple processes. To address this issue, this commit lets stably-merged queues also enjoy weight raising.
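In outline, the weight-raising eligibility condition becomes the following (the exact change is in the diff below; RQ_BIC(rq)->stably_merged marks a queue that went through a stable merge):

	wr_or_deserves_wr = bfqd->low_latency &&
		(bfqq->wr_coeff > 1 ||
		 (bfq_bfqq_sync(bfqq) &&
		  (bfqq->bic || RQ_BIC(rq)->stably_merged) &&
		  (*interactive || soft_rt)));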
Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20210619140948.98712-2-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index acd1f881273e..da2363f12e53 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1729,10 +1729,23 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfqq->entity.new_weight == 40; *interactive = !in_burst && idle_for_long_time && bfqq->entity.new_weight == 40; + /* + * Merged bfq_queues are kept out of weight-raising + * (low-latency) mechanisms. The reason is that these queues + * are usually created for non-interactive and + * non-soft-real-time tasks. Yet this is not the case for + * stably-merged queues. These queues are merged just because + * they are created shortly after each other. So they may + * easily serve the I/O of an interactive or soft-real time + * application, if the application happens to spawn multiple + * processes. So let also stably-merged queued enjoy weight + * raising. + */ wr_or_deserves_wr = bfqd->low_latency && (bfqq->wr_coeff > 1 || (bfq_bfqq_sync(bfqq) && - bfqq->bic && (*interactive || soft_rt))); + (bfqq->bic || RQ_BIC(rq)->stably_merged) && + (*interactive || soft_rt))); /* * Using the last flag, update budget and check whether bfqq -- cgit v1.2.3 From e03f2ab78a4a673e4af23c3b855591c48b9de4d7 Mon Sep 17 00:00:00 2001 From: Luca Mariotti Date: Sat, 19 Jun 2021 16:09:43 +0200 Subject: block, bfq: fix delayed stable merge check When attempting to schedule a merge of a given bfq_queue with the currently in-service bfq_queue or with a cooperating bfq_queue among the scheduled bfq_queues, delayed stable merge is checked for rotational or non-queueing devs. For this stable merge to be performed, some conditions must be met. If the current bfq_queue underwent some split from some merged bfq_queue, one of these conditions is that two hundred milliseconds must elapse from split, otherwise this condition is always met. Unfortunately, by mistake, time_is_after_jiffies() was written instead of time_is_before_jiffies() for this check, verifying that less than two hundred milliseconds have elapsed instead of verifying that at least two hundred milliseconds have elapsed. Fix this issue by replacing time_is_after_jiffies() with time_is_before_jiffies(). 
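As a reminder, the two helpers have opposite meanings (paraphrasing their definitions in include/linux/jiffies.h), which is what made the original check inverted:

	/* true if a is in the past */
	#define time_is_before_jiffies(a) time_after(jiffies, a)
	/* true if a is in the future */
	#define time_is_after_jiffies(a) time_before(jiffies, a)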
Signed-off-by: Luca Mariotti Signed-off-by: Paolo Valente Signed-off-by: Pietro Pedroni Link: https://lore.kernel.org/r/20210619140948.98712-3-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index da2363f12e53..c5c0e74977d4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2710,7 +2710,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (unlikely(!bfqd->nonrot_with_queueing)) { if (bic->stable_merge_bfqq && !bfq_bfqq_just_created(bfqq) && - time_is_after_jiffies(bfqq->split_time + + time_is_before_jiffies(bfqq->split_time + msecs_to_jiffies(200))) { struct bfq_queue *stable_merge_bfqq = bic->stable_merge_bfqq; -- cgit v1.2.3 From d4f49983fa3944416c28379c35fbe10c68455ea4 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 19 Jun 2021 16:09:44 +0200 Subject: block, bfq: consider also creation time in delayed stable merge Since commit 430a67f9d616 ("block, bfq: merge bursts of newly-created queues"), BFQ may schedule a merge between a newly created sync bfq_queue and the last sync bfq_queue created. Such a merging is not performed immediately, because BFQ needs first to find out whether the newly created queue actually reaches a higher throughput if not merged at all (and in that case BFQ will not perform any stable merging). To check that, a little time must be waited after the creation of the new queue, so that some I/O can flow in the queue, and statistics on such I/O can be computed. Yet, to evaluate the above waiting time, the last split time is considered as start time, instead of the creation time of the queue. This is a mistake, because considering the split time is correct only in the following scenario. The queue undergoes a non-stable merge on the arrival of its very first I/O request, due to close I/O with some other queue. While the queue is merged for close I/O, stable merging is not considered. Yet the queue may then happen to be split, if the close I/O finishes (or happens to be a false positive). From this time on, the queue can again be considered for stable merging. But, again, a little time must elapse, to let some new I/O flow in the queue and to get updated statistics. To wait for this time, the split time is to be taken into account. Yet, if the queue does not undergo a non-stable merge on the arrival of its very first request, then BFQ immediately checks whether the stable merge is to be performed. This happens because the split time for a queue is initialized to minus infinity when the queue is created. This commit fixes this mistake by adding the missing condition. Now the check for a delayed stable merge is performed only after a little time has elapsed not only from the last queue split time, but also from the creation time of the queue.
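In outline, the check becomes the following (the exact change is in the diff below):

	if (bic->stable_merge_bfqq && !bfq_bfqq_just_created(bfqq) &&
	    time_is_before_jiffies(bfqq->split_time + msecs_to_jiffies(200)) &&
	    time_is_before_jiffies(bfqq->creation_time + msecs_to_jiffies(200)))
		/* evaluate the delayed stable merge */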
Fixes: 430a67f9d616 ("block, bfq: merge bursts of newly-created queues") Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20210619140948.98712-4-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c5c0e74977d4..2a5c1a660f3b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2711,7 +2711,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bic->stable_merge_bfqq && !bfq_bfqq_just_created(bfqq) && time_is_before_jiffies(bfqq->split_time + - msecs_to_jiffies(200))) { + msecs_to_jiffies(200)) && + time_is_before_jiffies(bfqq->creation_time + + msecs_to_jiffies(200))) { struct bfq_queue *stable_merge_bfqq = bic->stable_merge_bfqq; int proc_ref = min(bfqq_process_refs(bfqq), -- cgit v1.2.3 From 7812472f973047a886e4ed9a91d98d6627dd746f Mon Sep 17 00:00:00 2001 From: Pietro Pedroni Date: Sat, 19 Jun 2021 16:09:45 +0200 Subject: block, bfq: boost throughput by extending queue-merging times One of the methods with which bfq boosts throughput is by merging queues. One of the merging variants in bfq is the stable merge. This mechanism is activated between two queues only if they are created within a certain maximum time T1 of each other. Merging can happen soon or be delayed. In the second case, before merging, bfq needs to evaluate a throughput-boost parameter that indicates whether the queue generates a high throughput when served alone. Merging occurs when this throughput boost is not high enough. In particular, this parameter is evaluated, and late merging may occur, only after at least a time T2 has passed since the creation of the queue. Currently T1 and T2 are set to 180ms and 200ms, respectively. With these values the merging mechanism rarely triggers, because the time windows are too short. This results in a noticeable lowering of the overall throughput with some workloads (see the example below). This commit introduces two constants, bfq_activation_stable_merging and bfq_late_stable_merging, in order to increase the duration of T1 and T2. Both the stable merging activation time and the late merging time are set to 600ms. This value has been evaluated experimentally using the sqlite benchmark in the Phoronix Test Suite on an HDD. The duration of the benchmark before this fix was 111.02s, while now it has reached 97.02s, a better result than that of all the other schedulers.
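For reference, the patch expresses T1 and T2 as two constants (see the diff below), both in milliseconds:

	/* T1: maximum distance between queue creations for stable merge */
	static const unsigned long bfq_activation_stable_merging = 600;
	/* T2: minimum wait before evaluating a delayed stable merge */
	static const unsigned long bfq_late_stable_merging = 600;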
Signed-off-by: Pietro Pedroni Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20210619140948.98712-5-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2a5c1a660f3b..98a42ddb1760 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -364,6 +364,16 @@ static int ref_wr_duration[2]; */ static const unsigned long max_service_from_wr = 120000; +/* + * Maximum time between the creation of two queues, for stable merge + * to be activated (in ms) + */ +static const unsigned long bfq_activation_stable_merging = 600; +/* + * Minimum time to be waited before evaluating delayed stable merge (in ms) + */ +static const unsigned long bfq_late_stable_merging = 600; + #define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) @@ -2711,9 +2721,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bic->stable_merge_bfqq && !bfq_bfqq_just_created(bfqq) && time_is_before_jiffies(bfqq->split_time + - msecs_to_jiffies(200)) && + msecs_to_jiffies(bfq_late_stable_merging)) && time_is_before_jiffies(bfqq->creation_time + - msecs_to_jiffies(200))) { + msecs_to_jiffies(bfq_late_stable_merging))) { struct bfq_queue *stable_merge_bfqq = bic->stable_merge_bfqq; int proc_ref = min(bfqq_process_refs(bfqq), @@ -5494,7 +5504,7 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, */ if (!last_bfqq_created || time_before(last_bfqq_created->creation_time + - bfqd->bfq_burst_interval, + msecs_to_jiffies(bfq_activation_stable_merging), bfqq->creation_time) || bfqq->entity.parent != last_bfqq_created->entity.parent || bfqq->ioprio != last_bfqq_created->ioprio || -- cgit v1.2.3 From bd3664b362381c4c1473753ebedf0ab242a60d1d Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 19 Jun 2021 16:09:46 +0200 Subject: block, bfq: avoid delayed merge of async queues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 430a67f9d616 ("block, bfq: merge bursts of newly-created queues"), BFQ may schedule a merge between a newly created sync bfq_queue, say Q2, and the last sync bfq_queue created, say Q1. To this end, BFQ stores the address of Q1 in the field bic->stable_merge_bfqq of the bic associated with Q2. So, when the time for the possible merge arrives, BFQ knows which bfq_queue to merge Q2 with. In particular, BFQ checks for possible merges on request arrivals. Yet the same bic may also be associated with an async bfq_queue, say Q3. So, if a request for Q3 arrives, then the above check may happen to be executed while the bfq_queue at hand is Q3, instead of Q2. In this case, Q1 happens to be merged with an async bfq_queue. This is not only a conceptual mistake, because async queues are to be kept out of queue merging, but also a bug that leads to inconsistent states. This commit simply filters async queues out of delayed merges.
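Concretely, the delayed-merge path is now entered only for sync queues, roughly as below (the exact change is in the diff that follows):

	if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq &&
	    !bfq_bfqq_just_created(bfqq) &&
	    time_is_before_jiffies(bfqq->split_time +
				   msecs_to_jiffies(bfq_late_stable_merging)) &&
	    time_is_before_jiffies(bfqq->creation_time +
				   msecs_to_jiffies(bfq_late_stable_merging)))
		/* consider the delayed stable merge */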
Fixes: 430a67f9d616 ("block, bfq: merge bursts of newly-created queues") Tested-by: Holger Hoffstätte Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20210619140948.98712-6-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 98a42ddb1760..7bf073ef9443 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2718,7 +2718,13 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, * costly and complicated. */ if (unlikely(!bfqd->nonrot_with_queueing)) { - if (bic->stable_merge_bfqq && + /* + * Make sure also that bfqq is sync, because + * bic->stable_merge_bfqq may point to some queue (for + * stable merging) also if bic is associated with a + * sync queue, but this bfqq is async + */ + if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq && !bfq_bfqq_just_created(bfqq) && time_is_before_jiffies(bfqq->split_time + msecs_to_jiffies(bfq_late_stable_merging)) && -- cgit v1.2.3 From efc72524b3a9e4e7bc7c07f756528736409ec1b7 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 19 Jun 2021 16:09:47 +0200 Subject: block, bfq: check waker only for queues with no in-flight I/O Consider two bfq_queues, say Q1 and Q2, with Q2 empty. If a request of Q1 gets completed shortly before a new request arrives for Q2, then BFQ flags Q1 as a candidate waker for Q2. Yet, the arrival of this new request may have a different cause, in the following case. If also Q2 has requests in flight while waiting for the arrival of a new request, then the completion of its own requests may be the actual cause of the awakening of the process that sends I/O to Q2. So Q1 may be flagged wrongly as a candidate waker. This commit avoids this deceptive flagging, by disabling candidate-waker flagging for Q2, if Q2 has in-flight I/O. Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20210619140948.98712-7-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7bf073ef9443..a273b2bcea2a 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1985,14 +1985,18 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) * Turning back to the detection of a waker queue, a queue Q is deemed * as a waker queue for bfqq if, for three consecutive times, bfqq * happens to become non empty right after a request of Q has been - * completed. In particular, on the first time, Q is tentatively set - * as a candidate waker queue, while on the third consecutive time - * that Q is detected, the field waker_bfqq is set to Q, to confirm - * that Q is a waker queue for bfqq. These detection steps are - * performed only if bfqq has a long think time, so as to make it more - * likely that bfqq's I/O is actually being blocked by a - * synchronization. This last filter, plus the above three-times - * requirement, make false positives less likely. + * completed. In this respect, even if bfqq is empty, we do not check + * for a waker if it still has some in-flight I/O. In fact, in this + * case bfqq is actually still being served by the drive, and may + * receive new I/O on the completion of some of the in-flight + * requests. 
In particular, on the first time, Q is tentatively set as + * a candidate waker queue, while on the third consecutive time that Q + * is detected, the field waker_bfqq is set to Q, to confirm that Q is + * a waker queue for bfqq. These detection steps are performed only if + * bfqq has a long think time, so as to make it more likely that + * bfqq's I/O is actually being blocked by a synchronization. This + * last filter, plus the above three-times requirement, make false + * positives less likely. * * NOTE * @@ -2018,6 +2022,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || bfq_bfqq_has_short_ttime(bfqq) || + bfqq->dispatched > 0 || now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) return; -- cgit v1.2.3 From 9a2ac41b13c573703d6689f51f3e27dd658324be Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 19 Jun 2021 16:09:48 +0200 Subject: block, bfq: reset waker pointer with shared queues Commit 85686d0dc194 ("block, bfq: keep shared queues out of the waker mechanism") leaves shared bfq_queues out of the waker-detection mechanism. It attains this goal by not updating the pointer last_completed_rq_bfqq, if the last request completed belongs to a shared bfq_queue (so that the pointer will not point to the shared bfq_queue). Yet this has a side effect: the pointer last_completed_rq_bfqq keeps pointing, deceptively, to a bfq_queue that actually is not the last one to have had a request completed. As a consequence, such a bfq_queue may deceptively be considered as a waker of some bfq_queue, even of some shared bfq_queue. To address this issue, reset last_completed_rq_bfqq if the last request completed belongs to a shared queue. Fixes: 85686d0dc194 ("block, bfq: keep shared queues out of the waker mechanism") Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20210619140948.98712-8-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a273b2bcea2a..fedb0a8fd388 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6165,11 +6165,13 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) * of other queues. But a false waker will unjustly steal * bandwidth to its supposedly woken queue. So considering * also shared queues in the waking mechanism may cause more - * control troubles than throughput benefits. Then do not set - * last_completed_rq_bfqq to bfqq if bfqq is a shared queue. + * control troubles than throughput benefits. Then reset + * last_completed_rq_bfqq if bfqq is a shared queue. */ if (!bfq_bfqq_coop(bfqq)) bfqd->last_completed_rq_bfqq = bfqq; + else + bfqd->last_completed_rq_bfqq = NULL; /* * If we are waiting to discover whether the request pattern -- cgit v1.2.3 From ddcc5c544eb0991501761622b651cf43ce660a22 Mon Sep 17 00:00:00 2001 From: Thomas Bracht Laumann Jespersen Date: Sat, 19 Jun 2021 21:51:31 +0200 Subject: block/partitions/msdos: Fix typo inidicator -> indicator Just a fix for a small typo in msdos_partition(). 
Signed-off-by: Thomas Bracht Laumann Jespersen Link: https://lore.kernel.org/r/20210619195130.19348-1-t@laumann.xyz Signed-off-by: Jens Axboe --- block/partitions/msdos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 8f2fcc080264..63e4f6f8b6e9 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -622,7 +622,7 @@ int msdos_partition(struct parsed_partitions *state) for (slot = 1; slot <= 4; slot++, p++) { if (p->boot_ind != 0 && p->boot_ind != 0x80) { /* - * Even without a valid boot inidicator value + * Even without a valid boot indicator value * its still possible this is valid FAT filesystem * without a partition table. */ -- cgit v1.2.3 From 60b6a7e6a0f4382cd689f9afdac816964fec2921 Mon Sep 17 00:00:00 2001 From: Edward Hsieh Date: Thu, 24 Jun 2021 20:30:30 +0800 Subject: block: fix trace completion for chained bio For chained bio, trace_block_bio_complete in bio_endio is currently called only by the parent bio once upon all chained bio completed. However, the sector and size for the parent bio are modified in bio_split. Therefore, the size and sector of the complete events might not match the queue events in blktrace. The original fix of bio completion trace ("block: trace completion of all bios.") wants multiple complete events to correspond to one queue event but missed this. The issue can be reproduced by md/raid5 read with bio cross chunks. To fix, move trace completion into the loop for every chained bio to call. Fixes: fbbaf700e7b1 ("block: trace completion of all bios.") Reviewed-by: Wade Liang Reviewed-by: BingJing Chang Signed-off-by: Edward Hsieh Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210624123030.27014-1-edwardh@synology.com Signed-off-by: Jens Axboe --- block/bio.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 44205dfb6b60..1fab762e079b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1375,8 +1375,7 @@ static inline bool bio_remaining_done(struct bio *bio) * * bio_endio() can be called several times on a bio that has been chained * using bio_chain(). The ->bi_end_io() function will only be called the - * last time. At this point the BLK_TA_COMPLETE tracing event will be - * generated if BIO_TRACE_COMPLETION is set. + * last time. **/ void bio_endio(struct bio *bio) { @@ -1389,6 +1388,11 @@ again: if (bio->bi_bdev) rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + } + /* * Need to have a real endio function for chained bios, otherwise * various corner cases will break (like stacking block devices that @@ -1402,11 +1406,6 @@ again: goto again; } - if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); - bio_clear_flag(bio, BIO_TRACE_COMPLETION); - } - blk_throtl_bio_endio(bio); /* release cgroup info */ bio_uninit(bio); -- cgit v1.2.3 From d5870edfa3afc4608231267ea3b8e4beb3eab1ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jun 2021 09:38:42 +0200 Subject: block: move the disk events code to a separate file Move the code for handling disk events from genhd.c into a new file as it isn't very related to the rest of the file while at the same time requiring lots of forward declarations. 
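The split keeps the interface between genhd.c and the new file private to the block layer; the declarations added to block/blk.h by this patch (see the diff below) are:

	void disk_alloc_events(struct gendisk *disk);
	void disk_add_events(struct gendisk *disk);
	void disk_del_events(struct gendisk *disk);
	void disk_release_events(struct gendisk *disk);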
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210624073843.251178-2-hch@lst.de Signed-off-by: Jens Axboe --- block/Makefile | 3 +- block/blk.h | 5 + block/disk-events.c | 484 +++++++++++++++++++++++++++++++++++++++++++++++++++ block/genhd.c | 492 ---------------------------------------------------- 4 files changed, 491 insertions(+), 493 deletions(-) create mode 100644 block/disk-events.c (limited to 'block') diff --git a/block/Makefile b/block/Makefile index b9db5d4edfc8..bfbe4e13ca1e 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,7 +8,8 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ blk-exec.o blk-merge.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ - genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o + genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \ + disk-events.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o diff --git a/block/blk.h b/block/blk.h index d3fa47af3607..f8d726429906 100644 --- a/block/blk.h +++ b/block/blk.h @@ -360,4 +360,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct request_queue *blk_alloc_queue(int node_id); +void disk_alloc_events(struct gendisk *disk); +void disk_add_events(struct gendisk *disk); +void disk_del_events(struct gendisk *disk); +void disk_release_events(struct gendisk *disk); + #endif /* BLK_INTERNAL_H */ diff --git a/block/disk-events.c b/block/disk-events.c new file mode 100644 index 000000000000..1bc5dcb75e4e --- /dev/null +++ b/block/disk-events.c @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Disk events - monitor disk events like media change and eject request. + */ +#include +#include +#include +#include "blk.h" + +struct disk_events { + struct list_head node; /* all disk_event's */ + struct gendisk *disk; /* the associated disk */ + spinlock_t lock; + + struct mutex block_mutex; /* protects blocking */ + int block; /* event blocking depth */ + unsigned int pending; /* events already sent out */ + unsigned int clearing; /* events being cleared */ + + long poll_msecs; /* interval, -1 for default */ + struct delayed_work dwork; +}; + +static const char *disk_events_strs[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", +}; + +static char *disk_uevents[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", +}; + +/* list of all disk_events */ +static DEFINE_MUTEX(disk_events_mutex); +static LIST_HEAD(disk_events); + +/* disable in-kernel polling by default */ +static unsigned long disk_events_dfl_poll_msecs; + +static unsigned long disk_events_poll_jiffies(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + long intv_msecs = 0; + + /* + * If device-specific poll interval is set, always use it. If + * the default is being used, poll if the POLL flag is set. + */ + if (ev->poll_msecs >= 0) + intv_msecs = ev->poll_msecs; + else if (disk->event_flags & DISK_EVENT_FLAG_POLL) + intv_msecs = disk_events_dfl_poll_msecs; + + return msecs_to_jiffies(intv_msecs); +} + +/** + * disk_block_events - block and flush disk event checking + * @disk: disk to block events for + * + * On return from this function, it is guaranteed that event checking + * isn't in progress and won't happen until unblocked by + * disk_unblock_events(). 
Events blocking is counted and the actual + * unblocking happens after the matching number of unblocks are done. + * + * Note that this intentionally does not block event checking from + * disk_clear_events(). + * + * CONTEXT: + * Might sleep. + */ +void disk_block_events(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + unsigned long flags; + bool cancel; + + if (!ev) + return; + + /* + * Outer mutex ensures that the first blocker completes canceling + * the event work before further blockers are allowed to finish. + */ + mutex_lock(&ev->block_mutex); + + spin_lock_irqsave(&ev->lock, flags); + cancel = !ev->block++; + spin_unlock_irqrestore(&ev->lock, flags); + + if (cancel) + cancel_delayed_work_sync(&disk->ev->dwork); + + mutex_unlock(&ev->block_mutex); +} + +static void __disk_unblock_events(struct gendisk *disk, bool check_now) +{ + struct disk_events *ev = disk->ev; + unsigned long intv; + unsigned long flags; + + spin_lock_irqsave(&ev->lock, flags); + + if (WARN_ON_ONCE(ev->block <= 0)) + goto out_unlock; + + if (--ev->block) + goto out_unlock; + + intv = disk_events_poll_jiffies(disk); + if (check_now) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); + else if (intv) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, intv); +out_unlock: + spin_unlock_irqrestore(&ev->lock, flags); +} + +/** + * disk_unblock_events - unblock disk event checking + * @disk: disk to unblock events for + * + * Undo disk_block_events(). When the block count reaches zero, it + * starts events polling if configured. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_unblock_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_unblock_events(disk, false); +} + +/** + * disk_flush_events - schedule immediate event checking and flushing + * @disk: disk to check and flush events for + * @mask: events to flush + * + * Schedule immediate event checking on @disk if not blocked. Events in + * @mask are scheduled to be cleared from the driver. Note that this + * doesn't clear the events from @disk->ev. + * + * CONTEXT: + * If @mask is non-zero must be called with disk->open_mutex held. + */ +void disk_flush_events(struct gendisk *disk, unsigned int mask) +{ + struct disk_events *ev = disk->ev; + + if (!ev) + return; + + spin_lock_irq(&ev->lock); + ev->clearing |= mask; + if (!ev->block) + mod_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); + spin_unlock_irq(&ev->lock); +} + +static void disk_check_events(struct disk_events *ev, + unsigned int *clearing_ptr) +{ + struct gendisk *disk = ev->disk; + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + unsigned int clearing = *clearing_ptr; + unsigned int events; + unsigned long intv; + int nr_events = 0, i; + + /* check events */ + events = disk->fops->check_events(disk, clearing); + + /* accumulate pending events and schedule next poll if necessary */ + spin_lock_irq(&ev->lock); + + events &= ~ev->pending; + ev->pending |= events; + *clearing_ptr &= ~clearing; + + intv = disk_events_poll_jiffies(disk); + if (!ev->block && intv) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, intv); + + spin_unlock_irq(&ev->lock); + + /* + * Tell userland about new events. Only the events listed in + * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT + * is set. Otherwise, events are processed internally but never + * get reported to userland. 
+ */ + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if ((events & disk->events & (1 << i)) && + (disk->event_flags & DISK_EVENT_FLAG_UEVENT)) + envp[nr_events++] = disk_uevents[i]; + + if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); +} + +/** + * disk_clear_events - synchronously check, clear and return pending events + * @disk: disk to fetch and clear events from + * @mask: mask of events to be fetched and cleared + * + * Disk events are synchronously checked and pending events in @mask + * are cleared and returned. This ignores the block count. + * + * CONTEXT: + * Might sleep. + */ +static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) +{ + struct disk_events *ev = disk->ev; + unsigned int pending; + unsigned int clearing = mask; + + if (!ev) + return 0; + + disk_block_events(disk); + + /* + * store the union of mask and ev->clearing on the stack so that the + * race with disk_flush_events does not cause ambiguity (ev->clearing + * can still be modified even if events are blocked). + */ + spin_lock_irq(&ev->lock); + clearing |= ev->clearing; + ev->clearing = 0; + spin_unlock_irq(&ev->lock); + + disk_check_events(ev, &clearing); + /* + * if ev->clearing is not 0, the disk_flush_events got called in the + * middle of this function, so we want to run the workfn without delay. + */ + __disk_unblock_events(disk, ev->clearing ? true : false); + + /* then, fetch and clear pending events */ + spin_lock_irq(&ev->lock); + pending = ev->pending & mask; + ev->pending &= ~mask; + spin_unlock_irq(&ev->lock); + WARN_ON_ONCE(clearing & mask); + + return pending; +} + +/** + * bdev_check_media_change - check if a removable media has been changed + * @bdev: block device to check + * + * Check whether a removable media has been changed, and attempt to free all + * dentries and inodes and invalidates all block device page cache entries in + * that case. + * + * Returns %true if the block device changed, or %false if not. + */ +bool bdev_check_media_change(struct block_device *bdev) +{ + unsigned int events; + + events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE | + DISK_EVENT_EJECT_REQUEST); + if (!(events & DISK_EVENT_MEDIA_CHANGE)) + return false; + + if (__invalidate_device(bdev, true)) + pr_warn("VFS: busy inodes on changed media %s\n", + bdev->bd_disk->disk_name); + set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + return true; +} +EXPORT_SYMBOL(bdev_check_media_change); + +/* + * Separate this part out so that a different pointer for clearing_ptr can be + * passed in for disk_clear_events. + */ +static void disk_events_workfn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct disk_events *ev = container_of(dwork, struct disk_events, dwork); + + disk_check_events(ev, &ev->clearing); +} + +/* + * A disk events enabled device has the following sysfs nodes under + * its /sys/block/X/ directory. 
+ * + * events : list of all supported events + * events_async : list of events which can be detected w/o polling + * (always empty, only for backwards compatibility) + * events_poll_msecs : polling interval, 0: disable, -1: system default + */ +static ssize_t __disk_events_show(unsigned int events, char *buf) +{ + const char *delim = ""; + ssize_t pos = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) + if (events & (1 << i)) { + pos += sprintf(buf + pos, "%s%s", + delim, disk_events_strs[i]); + delim = " "; + } + if (pos) + pos += sprintf(buf + pos, "\n"); + return pos; +} + +static ssize_t disk_events_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT)) + return 0; + return __disk_events_show(disk->events, buf); +} + +static ssize_t disk_events_async_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return 0; +} + +static ssize_t disk_events_poll_msecs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->ev) + return sprintf(buf, "-1\n"); + return sprintf(buf, "%ld\n", disk->ev->poll_msecs); +} + +static ssize_t disk_events_poll_msecs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + long intv; + + if (!count || !sscanf(buf, "%ld", &intv)) + return -EINVAL; + + if (intv < 0 && intv != -1) + return -EINVAL; + + if (!disk->ev) + return -ENODEV; + + disk_block_events(disk); + disk->ev->poll_msecs = intv; + __disk_unblock_events(disk, true); + return count; +} + +static const DEVICE_ATTR(events, 0444, disk_events_show, NULL); +static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); +static const DEVICE_ATTR(events_poll_msecs, 0644, + disk_events_poll_msecs_show, + disk_events_poll_msecs_store); + +static const struct attribute *disk_events_attrs[] = { + &dev_attr_events.attr, + &dev_attr_events_async.attr, + &dev_attr_events_poll_msecs.attr, + NULL, +}; + +/* + * The default polling interval can be specified by the kernel + * parameter block.events_dfl_poll_msecs which defaults to 0 + * (disable). This can also be modified runtime by writing to + * /sys/module/block/parameters/events_dfl_poll_msecs. + */ +static int disk_events_set_dfl_poll_msecs(const char *val, + const struct kernel_param *kp) +{ + struct disk_events *ev; + int ret; + + ret = param_set_ulong(val, kp); + if (ret < 0) + return ret; + + mutex_lock(&disk_events_mutex); + list_for_each_entry(ev, &disk_events, node) + disk_flush_events(ev->disk, 0); + mutex_unlock(&disk_events_mutex); + return 0; +} + +static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { + .set = disk_events_set_dfl_poll_msecs, + .get = param_get_ulong, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "block." + +module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, + &disk_events_dfl_poll_msecs, 0644); + +/* + * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. 
+ */ +void disk_alloc_events(struct gendisk *disk) +{ + struct disk_events *ev; + + if (!disk->fops->check_events || !disk->events) + return; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) { + pr_warn("%s: failed to initialize events\n", disk->disk_name); + return; + } + + INIT_LIST_HEAD(&ev->node); + ev->disk = disk; + spin_lock_init(&ev->lock); + mutex_init(&ev->block_mutex); + ev->block = 1; + ev->poll_msecs = -1; + INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); + + disk->ev = ev; +} + +void disk_add_events(struct gendisk *disk) +{ + /* FIXME: error handling */ + if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) + pr_warn("%s: failed to create sysfs files for events\n", + disk->disk_name); + + if (!disk->ev) + return; + + mutex_lock(&disk_events_mutex); + list_add_tail(&disk->ev->node, &disk_events); + mutex_unlock(&disk_events_mutex); + + /* + * Block count is initialized to 1 and the following initial + * unblock kicks it into action. + */ + __disk_unblock_events(disk, true); +} + +void disk_del_events(struct gendisk *disk) +{ + if (disk->ev) { + disk_block_events(disk); + + mutex_lock(&disk_events_mutex); + list_del_init(&disk->ev->node); + mutex_unlock(&disk_events_mutex); + } + + sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); +} + +void disk_release_events(struct gendisk *disk) +{ + /* the block count should be 1 from disk_del_events() */ + WARN_ON_ONCE(disk->ev && disk->ev->block != 1); + kfree(disk->ev); +} diff --git a/block/genhd.c b/block/genhd.c index 5f5628216295..4f879deede9a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -33,13 +33,6 @@ static struct kobject *block_depr; #define NR_EXT_DEVT (1 << MINORBITS) static DEFINE_IDA(ext_devt_ida); -static void disk_check_events(struct disk_events *ev, - unsigned int *clearing_ptr); -static void disk_alloc_events(struct gendisk *disk); -static void disk_add_events(struct gendisk *disk); -static void disk_del_events(struct gendisk *disk); -static void disk_release_events(struct gendisk *disk); - void set_capacity(struct gendisk *disk, sector_t sectors) { struct block_device *bdev = disk->part0; @@ -1367,488 +1360,3 @@ int bdev_read_only(struct block_device *bdev) return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); } EXPORT_SYMBOL(bdev_read_only); - -/* - * Disk events - monitor disk events like media change and eject request. - */ -struct disk_events { - struct list_head node; /* all disk_event's */ - struct gendisk *disk; /* the associated disk */ - spinlock_t lock; - - struct mutex block_mutex; /* protects blocking */ - int block; /* event blocking depth */ - unsigned int pending; /* events already sent out */ - unsigned int clearing; /* events being cleared */ - - long poll_msecs; /* interval, -1 for default */ - struct delayed_work dwork; -}; - -static const char *disk_events_strs[] = { - [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", - [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", -}; - -static char *disk_uevents[] = { - [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", - [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", -}; - -/* list of all disk_events */ -static DEFINE_MUTEX(disk_events_mutex); -static LIST_HEAD(disk_events); - -/* disable in-kernel polling by default */ -static unsigned long disk_events_dfl_poll_msecs; - -static unsigned long disk_events_poll_jiffies(struct gendisk *disk) -{ - struct disk_events *ev = disk->ev; - long intv_msecs = 0; - - /* - * If device-specific poll interval is set, always use it. 
If - * the default is being used, poll if the POLL flag is set. - */ - if (ev->poll_msecs >= 0) - intv_msecs = ev->poll_msecs; - else if (disk->event_flags & DISK_EVENT_FLAG_POLL) - intv_msecs = disk_events_dfl_poll_msecs; - - return msecs_to_jiffies(intv_msecs); -} - -/** - * disk_block_events - block and flush disk event checking - * @disk: disk to block events for - * - * On return from this function, it is guaranteed that event checking - * isn't in progress and won't happen until unblocked by - * disk_unblock_events(). Events blocking is counted and the actual - * unblocking happens after the matching number of unblocks are done. - * - * Note that this intentionally does not block event checking from - * disk_clear_events(). - * - * CONTEXT: - * Might sleep. - */ -void disk_block_events(struct gendisk *disk) -{ - struct disk_events *ev = disk->ev; - unsigned long flags; - bool cancel; - - if (!ev) - return; - - /* - * Outer mutex ensures that the first blocker completes canceling - * the event work before further blockers are allowed to finish. - */ - mutex_lock(&ev->block_mutex); - - spin_lock_irqsave(&ev->lock, flags); - cancel = !ev->block++; - spin_unlock_irqrestore(&ev->lock, flags); - - if (cancel) - cancel_delayed_work_sync(&disk->ev->dwork); - - mutex_unlock(&ev->block_mutex); -} - -static void __disk_unblock_events(struct gendisk *disk, bool check_now) -{ - struct disk_events *ev = disk->ev; - unsigned long intv; - unsigned long flags; - - spin_lock_irqsave(&ev->lock, flags); - - if (WARN_ON_ONCE(ev->block <= 0)) - goto out_unlock; - - if (--ev->block) - goto out_unlock; - - intv = disk_events_poll_jiffies(disk); - if (check_now) - queue_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, 0); - else if (intv) - queue_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, intv); -out_unlock: - spin_unlock_irqrestore(&ev->lock, flags); -} - -/** - * disk_unblock_events - unblock disk event checking - * @disk: disk to unblock events for - * - * Undo disk_block_events(). When the block count reaches zero, it - * starts events polling if configured. - * - * CONTEXT: - * Don't care. Safe to call from irq context. - */ -void disk_unblock_events(struct gendisk *disk) -{ - if (disk->ev) - __disk_unblock_events(disk, false); -} - -/** - * disk_flush_events - schedule immediate event checking and flushing - * @disk: disk to check and flush events for - * @mask: events to flush - * - * Schedule immediate event checking on @disk if not blocked. Events in - * @mask are scheduled to be cleared from the driver. Note that this - * doesn't clear the events from @disk->ev. - * - * CONTEXT: - * If @mask is non-zero must be called with disk->open_mutex held. - */ -void disk_flush_events(struct gendisk *disk, unsigned int mask) -{ - struct disk_events *ev = disk->ev; - - if (!ev) - return; - - spin_lock_irq(&ev->lock); - ev->clearing |= mask; - if (!ev->block) - mod_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, 0); - spin_unlock_irq(&ev->lock); -} - -/** - * disk_clear_events - synchronously check, clear and return pending events - * @disk: disk to fetch and clear events from - * @mask: mask of events to be fetched and cleared - * - * Disk events are synchronously checked and pending events in @mask - * are cleared and returned. This ignores the block count. - * - * CONTEXT: - * Might sleep. 
- */ -static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) -{ - struct disk_events *ev = disk->ev; - unsigned int pending; - unsigned int clearing = mask; - - if (!ev) - return 0; - - disk_block_events(disk); - - /* - * store the union of mask and ev->clearing on the stack so that the - * race with disk_flush_events does not cause ambiguity (ev->clearing - * can still be modified even if events are blocked). - */ - spin_lock_irq(&ev->lock); - clearing |= ev->clearing; - ev->clearing = 0; - spin_unlock_irq(&ev->lock); - - disk_check_events(ev, &clearing); - /* - * if ev->clearing is not 0, the disk_flush_events got called in the - * middle of this function, so we want to run the workfn without delay. - */ - __disk_unblock_events(disk, ev->clearing ? true : false); - - /* then, fetch and clear pending events */ - spin_lock_irq(&ev->lock); - pending = ev->pending & mask; - ev->pending &= ~mask; - spin_unlock_irq(&ev->lock); - WARN_ON_ONCE(clearing & mask); - - return pending; -} - -/** - * bdev_check_media_change - check if a removable media has been changed - * @bdev: block device to check - * - * Check whether a removable media has been changed, and attempt to free all - * dentries and inodes and invalidates all block device page cache entries in - * that case. - * - * Returns %true if the block device changed, or %false if not. - */ -bool bdev_check_media_change(struct block_device *bdev) -{ - unsigned int events; - - events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE | - DISK_EVENT_EJECT_REQUEST); - if (!(events & DISK_EVENT_MEDIA_CHANGE)) - return false; - - if (__invalidate_device(bdev, true)) - pr_warn("VFS: busy inodes on changed media %s\n", - bdev->bd_disk->disk_name); - set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); - return true; -} -EXPORT_SYMBOL(bdev_check_media_change); - -/* - * Separate this part out so that a different pointer for clearing_ptr can be - * passed in for disk_clear_events. - */ -static void disk_events_workfn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct disk_events *ev = container_of(dwork, struct disk_events, dwork); - - disk_check_events(ev, &ev->clearing); -} - -static void disk_check_events(struct disk_events *ev, - unsigned int *clearing_ptr) -{ - struct gendisk *disk = ev->disk; - char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; - unsigned int clearing = *clearing_ptr; - unsigned int events; - unsigned long intv; - int nr_events = 0, i; - - /* check events */ - events = disk->fops->check_events(disk, clearing); - - /* accumulate pending events and schedule next poll if necessary */ - spin_lock_irq(&ev->lock); - - events &= ~ev->pending; - ev->pending |= events; - *clearing_ptr &= ~clearing; - - intv = disk_events_poll_jiffies(disk); - if (!ev->block && intv) - queue_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, intv); - - spin_unlock_irq(&ev->lock); - - /* - * Tell userland about new events. Only the events listed in - * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT - * is set. Otherwise, events are processed internally but never - * get reported to userland. 
- */ - for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) - if ((events & disk->events & (1 << i)) && - (disk->event_flags & DISK_EVENT_FLAG_UEVENT)) - envp[nr_events++] = disk_uevents[i]; - - if (nr_events) - kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); -} - -/* - * A disk events enabled device has the following sysfs nodes under - * its /sys/block/X/ directory. - * - * events : list of all supported events - * events_async : list of events which can be detected w/o polling - * (always empty, only for backwards compatibility) - * events_poll_msecs : polling interval, 0: disable, -1: system default - */ -static ssize_t __disk_events_show(unsigned int events, char *buf) -{ - const char *delim = ""; - ssize_t pos = 0; - int i; - - for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) - if (events & (1 << i)) { - pos += sprintf(buf + pos, "%s%s", - delim, disk_events_strs[i]); - delim = " "; - } - if (pos) - pos += sprintf(buf + pos, "\n"); - return pos; -} - -static ssize_t disk_events_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT)) - return 0; - - return __disk_events_show(disk->events, buf); -} - -static ssize_t disk_events_async_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return 0; -} - -static ssize_t disk_events_poll_msecs_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - if (!disk->ev) - return sprintf(buf, "-1\n"); - - return sprintf(buf, "%ld\n", disk->ev->poll_msecs); -} - -static ssize_t disk_events_poll_msecs_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct gendisk *disk = dev_to_disk(dev); - long intv; - - if (!count || !sscanf(buf, "%ld", &intv)) - return -EINVAL; - - if (intv < 0 && intv != -1) - return -EINVAL; - - if (!disk->ev) - return -ENODEV; - - disk_block_events(disk); - disk->ev->poll_msecs = intv; - __disk_unblock_events(disk, true); - - return count; -} - -static const DEVICE_ATTR(events, 0444, disk_events_show, NULL); -static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); -static const DEVICE_ATTR(events_poll_msecs, 0644, - disk_events_poll_msecs_show, - disk_events_poll_msecs_store); - -static const struct attribute *disk_events_attrs[] = { - &dev_attr_events.attr, - &dev_attr_events_async.attr, - &dev_attr_events_poll_msecs.attr, - NULL, -}; - -/* - * The default polling interval can be specified by the kernel - * parameter block.events_dfl_poll_msecs which defaults to 0 - * (disable). This can also be modified runtime by writing to - * /sys/module/block/parameters/events_dfl_poll_msecs. - */ -static int disk_events_set_dfl_poll_msecs(const char *val, - const struct kernel_param *kp) -{ - struct disk_events *ev; - int ret; - - ret = param_set_ulong(val, kp); - if (ret < 0) - return ret; - - mutex_lock(&disk_events_mutex); - - list_for_each_entry(ev, &disk_events, node) - disk_flush_events(ev->disk, 0); - - mutex_unlock(&disk_events_mutex); - - return 0; -} - -static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { - .set = disk_events_set_dfl_poll_msecs, - .get = param_get_ulong, -}; - -#undef MODULE_PARAM_PREFIX -#define MODULE_PARAM_PREFIX "block." 
- -module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, - &disk_events_dfl_poll_msecs, 0644); - -/* - * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. - */ -static void disk_alloc_events(struct gendisk *disk) -{ - struct disk_events *ev; - - if (!disk->fops->check_events || !disk->events) - return; - - ev = kzalloc(sizeof(*ev), GFP_KERNEL); - if (!ev) { - pr_warn("%s: failed to initialize events\n", disk->disk_name); - return; - } - - INIT_LIST_HEAD(&ev->node); - ev->disk = disk; - spin_lock_init(&ev->lock); - mutex_init(&ev->block_mutex); - ev->block = 1; - ev->poll_msecs = -1; - INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); - - disk->ev = ev; -} - -static void disk_add_events(struct gendisk *disk) -{ - /* FIXME: error handling */ - if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) - pr_warn("%s: failed to create sysfs files for events\n", - disk->disk_name); - - if (!disk->ev) - return; - - mutex_lock(&disk_events_mutex); - list_add_tail(&disk->ev->node, &disk_events); - mutex_unlock(&disk_events_mutex); - - /* - * Block count is initialized to 1 and the following initial - * unblock kicks it into action. - */ - __disk_unblock_events(disk, true); -} - -static void disk_del_events(struct gendisk *disk) -{ - if (disk->ev) { - disk_block_events(disk); - - mutex_lock(&disk_events_mutex); - list_del_init(&disk->ev->node); - mutex_unlock(&disk_events_mutex); - } - - sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); -} - -static void disk_release_events(struct gendisk *disk) -{ - /* the block count should be 1 from disk_del_events() */ - WARN_ON_ONCE(disk->ev && disk->ev->block != 1); - kfree(disk->ev); -} -- cgit v1.2.3 From 2bc8cda5ea4b42ff78be1b11011092d57b424d37 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jun 2021 09:38:43 +0200 Subject: block: add the events* attributes to disk_attrs Add the events attributes to the disk_attrs array, which ensures they are added by the driver core when the device is created rather than adding them after the device has been added, which is racy versus uevents and requires more boilerplate code. 
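The race avoided by this change is worth spelling out: attributes registered through the device's attribute groups are created by the driver core inside device_add(), before the KOBJ_ADD uevent is emitted, so userspace reacting to the uevent can never observe the disk without its events files. A minimal sketch of that pattern follows; the "foo" attribute and its value are hypothetical and only illustrate the mechanism, they are not the block layer's code.

#include <linux/device.h>
#include <linux/sysfs.h>

/* Hypothetical read-only attribute, created together with the device. */
static ssize_t foo_show(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%d\n", 42);	/* placeholder value */
}
static DEVICE_ATTR_RO(foo);

static struct attribute *foo_attrs[] = {
	&dev_attr_foo.attr,
	NULL,
};
ATTRIBUTE_GROUPS(foo);

/*
 * At registration time the group is hooked up before device_add(), so
 * the sysfs file already exists when KOBJ_ADD is sent; there is no
 * separate sysfs_create_files() call and no window where the file is
 * missing:
 *
 *	dev->groups = foo_groups;
 *	device_add(dev);
 */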
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210624073843.251178-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 3 +++ block/disk-events.c | 23 ++++------------------- block/genhd.c | 3 +++ 3 files changed, 10 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index f8d726429906..4fcd7a032377 100644 --- a/block/blk.h +++ b/block/blk.h @@ -364,5 +364,8 @@ void disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); void disk_del_events(struct gendisk *disk); void disk_release_events(struct gendisk *disk); +extern struct device_attribute dev_attr_events; +extern struct device_attribute dev_attr_events_async; +extern struct device_attribute dev_attr_events_poll_msecs; #endif /* BLK_INTERNAL_H */ diff --git a/block/disk-events.c b/block/disk-events.c index 1bc5dcb75e4e..a75931ff5da4 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -368,18 +368,10 @@ static ssize_t disk_events_poll_msecs_store(struct device *dev, return count; } -static const DEVICE_ATTR(events, 0444, disk_events_show, NULL); -static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); -static const DEVICE_ATTR(events_poll_msecs, 0644, - disk_events_poll_msecs_show, - disk_events_poll_msecs_store); - -static const struct attribute *disk_events_attrs[] = { - &dev_attr_events.attr, - &dev_attr_events_async.attr, - &dev_attr_events_poll_msecs.attr, - NULL, -}; +DEVICE_ATTR(events, 0444, disk_events_show, NULL); +DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); +DEVICE_ATTR(events_poll_msecs, 0644, disk_events_poll_msecs_show, + disk_events_poll_msecs_store); /* * The default polling interval can be specified by the kernel @@ -444,11 +436,6 @@ void disk_alloc_events(struct gendisk *disk) void disk_add_events(struct gendisk *disk) { - /* FIXME: error handling */ - if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) - pr_warn("%s: failed to create sysfs files for events\n", - disk->disk_name); - if (!disk->ev) return; @@ -472,8 +459,6 @@ void disk_del_events(struct gendisk *disk) list_del_init(&disk->ev->node); mutex_unlock(&disk_events_mutex); } - - sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); } void disk_release_events(struct gendisk *disk) diff --git a/block/genhd.c b/block/genhd.c index 4f879deede9a..79aa40b4c39c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1022,6 +1022,9 @@ static struct attribute *disk_attrs[] = { &dev_attr_stat.attr, &dev_attr_inflight.attr, &dev_attr_badblocks.attr, + &dev_attr_events.attr, + &dev_attr_events_async.attr, + &dev_attr_events_poll_msecs.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif -- cgit v1.2.3 From 630161cfdf5cdc696a82b59410d1ff00b23d946e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jun 2021 14:32:39 +0200 Subject: block: move bdev_disk_changed Move bdev_disk_changed to block/partitions/core.c, together with the rest of the partition scanning code. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210624123240.441814-2-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/block_dev.c | 53 ----------------------------------------------- include/linux/genhd.h | 1 - 3 files changed, 54 insertions(+), 55 deletions(-) (limited to 'block') diff --git a/block/partitions/core.c b/block/partitions/core.c index 186d4fbd9f09..b79785f7027c 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -596,7 +596,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, return true; } -int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) +static int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) { struct parsed_partitions *state; int ret = -EAGAIN, p; @@ -657,6 +657,59 @@ out_free_state: return ret; } +int bdev_disk_changed(struct block_device *bdev, bool invalidate) +{ + struct gendisk *disk = bdev->bd_disk; + int ret = 0; + + lockdep_assert_held(&disk->open_mutex); + + if (!(disk->flags & GENHD_FL_UP)) + return -ENXIO; + +rescan: + if (disk->open_partitions) + return -EBUSY; + sync_blockdev(bdev); + invalidate_bdev(bdev); + blk_drop_partitions(disk); + + clear_bit(GD_NEED_PART_SCAN, &disk->state); + + /* + * Historically we only set the capacity to zero for devices that + * support partitions (independ of actually having partitions created). + * Doing that is rather inconsistent, but changing it broke legacy + * udisks polling for legacy ide-cdrom devices. Use the crude check + * below to get the sane behavior for most device while not breaking + * userspace for this particular setup. + */ + if (invalidate) { + if (disk_part_scan_enabled(disk) || + !(disk->flags & GENHD_FL_REMOVABLE)) + set_capacity(disk, 0); + } + + if (get_capacity(disk)) { + ret = blk_add_partitions(disk, bdev); + if (ret == -EAGAIN) + goto rescan; + } else if (invalidate) { + /* + * Tell userspace that the media / partition table may have + * changed. + */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + } + + return ret; +} +/* + * Only exported for loop and dasd for historic reasons. Don't use in new + * code! + */ +EXPORT_SYMBOL_GPL(bdev_disk_changed); + void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { struct address_space *mapping = state->bdev->bd_inode->i_mapping; diff --git a/fs/block_dev.c b/fs/block_dev.c index ac9b3c158a77..5b3a73ecb696 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1242,59 +1242,6 @@ static void blkdev_flush_mapping(struct block_device *bdev) bdev_write_inode(bdev); } -int bdev_disk_changed(struct block_device *bdev, bool invalidate) -{ - struct gendisk *disk = bdev->bd_disk; - int ret = 0; - - lockdep_assert_held(&disk->open_mutex); - - if (!(disk->flags & GENHD_FL_UP)) - return -ENXIO; - -rescan: - if (disk->open_partitions) - return -EBUSY; - sync_blockdev(bdev); - invalidate_bdev(bdev); - blk_drop_partitions(disk); - - clear_bit(GD_NEED_PART_SCAN, &disk->state); - - /* - * Historically we only set the capacity to zero for devices that - * support partitions (independ of actually having partitions created). - * Doing that is rather inconsistent, but changing it broke legacy - * udisks polling for legacy ide-cdrom devices. Use the crude check - * below to get the sane behavior for most device while not breaking - * userspace for this particular setup. 
- */ - if (invalidate) { - if (disk_part_scan_enabled(disk) || - !(disk->flags & GENHD_FL_REMOVABLE)) - set_capacity(disk, 0); - } - - if (get_capacity(disk)) { - ret = blk_add_partitions(disk, bdev); - if (ret == -EAGAIN) - goto rescan; - } else if (invalidate) { - /* - * Tell userspace that the media / partition table may have - * changed. - */ - kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); - } - - return ret; -} -/* - * Only exported for loop and dasd for historic reasons. Don't use in new - * code! - */ -EXPORT_SYMBOL_GPL(bdev_disk_changed); - static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 03d684f0498f..f5f0c9bdf1d2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -257,7 +257,6 @@ static inline sector_t get_capacity(struct gendisk *disk) } int bdev_disk_changed(struct block_device *bdev, bool invalidate); -int blk_add_partitions(struct gendisk *disk, struct block_device *bdev); void blk_drop_partitions(struct gendisk *disk); extern struct gendisk *__alloc_disk_node(int minors, int node_id); -- cgit v1.2.3 From 0384264ea8a39bd98c9a3158060565f650c056a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jun 2021 14:32:40 +0200 Subject: block: pass a gendisk to bdev_disk_changed bdev_disk_changed can only operate on whole devices. Make that clear by passing a gendisk instead of the struct block_device. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210624123240.441814-3-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 22 ++++++++++------------ drivers/block/loop.c | 21 ++++++++++----------- drivers/s390/block/dasd_genhd.c | 4 ++-- fs/block_dev.c | 4 ++-- include/linux/genhd.h | 2 +- 5 files changed, 25 insertions(+), 28 deletions(-) (limited to 'block') diff --git a/block/partitions/core.c b/block/partitions/core.c index b79785f7027c..347c56a51d87 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -120,8 +120,7 @@ static void free_partitions(struct parsed_partitions *state) kfree(state); } -static struct parsed_partitions *check_partition(struct gendisk *hd, - struct block_device *bdev) +static struct parsed_partitions *check_partition(struct gendisk *hd) { struct parsed_partitions *state; int i, res, err; @@ -136,7 +135,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd, } state->pp_buf[0] = '\0'; - state->bdev = bdev; + state->bdev = hd->part0; disk_name(hd, 0, state->name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) @@ -546,7 +545,7 @@ void blk_drop_partitions(struct gendisk *disk) } } -static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, +static bool blk_add_partition(struct gendisk *disk, struct parsed_partitions *state, int p) { sector_t size = state->parts[p].size; @@ -596,7 +595,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, return true; } -static int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) +static int blk_add_partitions(struct gendisk *disk) { struct parsed_partitions *state; int ret = -EAGAIN, p; @@ -604,7 +603,7 @@ static int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) if (!disk_part_scan_enabled(disk)) return 0; - state = check_partition(disk, bdev); + state = check_partition(disk); if (!state) return 0; if (IS_ERR(state)) { @@ -648,7 +647,7 @@ static int 
blk_add_partitions(struct gendisk *disk, struct block_device *bdev) kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); for (p = 1; p < state->limit; p++) - if (!blk_add_partition(disk, bdev, state, p)) + if (!blk_add_partition(disk, state, p)) goto out_free_state; ret = 0; @@ -657,9 +656,8 @@ out_free_state: return ret; } -int bdev_disk_changed(struct block_device *bdev, bool invalidate) +int bdev_disk_changed(struct gendisk *disk, bool invalidate) { - struct gendisk *disk = bdev->bd_disk; int ret = 0; lockdep_assert_held(&disk->open_mutex); @@ -670,8 +668,8 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) rescan: if (disk->open_partitions) return -EBUSY; - sync_blockdev(bdev); - invalidate_bdev(bdev); + sync_blockdev(disk->part0); + invalidate_bdev(disk->part0); blk_drop_partitions(disk); clear_bit(GD_NEED_PART_SCAN, &disk->state); @@ -691,7 +689,7 @@ rescan: } if (get_capacity(disk)) { - ret = blk_add_partitions(disk, bdev); + ret = blk_add_partitions(disk); if (ret == -EAGAIN) goto rescan; } else if (invalidate) { diff --git a/drivers/block/loop.c b/drivers/block/loop.c index e90f7d349816..4fb1f9530d5a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -647,14 +647,13 @@ static inline void loop_update_dio(struct loop_device *lo) lo->use_dio); } -static void loop_reread_partitions(struct loop_device *lo, - struct block_device *bdev) +static void loop_reread_partitions(struct loop_device *lo) { int rc; - mutex_lock(&bdev->bd_disk->open_mutex); - rc = bdev_disk_changed(bdev, false); - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_lock(&lo->lo_disk->open_mutex); + rc = bdev_disk_changed(lo->lo_disk, false); + mutex_unlock(&lo->lo_disk->open_mutex); if (rc) pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", __func__, lo->lo_number, lo->lo_file_name, rc); @@ -752,7 +751,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, */ fput(old_file); if (partscan) - loop_reread_partitions(lo, bdev); + loop_reread_partitions(lo); return 0; out_err: @@ -1174,7 +1173,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, bdgrab(bdev); mutex_unlock(&lo->lo_mutex); if (partscan) - loop_reread_partitions(lo, bdev); + loop_reread_partitions(lo); if (!(mode & FMODE_EXCL)) bd_abort_claiming(bdev, loop_configure); return 0; @@ -1268,10 +1267,10 @@ out_unlock: * current holder is released. 
*/ if (!release) - mutex_lock(&bdev->bd_disk->open_mutex); - err = bdev_disk_changed(bdev, false); + mutex_lock(&lo->lo_disk->open_mutex); + err = bdev_disk_changed(lo->lo_disk, false); if (!release) - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&lo->lo_disk->open_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", __func__, lo_number, err); @@ -1416,7 +1415,7 @@ out_unfreeze: out_unlock: mutex_unlock(&lo->lo_mutex); if (partscan) - loop_reread_partitions(lo, bdev); + loop_reread_partitions(lo); return err; } diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index bf2082d461c7..493e8469893c 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -110,7 +110,7 @@ int dasd_scan_partitions(struct dasd_block *block) } mutex_lock(&block->gdp->open_mutex); - rc = bdev_disk_changed(bdev, false); + rc = bdev_disk_changed(block->gdp, false); mutex_unlock(&block->gdp->open_mutex); if (rc) DBF_DEV_EVENT(DBF_ERR, block->base, @@ -146,7 +146,7 @@ void dasd_destroy_partitions(struct dasd_block *block) block->bdev = NULL; mutex_lock(&bdev->bd_disk->open_mutex); - bdev_disk_changed(bdev, true); + bdev_disk_changed(bdev->bd_disk, true); mutex_unlock(&bdev->bd_disk->open_mutex); /* Matching blkdev_put to the blkdev_get in dasd_scan_partitions. */ diff --git a/fs/block_dev.c b/fs/block_dev.c index 5b3a73ecb696..34253d155f5c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1253,7 +1253,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) /* avoid ghost partitions on a removed medium */ if (ret == -ENOMEDIUM && test_bit(GD_NEED_PART_SCAN, &disk->state)) - bdev_disk_changed(bdev, true); + bdev_disk_changed(disk, true); return ret; } } @@ -1264,7 +1264,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } if (test_bit(GD_NEED_PART_SCAN, &disk->state)) - bdev_disk_changed(bdev, false); + bdev_disk_changed(disk, false); bdev->bd_openers++; return 0;; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index f5f0c9bdf1d2..13b34177cc85 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -256,7 +256,7 @@ static inline sector_t get_capacity(struct gendisk *disk) return bdev_nr_sectors(disk->part0); } -int bdev_disk_changed(struct block_device *bdev, bool invalidate); +int bdev_disk_changed(struct gendisk *disk, bool invalidate); void blk_drop_partitions(struct gendisk *disk); extern struct gendisk *__alloc_disk_node(int minors, int node_id); -- cgit v1.2.3 From a921c655f2033dd1ce1379128efe881dda23ea37 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 23 Jun 2021 11:36:33 +0200 Subject: bfq: Remove merged request already in bfq_requests_merged() Currently, bfq does very little in bfq_requests_merged() and handles all the request cleanup in bfq_finish_requeue_request() called from blk_mq_free_request(). That is currently safe only because blk_mq_free_request() is called shortly after bfq_requests_merged() while bfqd->lock is still held. However to fix a lock inversion between bfqd->lock and ioc->lock, we need to call blk_mq_free_request() after dropping bfqd->lock. That would mean that already merged request could be seen by other processes inside bfq queues and possibly dispatched to the device which is wrong. So move cleanup of the request from bfq_finish_requeue_request() to bfq_requests_merged(). 
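The constraint driving this change (only the unlinking may happen under bfqd->lock, while blk_mq_free_request() has to run after the lock is dropped) is a common kernel pattern: collect the victims on a private list under the lock, then destroy them outside it. A self-contained sketch of that pattern, using hypothetical names rather than the actual bfq structures, might look like this:

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct foo {
	struct list_head node;
};

static void foo_reap(spinlock_t *lock, struct list_head *active)
{
	LIST_HEAD(free);		/* private list, needs no locking */
	struct foo *f, *tmp;

	spin_lock(lock);
	/* unlink everything while the lock is held ... */
	list_splice_init(active, &free);
	spin_unlock(lock);

	/* ... but run the (possibly lock-taking) destructor outside it */
	list_for_each_entry_safe(f, tmp, &free, node) {
		list_del(&f->node);
		kfree(f);
	}
}

The following commit applies exactly this shape: blk_mq_sched_try_insert_merge() gains a free list parameter and the collected requests are released with blk_mq_free_requests() only after spin_unlock_irq(&bfqd->lock).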
Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210623093634.27879-2-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fedb0a8fd388..9433d38e486c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2433,7 +2433,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, *next_bfqq = bfq_init_rq(next); if (!bfqq) - return; + goto remove; /* * If next and rq belong to the same bfq_queue and next is older @@ -2456,6 +2456,14 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +remove: + /* Merged request may be in the IO scheduler. Remove it. */ + if (!RB_EMPTY_NODE(&next->rb_node)) { + bfq_remove_request(next->q, next); + if (next_bfqq) + bfqg_stats_update_io_remove(bfqq_group(next_bfqq), + next->cmd_flags); + } } /* Must be called with bfqq != NULL */ @@ -6414,6 +6422,7 @@ static void bfq_finish_requeue_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd; + unsigned long flags; /* * rq either is not associated with any icq, or is an already @@ -6431,39 +6440,15 @@ static void bfq_finish_requeue_request(struct request *rq) rq->io_start_time_ns, rq->cmd_flags); + spin_lock_irqsave(&bfqd->lock, flags); if (likely(rq->rq_flags & RQF_STARTED)) { - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); - if (rq == bfqd->waited_rq) bfq_update_inject_limit(bfqd, bfqq); bfq_completed_request(bfqq, bfqd); - bfq_finish_requeue_request_body(bfqq); - - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - /* - * Request rq may be still/already in the scheduler, - * in which case we need to remove it (this should - * never happen in case of requeue). And we cannot - * defer such a check and removal, to avoid - * inconsistencies in the time interval from the end - * of this function to the start of the deferred work. - * This situation seems to occur only in process - * context, as a consequence of a merge. In the - * current version of the code, this implies that the - * lock is held. - */ - - if (!RB_EMPTY_NODE(&rq->rb_node)) { - bfq_remove_request(rq->q, rq); - bfqg_stats_update_io_remove(bfqq_group(bfqq), - rq->cmd_flags); - } - bfq_finish_requeue_request_body(bfqq); } + bfq_finish_requeue_request_body(bfqq); + spin_unlock_irqrestore(&bfqd->lock, flags); /* * Reset private fields. 
In case of a requeue, this allows -- cgit v1.2.3 From fd2ef39cc9a6b9c4c41864ac506906c52f94b06a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 23 Jun 2021 11:36:34 +0200 Subject: blk: Fix lock inversion between ioc lock and bfqd lock Lockdep complains about lock inversion between ioc->lock and bfqd->lock: bfqd -> ioc: put_io_context+0x33/0x90 -> ioc->lock grabbed blk_mq_free_request+0x51/0x140 blk_put_request+0xe/0x10 blk_attempt_req_merge+0x1d/0x30 elv_attempt_insert_merge+0x56/0xa0 blk_mq_sched_try_insert_merge+0x4b/0x60 bfq_insert_requests+0x9e/0x18c0 -> bfqd->lock grabbed blk_mq_sched_insert_requests+0xd6/0x2b0 blk_mq_flush_plug_list+0x154/0x280 blk_finish_plug+0x40/0x60 ext4_writepages+0x696/0x1320 do_writepages+0x1c/0x80 __filemap_fdatawrite_range+0xd7/0x120 sync_file_range+0xac/0xf0 ioc->bfqd: bfq_exit_icq+0xa3/0xe0 -> bfqd->lock grabbed put_io_context_active+0x78/0xb0 -> ioc->lock grabbed exit_io_context+0x48/0x50 do_exit+0x7e9/0xdd0 do_group_exit+0x54/0xc0 To avoid this inversion we change blk_mq_sched_try_insert_merge() to not free the merged request but rather leave that upto the caller similarly to blk_mq_sched_try_merge(). And in bfq_insert_requests() we make sure to free all the merged requests after dropping bfqd->lock. Fixes: aee69d78dec0 ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler") Reviewed-by: Ming Lei Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210623093634.27879-3-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 6 ++++-- block/blk-merge.c | 19 ++++++++----------- block/blk-mq-sched.c | 5 +++-- block/blk-mq-sched.h | 3 ++- block/blk-mq.h | 11 +++++++++++ block/blk.h | 2 +- block/elevator.c | 11 ++++++++--- block/mq-deadline-main.c | 5 ++++- include/linux/elevator.h | 3 ++- 9 files changed, 43 insertions(+), 22 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9433d38e486c..727955918563 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2345,9 +2345,9 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock_irq(&bfqd->lock); if (free) blk_mq_free_request(free); - spin_unlock_irq(&bfqd->lock); return ret; } @@ -5969,14 +5969,16 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct bfq_queue *bfqq; bool idle_timer_disabled = false; unsigned int cmd_flags; + LIST_HEAD(free); #ifdef CONFIG_BFQ_GROUP_IOSCHED if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) bfqg_stats_update_legacy_io(q, rq); #endif spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); + blk_mq_free_requests(&free); return; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 4d97fb6dd226..1398b52a24b4 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -846,18 +846,15 @@ static struct request *attempt_front_merge(struct request_queue *q, return NULL; } -int blk_attempt_req_merge(struct request_queue *q, struct request *rq, - struct request *next) +/* + * Try to merge 'next' into 'rq'. Return true if the merge happened, false + * otherwise. The caller is responsible for freeing 'next' if the merge + * happened. 
+ */ +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next) { - struct request *free; - - free = attempt_merge(q, rq, next); - if (free) { - blk_put_request(free); - return 1; - } - - return 0; + return attempt_merge(q, rq, next); } bool blk_rq_merge_ok(struct request *rq, struct bio *bio) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 2403a5c2b053..c838d81ac058 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -399,9 +399,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, return ret; } -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { - return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); + return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free); } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index aff037cfd8e7..5246ae040704 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -13,7 +13,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); diff --git a/block/blk-mq.h b/block/blk-mq.h index 4b1ca7b7bbeb..d08779f77a26 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -302,6 +302,17 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q, return NULL; } +/* Free all requests on the list */ +static inline void blk_mq_free_requests(struct list_head *list) +{ + while (!list_empty(list)) { + struct request *rq = list_entry_rq(list->next); + + list_del_init(&rq->queuelist); + blk_mq_free_request(rq); + } +} + /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. diff --git a/block/blk.h b/block/blk.h index 4fcd7a032377..4b885c0f6708 100644 --- a/block/blk.h +++ b/block/blk.h @@ -224,7 +224,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *, void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); -int blk_attempt_req_merge(struct request_queue *q, struct request *rq, +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); unsigned int blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); diff --git a/block/elevator.c b/block/elevator.c index 85d0d4adbb64..52ada14cfe45 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -350,9 +350,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, * we can append 'rq' to an existing request, so we can throw 'rq' away * afterwards. * - * Returns true if we merged, false otherwise + * Returns true if we merged, false otherwise. 'free' will contain all + * requests that need to be freed. 
*/ -bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) +bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { struct request *__rq; bool ret; @@ -363,8 +365,10 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) /* * First try one-hit cache. */ - if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) + if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) { + list_add(&rq->queuelist, free); return true; + } if (blk_queue_noxmerges(q)) return false; @@ -378,6 +382,7 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) break; + list_add(&rq->queuelist, free); /* The merged request could be merged with others, try again */ ret = true; rq = __rq; diff --git a/block/mq-deadline-main.c b/block/mq-deadline-main.c index 4815e536091f..9db6da9ef4c6 100644 --- a/block/mq-deadline-main.c +++ b/block/mq-deadline-main.c @@ -719,6 +719,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct dd_per_prio *per_prio; enum dd_prio prio; struct dd_blkcg *blkcg; + LIST_HEAD(free); lockdep_assert_held(&dd->lock); @@ -742,8 +743,10 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, WARN_ON_ONCE(rq->elv.priv[0]); rq->elv.priv[0] = blkcg; - if (blk_mq_sched_try_insert_merge(q, rq)) + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { + blk_mq_free_requests(&free); return; + } trace_block_rq_insert(rq); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 783ecb3cb77a..ef9ceead3db1 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -117,7 +117,8 @@ extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, enum elv_merge); -extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); +extern bool elv_attempt_insert_merge(struct request_queue *, struct request *, + struct list_head *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); void elevator_init_mq(struct request_queue *q); -- cgit v1.2.3 From cb9516be7708a2a18ec0a19fe3a225b5b3bc92c7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Jun 2021 10:02:48 +0800 Subject: blk-mq: update hctx->dispatch_busy in case of real scheduler Commit 6e6fcbc27e77 ("blk-mq: support batching dispatch in case of io") starts to support io batching submission by using hctx->dispatch_busy. However, blk_mq_update_dispatch_busy() isn't changed to update hctx->dispatch_busy in that commit, so fix the issue by updating hctx->dispatch_busy in case of real scheduler. 
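For context, hctx->dispatch_busy is an exponentially weighted moving average of how often recent dispatch attempts found the driver busy, and the batching logic from commit 6e6fcbc27e77 keys off it; the fix simply stops skipping that update when an elevator is attached. A rough standalone sketch of such an EWMA-style busy indicator is shown below; the weight and fixed-point factor are assumptions for illustration, not necessarily the kernel's exact constants.

#include <stdbool.h>

#define EWMA_WEIGHT	8	/* assumed smoothing weight */
#define EWMA_FACTOR	4	/* assumed fixed-point shift */

static void update_busy_ewma(unsigned int *dispatch_busy, bool busy)
{
	unsigned int ewma = *dispatch_busy;

	if (!ewma && !busy)
		return;			/* stay at zero while idle */

	ewma *= EWMA_WEIGHT - 1;
	if (busy)
		ewma += 1 << EWMA_FACTOR;
	ewma /= EWMA_WEIGHT;

	/* non-zero means dispatch recently saw BLK_STS_RESOURCE-style pushback */
	*dispatch_busy = ewma;
}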
Reported-by: Jan Kara Reviewed-by: Jan Kara Fixes: 6e6fcbc27e77 ("blk-mq: support batching dispatch in case of io") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210625020248.1630497-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 3115ea2d0990..c2f3550337f7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1224,9 +1224,6 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) { unsigned int ewma; - if (hctx->queue->elevator) - return; - ewma = hctx->dispatch_busy; if (!ewma && !busy) -- cgit v1.2.3 From c06bc5a3fb42304d815a2dc41e324b5a97c9f7da Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 27 Jun 2021 14:11:12 -0700 Subject: block/mq-deadline: Remove a WARN_ON_ONCE() call The purpose of the WARN_ON_ONCE() statement in dd_insert_request() is to verify that dd_prepare_request() cleared rq->elv.priv[0]. Since dd_prepare_request() is called during request initialization but not if a request is requeued, a warning is triggered if a request is requeued. Fix this by removing the WARN_ON_ONCE() statement. This patch suppresses the following kernel warning: WARNING: CPU: 28 PID: 432 at block/mq-deadline-main.c:740 dd_insert_request+0x4d4/0x5b0 Workqueue: kblockd blk_mq_requeue_work Call Trace: dd_insert_requests+0xfa/0x130 blk_mq_sched_insert_request+0x22c/0x240 blk_mq_requeue_work+0x21c/0x2d0 process_one_work+0x4c2/0xa70 worker_thread+0x2e5/0x6d0 kthread+0x21c/0x250 ret_from_fork+0x1f/0x30 Reported-by: Sachin Sant Fixes: 08a9ad8bf607 ("block/mq-deadline: Add cgroup support") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210627211112.12720-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline-main.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/mq-deadline-main.c b/block/mq-deadline-main.c index 9db6da9ef4c6..6f612e6dc82b 100644 --- a/block/mq-deadline-main.c +++ b/block/mq-deadline-main.c @@ -740,7 +740,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, dd_count(dd, inserted, prio); blkcg = dd_blkcg_from_bio(rq->bio); ddcg_count(blkcg, inserted, ioprio_class); - WARN_ON_ONCE(rq->elv.priv[0]); rq->elv.priv[0] = blkcg; if (blk_mq_sched_try_insert_merge(q, rq, &free)) { -- cgit v1.2.3 From 2705dfb2094777e405e065105e307074af8965c1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 28 Jun 2021 10:33:12 +0800 Subject: block: fix discard request merge ll_new_hw_segment() is reached only in case of single range discard merge, and we don't have max discard segment size limit actually, so it is wrong to run the following check: if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req)) it may be always false since req->nr_phys_segments is initialized as one, and bio's segment count is still 1, blk_rq_get_max_segments(reg) is 1 too. Fix the issue by not doing the check and bypassing the calculation of discard request's nr_phys_segments. Based on analysis from Wang Shanker. 
Cc: Christoph Hellwig Reported-by: Wang Shanker Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210628023312.1903255-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 1398b52a24b4..a11b3b53717e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -559,10 +559,14 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq) static inline int ll_new_hw_segment(struct request *req, struct bio *bio, unsigned int nr_phys_segs) { - if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req)) + if (blk_integrity_merge_bio(req->q, req, bio) == false) goto no_merge; - if (blk_integrity_merge_bio(req->q, req, bio) == false) + /* discard request merge won't add new segment */ + if (req_op(req) == REQ_OP_DISCARD) + return 1; + + if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req)) goto no_merge; /* -- cgit v1.2.3
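To make the arithmetic behind the discard-merge fix concrete: for a single-range discard, req->nr_phys_segments starts at 1, the bio being merged also counts as 1 segment, and blk_rq_get_max_segments() evaluates to 1 because no discard segment limit is set, so the old ordering evaluated 1 + 1 > 1 and rejected every merge even though no new payload segment is added. A hedged, standalone sketch of the intended decision follows; the helper name and signature are hypothetical, it is not the in-tree ll_new_hw_segment().

#include <stdbool.h>

static bool discard_aware_segment_check(unsigned int req_segs,
					unsigned int bio_segs,
					unsigned int max_segs,
					bool is_discard)
{
	/* A merged single-range discard adds no payload segment. */
	if (is_discard)
		return true;

	/* Ordinary requests must still respect the segment limit. */
	return req_segs + bio_segs <= max_segs;
}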