From b5f74ecacc3139ef873e69acc3aba28083ecc416 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 22 Jan 2021 19:19:43 +0100 Subject: block, bfq: use half slice_idle as a threshold to check short ttime The value of the I/O plugging (idling) timeout is used also as the think-time threshold to decide whether a process has a short think time. In this respect, a good value of this timeout for rotational drives is un the order of several ms. Yet, this is often too long a time interval to be effective as a think-time threshold. This commit mitigates this problem (by a lot, according to tests), by halving the threshold. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9e4eb0fc1c16..eb2ca32d5b63 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5238,12 +5238,13 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, return; /* Think time is infinite if no process is linked to - * bfqq. Otherwise check average think time to - * decide whether to mark as has_short_ttime + * bfqq. Otherwise check average think time to decide whether + * to mark as has_short_ttime. To this goal, compare average + * think time with half the I/O-plugging timeout. */ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || (bfq_sample_valid(bfqq->ttime.ttime_samples) && - bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) + bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle>>1)) has_short_ttime = false; state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq); -- cgit v1.2.3 From d4fc3640ff361a09e359867e0bca898abd2b7ecb Mon Sep 17 00:00:00 2001 From: Jia Cheng Hu Date: Fri, 22 Jan 2021 19:19:44 +0100 Subject: block, bfq: set next_rq to waker_bfqq->next_rq in waker injection Since commit c5089591c3ba ("block, bfq: detect wakers and unconditionally inject their I/O"), when the in-service bfq_queue, say Q, is temporarily empty, BFQ checks whether there are I/O requests to inject (also) from the waker bfq_queue for Q. To this goal, the value pointed by bfqq->waker_bfqq->next_rq must be controlled. However, the current implementation mistakenly looks at bfqq->next_rq, which instead points to the next request of the currently served queue. This mistake evidently causes losses of throughput in scenarios with waker bfq_queues. This commit corrects this mistake. Fixes: c5089591c3ba ("block, bfq: detect wakers and unconditionally inject their I/O") Signed-off-by: Jia Cheng Hu Signed-off-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index eb2ca32d5b63..fdc5e163b2fe 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4499,7 +4499,7 @@ check_queue: bfqq = bfqq->bic->bfqq[0]; else if (bfq_bfqq_has_waker(bfqq) && bfq_bfqq_busy(bfqq->waker_bfqq) && - bfqq->next_rq && + bfqq->waker_bfqq->next_rq && bfq_serv_to_charge(bfqq->waker_bfqq->next_rq, bfqq->waker_bfqq) <= bfq_bfqq_budget_left(bfqq->waker_bfqq) -- cgit v1.2.3 From ab1fb47e33dc7754a7593181ffe0742c7105ea9a Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 22 Jan 2021 19:19:45 +0100 Subject: block, bfq: increase time window for waker detection Tests on slower machines showed current window to be way too small. This commit increases it. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fdc5e163b2fe..43e2c39cf7b5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1931,7 +1931,7 @@ static void bfq_add_request(struct request *rq) if (bfqd->last_completed_rq_bfqq && !bfq_bfqq_has_short_ttime(bfqq) && ktime_get_ns() - bfqd->last_completion < - 200 * NSEC_PER_USEC) { + 4 * NSEC_PER_MSEC) { if (bfqd->last_completed_rq_bfqq != bfqq && bfqd->last_completed_rq_bfqq != bfqq->waker_bfqq) { -- cgit v1.2.3 From 91b896f65d32610d6d58af02170b15f8d37a7702 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 22 Jan 2021 19:19:46 +0100 Subject: block, bfq: do not raise non-default weights BFQ heuristics try to detect interactive I/O, and raise the weight of the queues containing such an I/O. Yet, if also the user changes the weight of a queue (i.e., the user changes the ioprio of the process associated with that queue), then it is most likely better to prevent BFQ heuristics from silently changing the same weight. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 43e2c39cf7b5..161badb744d6 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1671,15 +1671,19 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * - it is sync, * - it does not belong to a large burst, * - it has been idle for enough time or is soft real-time, - * - is linked to a bfq_io_cq (it is not shared in any sense). + * - is linked to a bfq_io_cq (it is not shared in any sense), + * - has a default weight (otherwise we assume the user wanted + * to control its weight explicitly) */ in_burst = bfq_bfqq_in_large_burst(bfqq); soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && !BFQQ_TOTALLY_SEEKY(bfqq) && !in_burst && time_is_before_jiffies(bfqq->soft_rt_next_start) && - bfqq->dispatched == 0; - *interactive = !in_burst && idle_for_long_time; + bfqq->dispatched == 0 && + bfqq->entity.new_weight == 40; + *interactive = !in_burst && idle_for_long_time && + bfqq->entity.new_weight == 40; wr_or_deserves_wr = bfqd->low_latency && (bfqq->wr_coeff > 1 || (bfq_bfqq_sync(bfqq) && -- cgit v1.2.3 From 3c337690d2ebb7a01fa13bfa59ce4911f358df42 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 22 Jan 2021 19:19:47 +0100 Subject: block, bfq: avoid spurious switches to soft_rt of interactive queues BFQ tags some bfq_queues as interactive or soft_rt if it deems that these bfq_queues contain the I/O of, respectively, interactive or soft real-time applications. BFQ privileges both these special types of bfq_queues over normal bfq_queues. To privilege a bfq_queue, BFQ mainly raises the weight of the bfq_queue. In particular, soft_rt bfq_queues get a higher weight than interactive bfq_queues. A bfq_queue may turn from interactive to soft_rt. And this leads to a tricky issue. Soft real-time applications usually start with an I/O-bound, interactive phase, in which they load themselves into main memory. BFQ correctly detects this phase, and keeps the bfq_queues associated with the application in interactive mode for a while. Problems arise when the I/O pattern of the application finally switches to soft real-time. One of the conditions for a bfq_queue to be deemed as soft_rt is that the bfq_queue does not consume too much bandwidth. But the bfq_queues associated with a soft real-time application consume as much bandwidth as they can in the loading phase of the application. So, after the application becomes truly soft real-time, a lot of time should pass before the average bandwidth consumed by its bfq_queues finally drops to a value acceptable for soft_rt bfq_queues. As a consequence, there might be a time gap during which the application is not privileged at all, because its bfq_queues are not interactive any longer, but cannot be deemed as soft_rt yet. To avoid this problem, BFQ pretends that an interactive bfq_queue consumes zero bandwidth, and allows an interactive bfq_queue to switch to soft_rt. Yet, this fake zero-bandwidth consumption easily causes the bfq_queue to often switch to soft_rt deceptively, during its loading phase. As in soft_rt mode, the bfq_queue gets its bandwidth correctly computed, and therefore soon switches back to interactive. Then it switches again to soft_rt, and so on. These spurious fluctuations usually cause losses of throughput, because they deceive BFQ's mechanisms for boosting throughput (injection, I/O-plugging avoidance, ...). This commit addresses this issue as follows: 1) It does compute actual bandwidth consumption also for interactive bfq_queues. This avoids the above false positives. 2) When a bfq_queue switches from interactive to normal mode, the consumed bandwidth is reset (forgotten). This allows the bfq_queue to enjoy soft_rt very quickly. In particular, two alternatives are possible in this switch: - the bfq_queue still has backlog, and therefore there is a budget already scheduled to serve the bfq_queue; in this case, the scheduling of the current budget of the bfq_queue is not hindered, because only the scheduling of the next budget will be affected by the weight drop. After that, if the bfq_queue is actually in a soft_rt phase, and becomes empty during the service of its current budget, which is the natural behavior of a soft_rt bfq_queue, then the bfq_queue will be considered as soft_rt when its next I/O arrives. If, in contrast, the bfq_queue remains constantly non-empty, then its next budget will be scheduled with a low weight, which is the natural treatment for an I/O-bound (non soft_rt) bfq_queue. - the bfq_queue is empty; in this case, the bfq_queue may be considered unjustly soft_rt when its new I/O arrives. Yet the problem is now much smaller than before, because it is unlikely that more than one spurious fluctuation occurs. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 57 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 20 deletions(-) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 161badb744d6..003c96fa01ad 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2356,6 +2356,24 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, /* Must be called with bfqq != NULL */ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { + /* + * If bfqq has been enjoying interactive weight-raising, then + * reset soft_rt_next_start. We do it for the following + * reason. bfqq may have been conveying the I/O needed to load + * a soft real-time application. Such an application actually + * exhibits a soft real-time I/O pattern after it finishes + * loading, and finally starts doing its job. But, if bfqq has + * been receiving a lot of bandwidth so far (likely to happen + * on a fast device), then soft_rt_next_start now contains a + * high value that. So, without this reset, bfqq would be + * prevented from being possibly considered as soft_rt for a + * very long time. + */ + + if (bfqq->wr_cur_max_time != + bfqq->bfqd->bfq_wr_rt_max_time) + bfqq->soft_rt_next_start = jiffies; + if (bfq_bfqq_busy(bfqq)) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; @@ -3956,30 +3974,15 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, * If we get here, and there are no outstanding * requests, then the request pattern is isochronous * (see the comments on the function - * bfq_bfqq_softrt_next_start()). Thus we can compute - * soft_rt_next_start. And we do it, unless bfqq is in - * interactive weight raising. We do not do it in the - * latter subcase, for the following reason. bfqq may - * be conveying the I/O needed to load a soft - * real-time application. Such an application will - * actually exhibit a soft real-time I/O pattern after - * it finally starts doing its job. But, if - * soft_rt_next_start is computed here for an - * interactive bfqq, and bfqq had received a lot of - * service before remaining with no outstanding - * request (likely to happen on a fast device), then - * soft_rt_next_start would be assigned such a high - * value that, for a very long time, bfqq would be - * prevented from being possibly considered as soft - * real time. + * bfq_bfqq_softrt_next_start()). Therefore we can + * compute soft_rt_next_start. * * If, instead, the queue still has outstanding * requests, then we have to wait for the completion * of all the outstanding requests to discover whether * the request pattern is actually isochronous. */ - if (bfqq->dispatched == 0 && - bfqq->wr_coeff != bfqd->bfq_wr_coeff) + if (bfqq->dispatched == 0) bfqq->soft_rt_next_start = bfq_bfqq_softrt_next_start(bfqd, bfqq); else if (bfqq->dispatched > 0) { @@ -4563,9 +4566,21 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->wr_cur_max_time)) { if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + - bfq_wr_duration(bfqd))) + bfq_wr_duration(bfqd))) { + /* + * Either in interactive weight + * raising, or in soft_rt weight + * raising with the + * interactive-weight-raising period + * elapsed (so no switch back to + * interactive weight raising). + */ bfq_bfqq_end_wr(bfqq); - else { + } else { /* + * soft_rt finishing while still in + * interactive period, switch back to + * interactive weight raising + */ switch_back_to_interactive_wr(bfqq, bfqd); bfqq->entity.prio_changed = 1; } @@ -5016,6 +5031,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); + bfq_log_bfqq(bfqd, bfqq, "new_ioprio %d new_weight %d", + bfqq->new_ioprio, bfqq->entity.new_weight); bfqq->entity.prio_changed = 1; } -- cgit v1.2.3 From 2391d13ed484df1515f0025458e1f82317823fab Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 22 Jan 2021 19:19:48 +0100 Subject: block, bfq: do not expire a queue when it is the only busy one This commits preserves I/O-dispatch plugging for a special symmetric case that may suddenly turn into asymmetric: the case where only one bfq_queue, say bfqq, is busy. In this case, not expiring bfqq does not cause any harm to any other queues in terms of service guarantees. In contrast, it avoids the following unlucky sequence of events: (1) bfqq is expired, (2) a new queue with a lower weight than bfqq becomes busy (or more queues), (3) the new queue is served until a new request arrives for bfqq, (4) when bfqq is finally served, there are so many requests of the new queue in the drive that the pending requests for bfqq take a lot of time to be served. In particular, event (2) may case even already dispatched requests of bfqq to be delayed, inside the drive. So, to avoid this series of events, the scenario is preventively declared as asymmetric also if bfqq is the only busy queues. By doing so, I/O-dispatch plugging is performed for bfqq. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 003c96fa01ad..c045613ce927 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3464,20 +3464,38 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) * order until all the requests already queued in the device have been * served. The last sub-condition commented above somewhat mitigates * this problem for weight-raised queues. + * + * However, as an additional mitigation for this problem, we preserve + * plugging for a special symmetric case that may suddenly turn into + * asymmetric: the case where only bfqq is busy. In this case, not + * expiring bfqq does not cause any harm to any other queues in terms + * of service guarantees. In contrast, it avoids the following unlucky + * sequence of events: (1) bfqq is expired, (2) a new queue with a + * lower weight than bfqq becomes busy (or more queues), (3) the new + * queue is served until a new request arrives for bfqq, (4) when bfqq + * is finally served, there are so many requests of the new queue in + * the drive that the pending requests for bfqq take a lot of time to + * be served. In particular, event (2) may case even already + * dispatched requests of bfqq to be delayed, inside the drive. So, to + * avoid this series of events, the scenario is preventively declared + * as asymmetric also if bfqq is the only busy queues */ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, struct bfq_queue *bfqq) { + int tot_busy_queues = bfq_tot_busy_queues(bfqd); + /* No point in idling for bfqq if it won't get requests any longer */ if (unlikely(!bfqq_process_refs(bfqq))) return false; return (bfqq->wr_coeff > 1 && (bfqd->wr_busy_queues < - bfq_tot_busy_queues(bfqd) || + tot_busy_queues || bfqd->rq_in_driver >= bfqq->dispatched + 4)) || - bfq_asymmetric_scenario(bfqd, bfqq); + bfq_asymmetric_scenario(bfqd, bfqq) || + tot_busy_queues == 1; } static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- cgit v1.2.3 From 5ac83c644f5fb924f0b2c09102ab82fc788f8411 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Jan 2021 17:47:16 +0100 Subject: Revert "blk-mq, elevator: Count requests per hctx to improve performance" This reverts commit b445547ec1bbd3e7bf4b1c142550942f70527d95. Since both mq-deadline and BFQ completely ignore hctx they are passed to their dispatch function and dispatch whatever request they deem fit checking whether any request for a particular hctx is queued is just pointless since we'll very likely get a request from a different hctx anyway. In the following commit we'll deal with lock contention in these IO schedulers in presence of multiple HW queues in a different way. Signed-off-by: Jan Kara Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 5 ----- block/blk-mq.c | 1 - block/mq-deadline.c | 6 ------ include/linux/blk-mq.h | 4 ---- 4 files changed, 16 deletions(-) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c045613ce927..b12a416b51d7 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4677,9 +4677,6 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - if (!atomic_read(&hctx->elevator_queued)) - return false; - /* * Avoiding lock: a race on bfqd->busy_queues should cause at * most a call to dispatch for nothing @@ -5597,7 +5594,6 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); bfq_insert_request(hctx, rq, at_head); - atomic_inc(&hctx->elevator_queued); } } @@ -5965,7 +5961,6 @@ static void bfq_finish_requeue_request(struct request *rq) bfq_completed_request(bfqq, bfqd); bfq_finish_requeue_request_body(bfqq); - atomic_dec(&rq->mq_hctx->elevator_queued); spin_unlock_irqrestore(&bfqd->lock, flags); } else { diff --git a/block/blk-mq.c b/block/blk-mq.c index 74b17b396f4c..1af6b8a9da5a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2653,7 +2653,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, goto free_hctx; atomic_set(&hctx->nr_active, 0); - atomic_set(&hctx->elevator_queued, 0); if (node == NUMA_NO_NODE) node = set->numa_node; hctx->numa_node = node; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 800ac902809b..b57470e154c8 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -386,8 +386,6 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) spin_lock(&dd->lock); rq = __dd_dispatch_request(dd); spin_unlock(&dd->lock); - if (rq) - atomic_dec(&rq->mq_hctx->elevator_queued); return rq; } @@ -535,7 +533,6 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); dd_insert_request(hctx, rq, at_head); - atomic_inc(&hctx->elevator_queued); } spin_unlock(&dd->lock); } @@ -582,9 +579,6 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; - if (!atomic_read(&hctx->elevator_queued)) - return false; - return !list_empty_careful(&dd->dispatch) || !list_empty_careful(&dd->fifo_list[0]) || !list_empty_careful(&dd->fifo_list[1]); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6b410dab48ee..aabbf6830ffc 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -140,10 +140,6 @@ struct blk_mq_hw_ctx { * shared across request queues. */ atomic_t nr_active; - /** - * @elevator_queued: Number of queued requests on hctx. - */ - atomic_t elevator_queued; /** @cpuhp_online: List to store request if CPU is going to die */ struct hlist_node cpuhp_online; -- cgit v1.2.3 From eb2fd80f9d2c515a901599362e83bc3356fc5e97 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jan 2021 20:02:43 +0100 Subject: block, bfq: replace mechanism for evaluating I/O intensity Some BFQ mechanisms make their decisions on a bfq_queue basing also on whether the bfq_queue is I/O bound. In this respect, the current logic for evaluating whether a bfq_queue is I/O bound is rather rough. This commits replaces this logic with a more effective one. The new logic measures the percentage of time during which a bfq_queue is active, and marks the bfq_queue as I/O bound if the latter if this percentage is above a fixed threshold. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 63 +++++++++++++++++++++++++++++++++++++---------------- block/bfq-iosched.h | 16 +++++++------- 2 files changed, 52 insertions(+), 27 deletions(-) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index b12a416b51d7..375e35c3d2fb 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1026,6 +1026,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, bfqq->entity.new_weight = bic->saved_weight; bfqq->ttime = bic->saved_ttime; + bfqq->io_start_time = bic->saved_io_start_time; + bfqq->tot_idle_time = bic->saved_tot_idle_time; bfqq->wr_coeff = bic->saved_wr_coeff; bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; @@ -1721,17 +1723,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfq_clear_bfqq_just_created(bfqq); - - if (!bfq_bfqq_IO_bound(bfqq)) { - if (arrived_in_time) { - bfqq->requests_within_timer++; - if (bfqq->requests_within_timer >= - bfqd->bfq_requests_within_timer) - bfq_mark_bfqq_IO_bound(bfqq); - } else - bfqq->requests_within_timer = 0; - } - if (bfqd->low_latency) { if (unlikely(time_is_after_jiffies(bfqq->split_time))) /* wraparound */ @@ -1865,6 +1856,36 @@ static void bfq_reset_inject_limit(struct bfq_data *bfqd, bfqq->decrease_time_jif = jiffies; } +static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) +{ + u64 tot_io_time = now_ns - bfqq->io_start_time; + + if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq->dispatched == 0) + bfqq->tot_idle_time += + now_ns - bfqq->ttime.last_end_request; + + if (unlikely(bfq_bfqq_just_created(bfqq))) + return; + + /* + * Must be busy for at least about 80% of the time to be + * considered I/O bound. + */ + if (bfqq->tot_idle_time * 5 > tot_io_time) + bfq_clear_bfqq_IO_bound(bfqq); + else + bfq_mark_bfqq_IO_bound(bfqq); + + /* + * Keep an observation window of at most 200 ms in the past + * from now. + */ + if (tot_io_time > 200 * NSEC_PER_MSEC) { + bfqq->io_start_time = now_ns - (tot_io_time>>1); + bfqq->tot_idle_time >>= 1; + } +} + static void bfq_add_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); @@ -1872,6 +1893,7 @@ static void bfq_add_request(struct request *rq) struct request *next_rq, *prev; unsigned int old_wr_coeff = bfqq->wr_coeff; bool interactive = false; + u64 now_ns = ktime_get_ns(); bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; @@ -1934,7 +1956,7 @@ static void bfq_add_request(struct request *rq) */ if (bfqd->last_completed_rq_bfqq && !bfq_bfqq_has_short_ttime(bfqq) && - ktime_get_ns() - bfqd->last_completion < + now_ns - bfqd->last_completion < 4 * NSEC_PER_MSEC) { if (bfqd->last_completed_rq_bfqq != bfqq && bfqd->last_completed_rq_bfqq != @@ -2051,6 +2073,9 @@ static void bfq_add_request(struct request *rq) } } + if (bfq_bfqq_sync(bfqq)) + bfq_update_io_intensity(bfqq, now_ns); + elv_rb_add(&bfqq->sort_list, rq); /* @@ -2712,6 +2737,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_ttime = bfqq->ttime; bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bic->saved_io_start_time = bfqq->io_start_time; + bic->saved_tot_idle_time = bfqq->tot_idle_time; bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); if (unlikely(bfq_bfqq_just_created(bfqq) && @@ -3979,10 +4006,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) bfq_bfqq_charge_time(bfqd, bfqq, delta); - if (reason == BFQQE_TOO_IDLE && - entity->service <= 2 * entity->budget / 10) - bfq_clear_bfqq_IO_bound(bfqq); - if (bfqd->low_latency && bfqq->wr_coeff == 1) bfqq->last_wr_start_finish = jiffies; @@ -5085,6 +5108,8 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, pid_t pid, int is_sync) { + u64 now_ns = ktime_get_ns(); + RB_CLEAR_NODE(&bfqq->entity.rb_node); INIT_LIST_HEAD(&bfqq->fifo); INIT_HLIST_NODE(&bfqq->burst_list_node); @@ -5112,7 +5137,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_clear_bfqq_sync(bfqq); /* set end request to minus infinity from now */ - bfqq->ttime.last_end_request = ktime_get_ns() + 1; + bfqq->ttime.last_end_request = now_ns + 1; + + bfqq->io_start_time = now_ns; bfq_mark_bfqq_IO_bound(bfqq); @@ -6524,8 +6551,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->bfq_slice_idle = bfq_slice_idle; bfqd->bfq_timeout = bfq_timeout; - bfqd->bfq_requests_within_timer = 120; - bfqd->bfq_large_burst_thresh = 8; bfqd->bfq_burst_interval = msecs_to_jiffies(180); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 703895224562..c913b06016b3 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -291,6 +291,11 @@ struct bfq_queue { /* associated @bfq_ttime struct */ struct bfq_ttime ttime; + /* when bfqq started to do I/O within the last observation window */ + u64 io_start_time; + /* how long bfqq has remained empty during the last observ. window */ + u64 tot_idle_time; + /* bit vector: a 1 for each seeky requests in history */ u32 seek_history; @@ -407,6 +412,9 @@ struct bfq_io_cq { */ bool saved_IO_bound; + u64 saved_io_start_time; + u64 saved_tot_idle_time; + /* * Same purpose as the previous fields for the value of the * field keeping the queue's belonging to a large burst @@ -641,14 +649,6 @@ struct bfq_data { */ unsigned int bfq_timeout; - /* - * Number of consecutive requests that must be issued within - * the idle time slice to set again idling to a queue which - * was marked as non-I/O-bound (see the definition of the - * IO_bound flag for further details). - */ - unsigned int bfq_requests_within_timer; - /* * Force device idling whenever needed to provide accurate * service guarantees, without caring about throughput -- cgit v1.2.3 From 7f1995c27b19060dbdff23442f375e3097c90707 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jan 2021 20:02:44 +0100 Subject: block, bfq: re-evaluate convenience of I/O plugging on rq arrivals Upon an I/O-dispatch attempt, BFQ may detect that it was better to plug I/O dispatch, and to wait for a new request to arrive for the currently in-service queue. But the arrival of a new request for an empty bfq_queue, and thus the switch from idle to busy of the bfq_queue, may cause the scenario to change, and make plugging no longer needed for service guarantees, or more convenient for throughput. In this case, keeping I/O-dispatch plugged would certainly lower throughput. To address this issue, this commit makes such a check, and stops plugging I/O if it is better to stop plugging I/O. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 375e35c3d2fb..44c6433b5b25 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1649,6 +1649,8 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, return bfqq_weight > in_serv_weight; } +static bool bfq_better_to_idle(struct bfq_queue *bfqq); + static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, struct bfq_queue *bfqq, int old_wr_coeff, @@ -1750,10 +1752,10 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfq_add_bfqq_busy(bfqd, bfqq); /* - * Expire in-service queue only if preemption may be needed - * for guarantees. In particular, we care only about two - * cases. The first is that bfqq has to recover a service - * hole, as explained in the comments on + * Expire in-service queue if preemption may be needed for + * guarantees or throughput. As for guarantees, we care + * explicitly about two cases. The first is that bfqq has to + * recover a service hole, as explained in the comments on * bfq_bfqq_update_budg_for_activation(), i.e., that * bfqq_wants_to_preempt is true. However, if bfqq does not * carry time-critical I/O, then bfqq's bandwidth is less @@ -1780,11 +1782,23 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * timestamps of the in-service queue would need to be * updated, and this operation is quite costly (see the * comments on bfq_bfqq_update_budg_for_activation()). + * + * As for throughput, we ask bfq_better_to_idle() whether we + * still need to plug I/O dispatching. If bfq_better_to_idle() + * says no, then plugging is not needed any longer, either to + * boost throughput or to perserve service guarantees. Then + * the best option is to stop plugging I/O, as not doing so + * would certainly lower throughput. We may end up in this + * case if: (1) upon a dispatch attempt, we detected that it + * was better to plug I/O dispatch, and to wait for a new + * request to arrive for the currently in-service queue, but + * (2) this switch of bfqq to busy changes the scenario. */ if (bfqd->in_service_queue && ((bfqq_wants_to_preempt && bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) || - bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) && + bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue) || + !bfq_better_to_idle(bfqd->in_service_queue)) && next_queue_may_preempt(bfqd)) bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); -- cgit v1.2.3 From d1f600fa4732dac36c71a03b790f0c829a076475 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jan 2021 20:02:45 +0100 Subject: block, bfq: fix switch back from soft-rt weitgh-raising A bfq_queue may happen to be deemed as soft real-time while it is still enjoying interactive weight-raising. If this happens because of a false positive, then the bfq_queue is likely to loose its soft real-time status soon. Upon losing such a status, the bfq_queue must get back its interactive weight-raising, if its interactive period is not over yet. But this case is not handled. This commit corrects this error. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 44c6433b5b25..170aa0ccc121 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5290,8 +5290,26 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->wr_coeff > 1 && bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && - BFQQ_TOTALLY_SEEKY(bfqq)) - bfq_bfqq_end_wr(bfqq); + BFQQ_TOTALLY_SEEKY(bfqq)) { + if (time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) { + /* + * In soft_rt weight raising with the + * interactive-weight-raising period + * elapsed (so no switch back to + * interactive weight raising). + */ + bfq_bfqq_end_wr(bfqq); + } else { /* + * stopping soft_rt weight raising + * while still in interactive period, + * switch back to interactive weight + * raising + */ + switch_back_to_interactive_wr(bfqq, bfqd); + bfqq->entity.prio_changed = 1; + } + } } static void bfq_update_has_short_ttime(struct bfq_data *bfqd, -- cgit v1.2.3 From e673914d52f913584cc4c454dfcff2e8eb04533f Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jan 2021 20:02:46 +0100 Subject: block, bfq: save also weight-raised service on queue merging To prevent weight-raising information from being lost on bfq_queue merging, also the amount of service that a bfq_queue receives must be saved and restored when the bfq_queue is merged and split, respectively. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 ++ block/bfq-iosched.h | 1 + 2 files changed, 3 insertions(+) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 170aa0ccc121..5d48cba07cb5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1029,6 +1029,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, bfqq->io_start_time = bic->saved_io_start_time; bfqq->tot_idle_time = bic->saved_tot_idle_time; bfqq->wr_coeff = bic->saved_wr_coeff; + bfqq->service_from_wr = bic->saved_service_from_wr; bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; @@ -2775,6 +2776,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_wr_coeff = bfqq->wr_coeff; bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; + bic->saved_service_from_wr = bfqq->service_from_wr; bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index c913b06016b3..d15299d59f89 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -440,6 +440,7 @@ struct bfq_io_cq { */ unsigned long saved_wr_coeff; unsigned long saved_last_wr_start_finish; + unsigned long saved_service_from_wr; unsigned long saved_wr_start_at_switch_to_srt; unsigned int saved_wr_cur_max_time; struct bfq_ttime saved_ttime; -- cgit v1.2.3 From 5a5436b98d5cd2714feaaa579cec49dd7f7057bb Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jan 2021 20:02:47 +0100 Subject: block, bfq: save also injection state on queue merging To prevent injection information from being lost on bfq_queue merging, also the amount of service that a bfq_queue receives must be saved and restored when the bfq_queue is merged and split, respectively. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 ++++++++ block/bfq-iosched.h | 5 +++++ 2 files changed, 13 insertions(+) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 5d48cba07cb5..79d232d41027 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1024,6 +1024,10 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, else bfq_clear_bfqq_IO_bound(bfqq); + bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns; + bfqq->inject_limit = bic->saved_inject_limit; + bfqq->decrease_time_jif = bic->saved_decrease_time_jif; + bfqq->entity.new_weight = bic->saved_weight; bfqq->ttime = bic->saved_ttime; bfqq->io_start_time = bic->saved_io_start_time; @@ -2748,6 +2752,10 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) if (!bic) return; + bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns; + bic->saved_inject_limit = bfqq->inject_limit; + bic->saved_decrease_time_jif = bfqq->decrease_time_jif; + bic->saved_weight = bfqq->entity.orig_weight; bic->saved_ttime = bfqq->ttime; bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index d15299d59f89..3f350fa3c5fd 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -444,6 +444,11 @@ struct bfq_io_cq { unsigned long saved_wr_start_at_switch_to_srt; unsigned int saved_wr_cur_max_time; struct bfq_ttime saved_ttime; + + /* Save also injection state */ + u64 saved_last_serv_time_ns; + unsigned int saved_inject_limit; + unsigned long saved_decrease_time_jif; }; /** -- cgit v1.2.3 From 71217df39dc67a0aeed83352b0d712b7892036a2 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jan 2021 20:02:48 +0100 Subject: block, bfq: make waker-queue detection more robust In the presence of many parallel I/O flows, the detection of waker bfq_queues suffers from false positives. This commits addresses this issue by making the filtering of actual wakers more selective. In more detail, a candidate waker must be found to meet waker requirements three times before being promoted to actual waker. Tested-by: Jan Kara Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 211 +++++++++++++++++++++++++--------------------------- block/bfq-iosched.h | 7 +- 2 files changed, 108 insertions(+), 110 deletions(-) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 79d232d41027..eaeda18cb8c8 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -158,7 +158,6 @@ BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); -BFQ_BFQQ_FNS(has_waker); #undef BFQ_BFQQ_FNS \ /* Expiration time of sync (0) and async (1) requests, in ns. */ @@ -1905,6 +1904,107 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) } } +/* + * Detect whether bfqq's I/O seems synchronized with that of some + * other queue, i.e., whether bfqq, after remaining empty, happens to + * receive new I/O only right after some I/O request of the other + * queue has been completed. We call waker queue the other queue, and + * we assume, for simplicity, that bfqq may have at most one waker + * queue. + * + * A remarkable throughput boost can be reached by unconditionally + * injecting the I/O of the waker queue, every time a new + * bfq_dispatch_request happens to be invoked while I/O is being + * plugged for bfqq. In addition to boosting throughput, this + * unblocks bfqq's I/O, thereby improving bandwidth and latency for + * bfqq. Note that these same results may be achieved with the general + * injection mechanism, but less effectively. For details on this + * aspect, see the comments on the choice of the queue for injection + * in bfq_select_queue(). + * + * Turning back to the detection of a waker queue, a queue Q is deemed + * as a waker queue for bfqq if, for three consecutive times, bfqq + * happens to become non empty right after a request of Q has been + * completed. In particular, on the first time, Q is tentatively set + * as a candidate waker queue, while on the third consecutive time + * that Q is detected, the field waker_bfqq is set to Q, to confirm + * that Q is a waker queue for bfqq. These detection steps are + * performed only if bfqq has a long think time, so as to make it more + * likely that bfqq's I/O is actually being blocked by a + * synchronization. This last filter, plus the above three-times + * requirement, make false positives less likely. + * + * NOTE + * + * The sooner a waker queue is detected, the sooner throughput can be + * boosted by injecting I/O from the waker queue. Fortunately, + * detection is likely to be actually fast, for the following + * reasons. While blocked by synchronization, bfqq has a long think + * time. This implies that bfqq's inject limit is at least equal to 1 + * (see the comments in bfq_update_inject_limit()). So, thanks to + * injection, the waker queue is likely to be served during the very + * first I/O-plugging time interval for bfqq. This triggers the first + * step of the detection mechanism. Thanks again to injection, the + * candidate waker queue is then likely to be confirmed no later than + * during the next I/O-plugging interval for bfqq. + * + * ISSUE + * + * On queue merging all waker information is lost. + */ +void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, u64 now_ns) +{ + if (!bfqd->last_completed_rq_bfqq || + bfqd->last_completed_rq_bfqq == bfqq || + bfq_bfqq_has_short_ttime(bfqq) || + now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || + bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) + return; + + if (bfqd->last_completed_rq_bfqq != + bfqq->tentative_waker_bfqq) { + /* + * First synchronization detected with a + * candidate waker queue, or with a different + * candidate waker queue from the current one. + */ + bfqq->tentative_waker_bfqq = + bfqd->last_completed_rq_bfqq; + bfqq->num_waker_detections = 1; + } else /* Same tentative waker queue detected again */ + bfqq->num_waker_detections++; + + if (bfqq->num_waker_detections == 3) { + bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; + bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then + * bfqq->waker_bfqq must be reset. To + * this goal, we maintain in each + * waker queue a list, woken_list, of + * all the queues that reference the + * waker queue through their + * waker_bfqq pointer. When the waker + * queue exits, the waker_bfqq pointer + * of all the queues in the woken_list + * is reset. + * + * In addition, if bfqq is already in + * the woken_list of a waker queue, + * then, before being inserted into + * the woken_list of a new waker + * queue, bfqq must be removed from + * the woken_list of the old waker + * queue. + */ + if (!hlist_unhashed(&bfqq->woken_list_node)) + hlist_del_init(&bfqq->woken_list_node); + hlist_add_head(&bfqq->woken_list_node, + &bfqd->last_completed_rq_bfqq->woken_list); + } +} + static void bfq_add_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); @@ -1919,111 +2019,7 @@ static void bfq_add_request(struct request *rq) bfqd->queued++; if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { - /* - * Detect whether bfqq's I/O seems synchronized with - * that of some other queue, i.e., whether bfqq, after - * remaining empty, happens to receive new I/O only - * right after some I/O request of the other queue has - * been completed. We call waker queue the other - * queue, and we assume, for simplicity, that bfqq may - * have at most one waker queue. - * - * A remarkable throughput boost can be reached by - * unconditionally injecting the I/O of the waker - * queue, every time a new bfq_dispatch_request - * happens to be invoked while I/O is being plugged - * for bfqq. In addition to boosting throughput, this - * unblocks bfqq's I/O, thereby improving bandwidth - * and latency for bfqq. Note that these same results - * may be achieved with the general injection - * mechanism, but less effectively. For details on - * this aspect, see the comments on the choice of the - * queue for injection in bfq_select_queue(). - * - * Turning back to the detection of a waker queue, a - * queue Q is deemed as a waker queue for bfqq if, for - * two consecutive times, bfqq happens to become non - * empty right after a request of Q has been - * completed. In particular, on the first time, Q is - * tentatively set as a candidate waker queue, while - * on the second time, the flag - * bfq_bfqq_has_waker(bfqq) is set to confirm that Q - * is a waker queue for bfqq. These detection steps - * are performed only if bfqq has a long think time, - * so as to make it more likely that bfqq's I/O is - * actually being blocked by a synchronization. This - * last filter, plus the above two-times requirement, - * make false positives less likely. - * - * NOTE - * - * The sooner a waker queue is detected, the sooner - * throughput can be boosted by injecting I/O from the - * waker queue. Fortunately, detection is likely to be - * actually fast, for the following reasons. While - * blocked by synchronization, bfqq has a long think - * time. This implies that bfqq's inject limit is at - * least equal to 1 (see the comments in - * bfq_update_inject_limit()). So, thanks to - * injection, the waker queue is likely to be served - * during the very first I/O-plugging time interval - * for bfqq. This triggers the first step of the - * detection mechanism. Thanks again to injection, the - * candidate waker queue is then likely to be - * confirmed no later than during the next - * I/O-plugging interval for bfqq. - */ - if (bfqd->last_completed_rq_bfqq && - !bfq_bfqq_has_short_ttime(bfqq) && - now_ns - bfqd->last_completion < - 4 * NSEC_PER_MSEC) { - if (bfqd->last_completed_rq_bfqq != bfqq && - bfqd->last_completed_rq_bfqq != - bfqq->waker_bfqq) { - /* - * First synchronization detected with - * a candidate waker queue, or with a - * different candidate waker queue - * from the current one. - */ - bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; - - /* - * If the waker queue disappears, then - * bfqq->waker_bfqq must be reset. To - * this goal, we maintain in each - * waker queue a list, woken_list, of - * all the queues that reference the - * waker queue through their - * waker_bfqq pointer. When the waker - * queue exits, the waker_bfqq pointer - * of all the queues in the woken_list - * is reset. - * - * In addition, if bfqq is already in - * the woken_list of a waker queue, - * then, before being inserted into - * the woken_list of a new waker - * queue, bfqq must be removed from - * the woken_list of the old waker - * queue. - */ - if (!hlist_unhashed(&bfqq->woken_list_node)) - hlist_del_init(&bfqq->woken_list_node); - hlist_add_head(&bfqq->woken_list_node, - &bfqd->last_completed_rq_bfqq->woken_list); - - bfq_clear_bfqq_has_waker(bfqq); - } else if (bfqd->last_completed_rq_bfqq == - bfqq->waker_bfqq && - !bfq_bfqq_has_waker(bfqq)) { - /* - * synchronization with waker_bfqq - * seen for the second time - */ - bfq_mark_bfqq_has_waker(bfqq); - } - } + bfq_check_waker(bfqd, bfqq, now_ns); /* * Periodically reset inject limit, to make sure that @@ -4569,7 +4565,7 @@ check_queue: bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= bfq_bfqq_budget_left(async_bfqq)) bfqq = bfqq->bic->bfqq[0]; - else if (bfq_bfqq_has_waker(bfqq) && + else if (bfqq->waker_bfqq && bfq_bfqq_busy(bfqq->waker_bfqq) && bfqq->waker_bfqq->next_rq && bfq_serv_to_charge(bfqq->waker_bfqq->next_rq, @@ -4973,7 +4969,6 @@ void bfq_put_queue(struct bfq_queue *bfqq) hlist_for_each_entry_safe(item, n, &bfqq->woken_list, woken_list_node) { item->waker_bfqq = NULL; - bfq_clear_bfqq_has_waker(item); hlist_del_init(&item->woken_list_node); } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 3f350fa3c5fd..b8e793c34ff1 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -376,6 +376,11 @@ struct bfq_queue { * bfq_select_queue(). */ struct bfq_queue *waker_bfqq; + /* pointer to the curr. tentative waker queue, see bfq_check_waker() */ + struct bfq_queue *tentative_waker_bfqq; + /* number of times the same tentative waker has been detected */ + unsigned int num_waker_detections; + /* node for woken_list, see below */ struct hlist_node woken_list_node; /* @@ -776,7 +781,6 @@ enum bfqq_state_flags { */ BFQQF_coop, /* bfqq is shared */ BFQQF_split_coop, /* shared bfqq will be split */ - BFQQF_has_waker /* bfqq has a waker queue */ }; #define BFQ_BFQQ_FNS(name) \ @@ -796,7 +800,6 @@ BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); -BFQ_BFQQ_FNS(has_waker); #undef BFQ_BFQQ_FNS /* Expiration reasons. */ -- cgit v1.2.3 From a5bf0a92e1b8282c93018383b2526ca59602dd08 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Jan 2021 21:15:01 -0700 Subject: bfq: bfq_check_waker() should be static It's only used in the same file, mark is appropriately static. Fixes: 71217df39dc6 ("block, bfq: make waker-queue detection more robust") Reported-by: kernel test robot Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index eaeda18cb8c8..23e293d2943c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1952,7 +1952,8 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) * * On queue merging all waker information is lost. */ -void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, u64 now_ns) +static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, + u64 now_ns) { if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || -- cgit v1.2.3 From 41e76c85660c022c6bf5713bfb6c21e64a487cec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Jun 2020 16:16:16 +0200 Subject: bfq: Avoid false bfq queue merging bfq_setup_cooperator() uses bfqd->in_serv_last_pos so detect whether it makes sense to merge current bfq queue with the in-service queue. However if the in-service queue is freshly scheduled and didn't dispatch any requests yet, bfqd->in_serv_last_pos is stale and contains value from the previously scheduled bfq queue which can thus result in a bogus decision that the two queues should be merged. This bug can be observed for example with the following fio jobfile: [global] direct=0 ioengine=sync invalidate=1 size=1g rw=read [reader] numjobs=4 directory=/mnt where the 4 processes will end up in the one shared bfq queue although they do IO to physically very distant files (for some reason I was able to observe this only with slice_idle=1ms setting). Fix the problem by invalidating bfqd->in_serv_last_pos when switching in-service queue. Fixes: 058fdecc6de7 ("block, bfq: fix in-service-queue check for queue merging") CC: stable@vger.kernel.org Signed-off-by: Jan Kara Acked-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 23e293d2943c..4157cfe99ae2 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3007,6 +3007,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, } bfqd->in_service_queue = bfqq; + bfqd->in_serv_last_pos = 0; } /* -- cgit v1.2.3 From 28c6def009192b673f92ea357dfb535ba15e00a4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Jun 2020 16:16:17 +0200 Subject: bfq: Use 'ttime' local variable Use local variable 'ttime' instead of dereferencing bfqq. Signed-off-by: Jan Kara Acked-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4157cfe99ae2..a0471ff97120 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5282,7 +5282,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); - ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; + ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, ttime->ttime_samples); -- cgit v1.2.3 From 7684fbde45169e6de15c180b1c084d2005e99961 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Jun 2020 16:16:18 +0200 Subject: bfq: Use only idle IO periods for think time calculations Currently whenever bfq queue has a request queued we add now - last_completion_time to the think time statistics. This is however misleading in case the process is able to submit several requests in parallel because e.g. if the queue has request completed at time T0 and then queues new requests at times T1, T2, then we will add T1-T0 and T2-T0 to think time statistics which just doesn't make any sence (the queue's think time is penalized by the queue being able to submit more IO). So add to think time statistics only time intervals when the queue had no IO pending. Signed-off-by: Jan Kara Acked-by: Paolo Valente [axboe: fix whitespace on empty line] Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'block/bfq-iosched.c') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a0471ff97120..dfa87e360d71 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5278,8 +5278,16 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_ttime *ttime = &bfqq->ttime; - u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + u64 elapsed; + /* + * We are really interested in how long it takes for the queue to + * become busy when there is no outstanding IO for this queue. So + * ignore cases when the bfq queue has already IO queued. + */ + if (bfqq->dispatched || bfq_bfqq_busy(bfqq)) + return; + elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; -- cgit v1.2.3