From 3f0b0966b30982e843950b170b7a9ddfd8094428 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 31 Jul 2023 20:56:35 +0200 Subject: cpuidle: teo: Update idle duration estimate when choosing shallower state The TEO governor takes CPU utilization into account by refining idle state selection when the utilization is above a certain threshold. This is done by choosing an idle state shallower than the previously selected one. However, when doing this, the idle duration estimate needs to be adjusted so as to prevent the scheduler tick from being stopped when the candidate idle state is shallow, which may lead to excessive energy usage if the CPU is not woken up quickly enough going forward. Moreover, if the scheduler tick has been stopped already and the new idle duration estimate is too small, the replacement candidate state cannot be used. Modify the relevant code to take the above observations into account. Fixes: 9ce0f7c4bc64 ("cpuidle: teo: Introduce util-awareness") Link: https://lore.kernel.org/linux-pm/CAJZ5v0jJxHj65r2HXBTd3wfbZtsg=_StzwO1kA5STDnaPe_dWA@mail.gmail.com Signed-off-by: Rafael J. Wysocki Reviewed-and-tested-by: Kajetan Puchalski --- drivers/cpuidle/governors/teo.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'drivers/cpuidle') diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 987fc5f3997d..2cdc711679a5 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -397,13 +397,23 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * the shallowest non-polling state and exit. */ if (drv->state_count < 3 && cpu_data->utilized) { - for (i = 0; i < drv->state_count; ++i) { - if (!dev->states_usage[i].disable && - !(drv->states[i].flags & CPUIDLE_FLAG_POLLING)) { - idx = i; - goto end; - } - } + /* The CPU is utilized, so assume a short idle duration. */ + duration_ns = teo_middle_of_bin(0, drv); + /* + * If state 0 is enabled and it is not a polling one, select it + * right away unless the scheduler tick has been stopped, in + * which case care needs to be taken to leave the CPU in a deep + * enough state in case it is not woken up any time soon after + * all. If state 1 is disabled, though, state 0 must be used + * anyway. + */ + if ((!idx && !(drv->states[0].flags & CPUIDLE_FLAG_POLLING) && + teo_time_ok(duration_ns)) || dev->states_usage[1].disable) + idx = 0; + else /* Assume that state 1 is not a polling one and use it. */ + idx = 1; + + goto end; } /* @@ -539,10 +549,20 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, /* * If the CPU is being utilized over the threshold, choose a shallower - * non-polling state to improve latency + * non-polling state to improve latency, unless the scheduler tick has + * been stopped already and the shallower state's target residency is + * not sufficiently large. */ - if (cpu_data->utilized) - idx = teo_find_shallower_state(drv, dev, idx, duration_ns, true); + if (cpu_data->utilized) { + s64 span_ns; + + i = teo_find_shallower_state(drv, dev, idx, duration_ns, true); + span_ns = teo_middle_of_bin(i, drv); + if (teo_time_ok(span_ns)) { + idx = i; + duration_ns = span_ns; + } + } end: /* -- cgit v1.2.3 From 04bae4e2267d49eb9cfec403acd0462b8d00d637 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 31 Jul 2023 21:03:09 +0200 Subject: cpuidle: teo: Avoid stopping the tick unnecessarily when bailing out When teo_select() is going to return early in some special cases, make it avoid stopping the tick if the idle state to be returned is shallow. In particular, never stop the tick if state 0 is to be returned. Link: https://lore.kernel.org/linux-pm/CAJZ5v0jJxHj65r2HXBTd3wfbZtsg=_StzwO1kA5STDnaPe_dWA@mail.gmail.com Signed-off-by: Rafael J. Wysocki Reviewed-and-tested-by: Kajetan Puchalski --- drivers/cpuidle/governors/teo.c | 56 ++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 23 deletions(-) (limited to 'drivers/cpuidle') diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 2cdc711679a5..1b76db3689d0 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -382,12 +382,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, /* Check if there is any choice in the first place. */ if (drv->state_count < 2) { idx = 0; - goto end; + goto out_tick; } + if (!dev->states_usage[0].disable) { idx = 0; if (drv->states[1].target_residency_ns > duration_ns) - goto end; + goto out_tick; } cpu_data->utilized = teo_cpu_is_utilized(dev->cpu, cpu_data); @@ -408,11 +409,12 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * anyway. */ if ((!idx && !(drv->states[0].flags & CPUIDLE_FLAG_POLLING) && - teo_time_ok(duration_ns)) || dev->states_usage[1].disable) + teo_time_ok(duration_ns)) || dev->states_usage[1].disable) { idx = 0; - else /* Assume that state 1 is not a polling one and use it. */ - idx = 1; - + goto out_tick; + } + /* Assume that state 1 is not a polling one and use it. */ + idx = 1; goto end; } @@ -459,8 +461,15 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, /* Avoid unnecessary overhead. */ if (idx < 0) { idx = 0; /* No states enabled, must use 0. */ - goto end; - } else if (idx == idx0) { + goto out_tick; + } + + if (idx == idx0) { + /* + * This is the first enabled idle state, so use it, but do not + * allow the tick to be stopped it is shallow enough. + */ + duration_ns = drv->states[idx].target_residency_ns; goto end; } @@ -566,24 +575,25 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, end: /* - * Don't stop the tick if the selected state is a polling one or if the - * expected idle duration is shorter than the tick period length. + * Allow the tick to be stopped unless the selected state is a polling + * one or the expected idle duration is shorter than the tick period + * length. */ - if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || - duration_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { - *stop_tick = false; + if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) && + duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped()) + return idx; - /* - * The tick is not going to be stopped, so if the target - * residency of the state to be returned is not within the time - * till the closest timer including the tick, try to correct - * that. - */ - if (idx > idx0 && - drv->states[idx].target_residency_ns > delta_tick) - idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false); - } + /* + * The tick is not going to be stopped, so if the target residency of + * the state to be returned is not within the time till the closest + * timer including the tick, try to correct that. + */ + if (idx > idx0 && + drv->states[idx].target_residency_ns > delta_tick) + idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false); +out_tick: + *stop_tick = false; return idx; } -- cgit v1.2.3 From 9a41e16f11103c37263271ff2a433f2f3885fced Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 31 Jul 2023 21:04:41 +0200 Subject: cpuidle: teo: Drop utilized from struct teo_cpu Because the utilized field in struct teo_cpu is only used locally in teo_select(), replace it with a local variable in that function. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-and-tested-by: Kajetan Puchalski --- drivers/cpuidle/governors/teo.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers/cpuidle') diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 1b76db3689d0..3915e5639790 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -187,7 +187,6 @@ struct teo_bin { * @next_recent_idx: Index of the next @recent_idx entry to update. * @recent_idx: Indices of bins corresponding to recent "intercepts". * @util_threshold: Threshold above which the CPU is considered utilized - * @utilized: Whether the last sleep on the CPU happened while utilized */ struct teo_cpu { s64 time_span_ns; @@ -197,7 +196,6 @@ struct teo_cpu { int next_recent_idx; int recent_idx[NR_RECENT]; unsigned long util_threshold; - bool utilized; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); @@ -366,6 +364,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, int idx0 = 0, idx = -1; bool alt_intercepts, alt_recent; ktime_t delta_tick; + bool cpu_utilized; s64 duration_ns; int i; @@ -391,13 +390,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, goto out_tick; } - cpu_data->utilized = teo_cpu_is_utilized(dev->cpu, cpu_data); + cpu_utilized = teo_cpu_is_utilized(dev->cpu, cpu_data); /* * If the CPU is being utilized over the threshold and there are only 2 * states to choose from, the metrics need not be considered, so choose * the shallowest non-polling state and exit. */ - if (drv->state_count < 3 && cpu_data->utilized) { + if (drv->state_count < 3 && cpu_utilized) { /* The CPU is utilized, so assume a short idle duration. */ duration_ns = teo_middle_of_bin(0, drv); /* @@ -562,7 +561,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * been stopped already and the shallower state's target residency is * not sufficiently large. */ - if (cpu_data->utilized) { + if (cpu_utilized) { s64 span_ns; i = teo_find_shallower_state(drv, dev, idx, duration_ns, true); -- cgit v1.2.3 From 21d28cd2fa5fc01ee83e64df838ecd72112c09b7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 3 Aug 2023 23:07:41 +0200 Subject: cpuidle: teo: Do not call tick_nohz_get_sleep_length() upfront Because the cost of calling tick_nohz_get_sleep_length() may increase in the future, reorder the code in teo_select() so it first uses the statistics to pick up a candidate idle state and applies the utilization heuristic to it and only then calls tick_nohz_get_sleep_length() to obtain the sleep length value and refine the selection if necessary. This change by itself does not cause tick_nohz_get_sleep_length() to be called less often, but it prepares the code for subsequent changes that will do so. Signed-off-by: Rafael J. Wysocki Tested-by: Kajetan Puchalski Tested-by: Anna-Maria Behnsen --- drivers/cpuidle/governors/teo.c | 105 +++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 61 deletions(-) (limited to 'drivers/cpuidle') diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 3915e5639790..4ef76cdfc5c8 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -306,15 +306,10 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->total += PULSE; } -static bool teo_time_ok(u64 interval_ns) +static bool teo_state_ok(int i, struct cpuidle_driver *drv) { - return !tick_nohz_tick_stopped() || interval_ns >= TICK_NSEC; -} - -static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv) -{ - return (drv->states[idx].target_residency_ns + - drv->states[idx+1].target_residency_ns) / 2; + return !tick_nohz_tick_stopped() || + drv->states[i].target_residency_ns >= TICK_NSEC; } /** @@ -354,6 +349,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + ktime_t delta_tick = TICK_NSEC / 2; unsigned int idx_intercept_sum = 0; unsigned int intercept_sum = 0; unsigned int idx_recent_sum = 0; @@ -363,7 +359,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, int constraint_idx = 0; int idx0 = 0, idx = -1; bool alt_intercepts, alt_recent; - ktime_t delta_tick; bool cpu_utilized; s64 duration_ns; int i; @@ -374,9 +369,11 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } cpu_data->time_span_ns = local_clock(); - - duration_ns = tick_nohz_get_sleep_length(&delta_tick); - cpu_data->sleep_length_ns = duration_ns; + /* + * Set the expected sleep length to infinity in case of an early + * return. + */ + cpu_data->sleep_length_ns = KTIME_MAX; /* Check if there is any choice in the first place. */ if (drv->state_count < 2) { @@ -384,11 +381,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, goto out_tick; } - if (!dev->states_usage[0].disable) { + if (!dev->states_usage[0].disable) idx = 0; - if (drv->states[1].target_residency_ns > duration_ns) - goto out_tick; - } cpu_utilized = teo_cpu_is_utilized(dev->cpu, cpu_data); /* @@ -397,8 +391,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * the shallowest non-polling state and exit. */ if (drv->state_count < 3 && cpu_utilized) { - /* The CPU is utilized, so assume a short idle duration. */ - duration_ns = teo_middle_of_bin(0, drv); /* * If state 0 is enabled and it is not a polling one, select it * right away unless the scheduler tick has been stopped, in @@ -408,22 +400,17 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * anyway. */ if ((!idx && !(drv->states[0].flags & CPUIDLE_FLAG_POLLING) && - teo_time_ok(duration_ns)) || dev->states_usage[1].disable) { + teo_state_ok(0, drv)) || dev->states_usage[1].disable) { idx = 0; goto out_tick; } /* Assume that state 1 is not a polling one and use it. */ idx = 1; + duration_ns = drv->states[1].target_residency_ns; goto end; } - /* - * Find the deepest idle state whose target residency does not exceed - * the current sleep length and the deepest idle state not deeper than - * the former whose exit latency does not exceed the current latency - * constraint. Compute the sums of metrics for early wakeup pattern - * detection. - */ + /* Compute the sums of metrics for early wakeup pattern detection. */ for (i = 1; i < drv->state_count; i++) { struct teo_bin *prev_bin = &cpu_data->state_bins[i-1]; struct cpuidle_state *s = &drv->states[i]; @@ -439,19 +426,15 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (dev->states_usage[i].disable) continue; - if (idx < 0) { - idx = i; /* first enabled state */ - idx0 = i; - } - - if (s->target_residency_ns > duration_ns) - break; + if (idx < 0) + idx0 = i; /* first enabled state */ idx = i; if (s->exit_latency_ns <= latency_req) constraint_idx = i; + /* Save the sums for the current state. */ idx_intercept_sum = intercept_sum; idx_hit_sum = hit_sum; idx_recent_sum = recent_sum; @@ -465,7 +448,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (idx == idx0) { /* - * This is the first enabled idle state, so use it, but do not + * Only one idle state is enabled, so use it, but do not * allow the tick to be stopped it is shallow enough. */ duration_ns = drv->states[idx].target_residency_ns; @@ -479,13 +462,11 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * all of the deeper states, or the sum of the numbers of recent * intercepts over all of the states shallower than the candidate one * is greater than a half of the number of recent events taken into - * account, the CPU is likely to wake up early, so find an alternative - * idle state to select. + * account, a shallower idle state is likely to be a better choice. */ alt_intercepts = 2 * idx_intercept_sum > cpu_data->total - idx_hit_sum; alt_recent = idx_recent_sum > NR_RECENT / 2; if (alt_recent || alt_intercepts) { - s64 first_suitable_span_ns = duration_ns; int first_suitable_idx = idx; /* @@ -494,44 +475,39 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * cases (both with respect to intercepts overall and with * respect to the recent intercepts only) in the past. * - * Take the possible latency constraint and duration limitation - * present if the tick has been stopped already into account. + * Take the possible duration limitation present if the tick + * has been stopped already into account. */ intercept_sum = 0; recent_sum = 0; for (i = idx - 1; i >= 0; i--) { struct teo_bin *bin = &cpu_data->state_bins[i]; - s64 span_ns; intercept_sum += bin->intercepts; recent_sum += bin->recent; - span_ns = teo_middle_of_bin(i, drv); - if ((!alt_recent || 2 * recent_sum > idx_recent_sum) && (!alt_intercepts || 2 * intercept_sum > idx_intercept_sum)) { - if (teo_time_ok(span_ns) && - !dev->states_usage[i].disable) { + /* + * Use the current state unless it is too + * shallow or disabled, in which case take the + * first enabled state that is deep enough. + */ + if (teo_state_ok(i, drv) && + !dev->states_usage[i].disable) idx = i; - duration_ns = span_ns; - } else { - /* - * The current state is too shallow or - * disabled, so take the first enabled - * deeper state with suitable time span. - */ + else idx = first_suitable_idx; - duration_ns = first_suitable_span_ns; - } + break; } if (dev->states_usage[i].disable) continue; - if (!teo_time_ok(span_ns)) { + if (!teo_state_ok(i, drv)) { /* * The current state is too shallow, but if an * alternative candidate state has been found, @@ -543,7 +519,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, break; } - first_suitable_span_ns = span_ns; first_suitable_idx = i; } } @@ -562,14 +537,22 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * not sufficiently large. */ if (cpu_utilized) { - s64 span_ns; + i = teo_find_shallower_state(drv, dev, idx, KTIME_MAX, true); + if (teo_state_ok(i, drv)) + idx = i; + } - i = teo_find_shallower_state(drv, dev, idx, duration_ns, true); - span_ns = teo_middle_of_bin(i, drv); - if (teo_time_ok(span_ns)) { + duration_ns = tick_nohz_get_sleep_length(&delta_tick); + cpu_data->sleep_length_ns = duration_ns; + + /* + * If the closest expected timer is before the terget residency of the + * candidate state, a shallower one needs to be found. + */ + if (drv->states[idx].target_residency_ns > duration_ns) { + i = teo_find_shallower_state(drv, dev, idx, duration_ns, false); + if (teo_state_ok(i, drv)) idx = i; - duration_ns = span_ns; - } } end: -- cgit v1.2.3 From 6da8f9ba5a87f8aecf2cd4441cc6024a77e5b645 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 3 Aug 2023 23:09:18 +0200 Subject: cpuidle: teo: Skip tick_nohz_get_sleep_length() call in some cases Make teo_select() avoid calling tick_nohz_get_sleep_length() if the candidate idle state to return is state 0 or if state 0 is a polling one and the target residency of the current candidate one is below a certain threshold, in which cases it may be assumed that the CPU will be woken up immediately by a non-timer wakeup source and the timers are not likely to matter. Signed-off-by: Rafael J. Wysocki Tested-by: Kajetan Puchalski Tested-by: Anna-Maria Behnsen --- drivers/cpuidle/governors/teo.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'drivers/cpuidle') diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 4ef76cdfc5c8..8248492cad3a 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -166,6 +166,12 @@ */ #define NR_RECENT 9 +/* + * Idle state target residency threshold used for deciding whether or not to + * check the time till the closest expected timer event. + */ +#define RESIDENCY_THRESHOLD_NS (15 * NSEC_PER_USEC) + /** * struct teo_bin - Metrics used by the TEO cpuidle governor. * @intercepts: The "intercepts" metric. @@ -542,6 +548,22 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, idx = i; } + /* + * Skip the timers check if state 0 is the current candidate one, + * because an immediate non-timer wakeup is expected in that case. + */ + if (!idx) + goto out_tick; + + /* + * If state 0 is a polling one, check if the target residency of + * the current candidate state is low enough and skip the timers + * check in that case too. + */ + if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) && + drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) + goto out_tick; + duration_ns = tick_nohz_get_sleep_length(&delta_tick); cpu_data->sleep_length_ns = duration_ns; -- cgit v1.2.3 From 2662342079f54b8a940f7094c197c99458caeb0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 3 Aug 2023 23:11:40 +0200 Subject: cpuidle: teo: Gather statistics regarding whether or not to stop the tick Currently, if the target residency of the deepest idle state is less than the tick period length, which is quite likely for HZ=100, and the deepest idle state is about to be selected by the TEO idle governor, the decision on whether or not to stop the scheduler tick is based entirely on the time till the closest timer. This is often insufficient, because timers may not be in heavy use and there may be a plenty of other CPU wakeup events between the deepest idle state's target residency and the closest tick. Allow the governor to count those events by making the deepest idle state's bin effectively end at TICK_NSEC and introducing an additional "bin" for collecting "hit" events (ie. the ones in which the measured idle duration falls into the same bin as the time till the closest timer) with idle duration values past TICK_NSEC. This way the "intercepts" metric for the deepest idle state's bin becomes nonzero in general, and so it can influence the decision on whether or not to stop the tick possibly increasing the governor's accuracy in that respect. Signed-off-by: Rafael J. Wysocki Tested-by: Kajetan Puchalski Tested-by: Anna-Maria Behnsen --- drivers/cpuidle/governors/teo.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) (limited to 'drivers/cpuidle') diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 8248492cad3a..70846b669255 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -192,6 +192,7 @@ struct teo_bin { * @total: Grand total of the "intercepts" and "hits" metrics for all bins. * @next_recent_idx: Index of the next @recent_idx entry to update. * @recent_idx: Indices of bins corresponding to recent "intercepts". + * @tick_hits: Number of "hits" after TICK_NSEC. * @util_threshold: Threshold above which the CPU is considered utilized */ struct teo_cpu { @@ -201,6 +202,7 @@ struct teo_cpu { unsigned int total; int next_recent_idx; int recent_idx[NR_RECENT]; + unsigned int tick_hits; unsigned long util_threshold; }; @@ -232,6 +234,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); int i, idx_timer = 0, idx_duration = 0; + s64 target_residency_ns; u64 measured_ns; if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { @@ -272,7 +275,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * fall into. */ for (i = 0; i < drv->state_count; i++) { - s64 target_residency_ns = drv->states[i].target_residency_ns; struct teo_bin *bin = &cpu_data->state_bins[i]; bin->hits -= bin->hits >> DECAY_SHIFT; @@ -280,6 +282,8 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->total += bin->hits + bin->intercepts; + target_residency_ns = drv->states[i].target_residency_ns; + if (target_residency_ns <= cpu_data->sleep_length_ns) { idx_timer = i; if (target_residency_ns <= measured_ns) @@ -294,6 +298,26 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) if (cpu_data->recent_idx[i] >= 0) cpu_data->state_bins[cpu_data->recent_idx[i]].recent--; + /* + * If the deepest state's target residency is below the tick length, + * make a record of it to help teo_select() decide whether or not + * to stop the tick. This effectively adds an extra hits-only bin + * beyond the last state-related one. + */ + if (target_residency_ns < TICK_NSEC) { + cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT; + + cpu_data->total += cpu_data->tick_hits; + + if (TICK_NSEC <= cpu_data->sleep_length_ns) { + idx_timer = drv->state_count; + if (TICK_NSEC <= measured_ns) { + cpu_data->tick_hits += PULSE; + goto end; + } + } + } + /* * If the measured idle duration falls into the same bin as the sleep * length, this is a "hit", so update the "hits" metric for that bin. @@ -309,6 +333,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->recent_idx[i] = idx_duration; } +end: cpu_data->total += PULSE; } @@ -356,6 +381,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); ktime_t delta_tick = TICK_NSEC / 2; + unsigned int tick_intercept_sum = 0; unsigned int idx_intercept_sum = 0; unsigned int intercept_sum = 0; unsigned int idx_recent_sum = 0; @@ -429,6 +455,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, hit_sum += prev_bin->hits; recent_sum += prev_bin->recent; + tick_intercept_sum = intercept_sum; + if (dev->states_usage[i].disable) continue; @@ -461,6 +489,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, goto end; } + tick_intercept_sum += cpu_data->state_bins[drv->state_count-1].intercepts; + /* * If the sum of the intercepts metric for all of the idle states * shallower than the current candidate one (idx) is greater than the @@ -577,6 +607,15 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, idx = i; } + /* + * If the selected state's target residency is below the tick length + * and intercepts occurring before the tick length are the majority of + * total wakeup events, do not stop the tick. + */ + if (drv->states[idx].target_residency_ns < TICK_NSEC && + tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8) + duration_ns = TICK_NSEC / 2; + end: /* * Allow the tick to be stopped unless the selected state is a polling -- cgit v1.2.3 From 5484e31bbbff285f9505c4766373f840ffb746e5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Aug 2023 20:36:36 +0200 Subject: cpuidle: menu: Skip tick_nohz_get_sleep_length() call in some cases Because the cost of calling tick_nohz_get_sleep_length() may increase in the future, reorder the code in menu_select() so it first uses the statistics to determine the expected idle duration. If that value is higher than RESIDENCY_THRESHOLD_NS, tick_nohz_get_sleep_length() will be called to obtain the time till the closest timer and refine the idle duration prediction if necessary. This causes the governor to always take the full overhead of get_typical_interval() with the assumption that the cost will be amortized by skipping the tick_nohz_get_sleep_length() call in the cases when the predicted idle duration is relatively very small. Signed-off-by: Rafael J. Wysocki Tested-by: Doug Smythies --- drivers/cpuidle/governors/gov.h | 14 +++++++++ drivers/cpuidle/governors/menu.c | 65 +++++++++++++++++++++++----------------- drivers/cpuidle/governors/teo.c | 9 ++---- 3 files changed, 54 insertions(+), 34 deletions(-) create mode 100644 drivers/cpuidle/governors/gov.h (limited to 'drivers/cpuidle') diff --git a/drivers/cpuidle/governors/gov.h b/drivers/cpuidle/governors/gov.h new file mode 100644 index 000000000000..99e067d9668c --- /dev/null +++ b/drivers/cpuidle/governors/gov.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* Common definitions for cpuidle governors. */ + +#ifndef __CPUIDLE_GOVERNOR_H +#define __CPUIDLE_GOVERNOR_H + +/* + * Idle state target residency threshold used for deciding whether or not to + * check the time till the closest expected timer event. + */ +#define RESIDENCY_THRESHOLD_NS (15 * NSEC_PER_USEC) + +#endif /* __CPUIDLE_GOVERNOR_H */ diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index c4922684f305..b96e3da0fedd 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -19,6 +19,8 @@ #include #include +#include "gov.h" + #define BUCKETS 12 #define INTERVAL_SHIFT 3 #define INTERVALS (1UL << INTERVAL_SHIFT) @@ -166,8 +168,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); * of points is below a threshold. If it is... then use the * average of these 8 points as the estimated value. */ -static unsigned int get_typical_interval(struct menu_device *data, - unsigned int predicted_us) +static unsigned int get_typical_interval(struct menu_device *data) { int i, divisor; unsigned int min, max, thresh, avg; @@ -195,11 +196,7 @@ again: } } - /* - * If the result of the computation is going to be discarded anyway, - * avoid the computation altogether. - */ - if (min >= predicted_us) + if (!max) return UINT_MAX; if (divisor == INTERVALS) @@ -267,7 +264,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, { struct menu_device *data = this_cpu_ptr(&menu_devices); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); - unsigned int predicted_us; u64 predicted_ns; u64 interactivity_req; unsigned int nr_iowaiters; @@ -279,16 +275,41 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, data->needs_update = 0; } - /* determine the expected residency time, round up */ - delta = tick_nohz_get_sleep_length(&delta_tick); - if (unlikely(delta < 0)) { - delta = 0; - delta_tick = 0; - } - data->next_timer_ns = delta; - nr_iowaiters = nr_iowait_cpu(dev->cpu); - data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); + + /* Find the shortest expected idle interval. */ + predicted_ns = get_typical_interval(data) * NSEC_PER_USEC; + if (predicted_ns > RESIDENCY_THRESHOLD_NS) { + unsigned int timer_us; + + /* Determine the time till the closest timer. */ + delta = tick_nohz_get_sleep_length(&delta_tick); + if (unlikely(delta < 0)) { + delta = 0; + delta_tick = 0; + } + + data->next_timer_ns = delta; + data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); + + /* Round up the result for half microseconds. */ + timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 + + data->next_timer_ns * + data->correction_factor[data->bucket], + RESOLUTION * DECAY * NSEC_PER_USEC); + /* Use the lowest expected idle interval to pick the idle state. */ + predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns); + } else { + /* + * Because the next timer event is not going to be determined + * in this case, assume that without the tick the closest timer + * will be in distant future and that the closest tick will occur + * after 1/2 of the tick period. + */ + data->next_timer_ns = KTIME_MAX; + delta_tick = TICK_NSEC / 2; + data->bucket = which_bucket(KTIME_MAX, nr_iowaiters); + } if (unlikely(drv->state_count <= 1 || latency_req == 0) || ((data->next_timer_ns < drv->states[1].target_residency_ns || @@ -303,16 +324,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, return 0; } - /* Round up the result for half microseconds. */ - predicted_us = div_u64(data->next_timer_ns * - data->correction_factor[data->bucket] + - (RESOLUTION * DECAY * NSEC_PER_USEC) / 2, - RESOLUTION * DECAY * NSEC_PER_USEC); - /* Use the lowest expected idle interval to pick the idle state. */ - predicted_ns = (u64)min(predicted_us, - get_typical_interval(data, predicted_us)) * - NSEC_PER_USEC; - if (tick_nohz_tick_stopped()) { /* * If the tick is already stopped, the cost of possible short diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 70846b669255..31050fdc633f 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -140,6 +140,8 @@ #include #include +#include "gov.h" + /* * The number of bits to shift the CPU's capacity by in order to determine * the utilized threshold. @@ -152,7 +154,6 @@ */ #define UTIL_THRESHOLD_SHIFT 6 - /* * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value * is used for decreasing metrics on a regular basis. @@ -166,12 +167,6 @@ */ #define NR_RECENT 9 -/* - * Idle state target residency threshold used for deciding whether or not to - * check the time till the closest expected timer event. - */ -#define RESIDENCY_THRESHOLD_NS (15 * NSEC_PER_USEC) - /** * struct teo_bin - Metrics used by the TEO cpuidle governor. * @intercepts: The "intercepts" metric. -- cgit v1.2.3 From 78aabcb3211aa4b7e8f8a1d5d4bdad699a42d4ba Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 22 Aug 2023 13:28:02 +0200 Subject: cpuidle: teo: Avoid unnecessary variable assignments Notice that it is not necessary to assign tick_intercept_sum in every iteration of the first loop over idle states in teo_select(), because the intercept_sum value does not change after the assignment in a given iteration of the loop, so its value after the last iteration of the loop can be used for computing the tick_intercept_sum value directly. Modify the code accordingly. No intentional functional impact. Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/cpuidle') diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 31050fdc633f..7244f71c59c5 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -450,8 +450,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, hit_sum += prev_bin->hits; recent_sum += prev_bin->recent; - tick_intercept_sum = intercept_sum; - if (dev->states_usage[i].disable) continue; @@ -484,7 +482,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, goto end; } - tick_intercept_sum += cpu_data->state_bins[drv->state_count-1].intercepts; + tick_intercept_sum = intercept_sum + + cpu_data->state_bins[drv->state_count-1].intercepts; /* * If the sum of the intercepts metric for all of the idle states -- cgit v1.2.3