diff options
-rw-r--r-- | drivers/cpuidle/governors/gov.h | 14 | ||||
-rw-r--r-- | drivers/cpuidle/governors/menu.c | 65 | ||||
-rw-r--r-- | drivers/cpuidle/governors/teo.c | 235 | ||||
-rw-r--r-- | drivers/devfreq/devfreq.c | 10 | ||||
-rw-r--r-- | drivers/devfreq/imx-bus.c | 2 | ||||
-rw-r--r-- | drivers/devfreq/imx8m-ddrc.c | 2 | ||||
-rw-r--r-- | drivers/devfreq/mtk-cci-devfreq.c | 1 | ||||
-rw-r--r-- | drivers/devfreq/tegra30-devfreq.c | 2 | ||||
-rw-r--r-- | drivers/powercap/arm_scmi_powercap.c | 159 | ||||
-rw-r--r-- | drivers/powercap/intel_rapl_common.c | 2 | ||||
-rw-r--r-- | include/linux/pm_runtime.h | 2 | ||||
-rw-r--r-- | include/linux/pm_wakeup.h | 10 | ||||
-rw-r--r-- | kernel/power/qos.c | 9 | ||||
-rw-r--r-- | kernel/power/snapshot.c | 187 | ||||
-rw-r--r-- | tools/power/cpupower/Makefile | 2 | ||||
-rw-r--r-- | tools/power/cpupower/lib/cpupower.c | 7 | ||||
-rw-r--r-- | tools/power/cpupower/lib/cpupower_intern.h | 1 | ||||
-rw-r--r-- | tools/power/cpupower/utils/cpuidle-set.c | 16 | ||||
-rw-r--r-- | tools/power/cpupower/utils/cpupower-set.c | 65 | ||||
-rw-r--r-- | tools/power/cpupower/utils/helpers/helpers.h | 11 | ||||
-rw-r--r-- | tools/power/cpupower/utils/helpers/misc.c | 57 |
21 files changed, 617 insertions, 242 deletions
diff --git a/drivers/cpuidle/governors/gov.h b/drivers/cpuidle/governors/gov.h new file mode 100644 index 000000000000..99e067d9668c --- /dev/null +++ b/drivers/cpuidle/governors/gov.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* Common definitions for cpuidle governors. */ + +#ifndef __CPUIDLE_GOVERNOR_H +#define __CPUIDLE_GOVERNOR_H + +/* + * Idle state target residency threshold used for deciding whether or not to + * check the time till the closest expected timer event. + */ +#define RESIDENCY_THRESHOLD_NS (15 * NSEC_PER_USEC) + +#endif /* __CPUIDLE_GOVERNOR_H */ diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index c4922684f305..b96e3da0fedd 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -19,6 +19,8 @@ #include <linux/sched/stat.h> #include <linux/math64.h> +#include "gov.h" + #define BUCKETS 12 #define INTERVAL_SHIFT 3 #define INTERVALS (1UL << INTERVAL_SHIFT) @@ -166,8 +168,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); * of points is below a threshold. If it is... then use the * average of these 8 points as the estimated value. */ -static unsigned int get_typical_interval(struct menu_device *data, - unsigned int predicted_us) +static unsigned int get_typical_interval(struct menu_device *data) { int i, divisor; unsigned int min, max, thresh, avg; @@ -195,11 +196,7 @@ again: } } - /* - * If the result of the computation is going to be discarded anyway, - * avoid the computation altogether. 
- */ - if (min >= predicted_us) + if (!max) return UINT_MAX; if (divisor == INTERVALS) @@ -267,7 +264,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, { struct menu_device *data = this_cpu_ptr(&menu_devices); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); - unsigned int predicted_us; u64 predicted_ns; u64 interactivity_req; unsigned int nr_iowaiters; @@ -279,16 +275,41 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, data->needs_update = 0; } - /* determine the expected residency time, round up */ - delta = tick_nohz_get_sleep_length(&delta_tick); - if (unlikely(delta < 0)) { - delta = 0; - delta_tick = 0; - } - data->next_timer_ns = delta; - nr_iowaiters = nr_iowait_cpu(dev->cpu); - data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); + + /* Find the shortest expected idle interval. */ + predicted_ns = get_typical_interval(data) * NSEC_PER_USEC; + if (predicted_ns > RESIDENCY_THRESHOLD_NS) { + unsigned int timer_us; + + /* Determine the time till the closest timer. */ + delta = tick_nohz_get_sleep_length(&delta_tick); + if (unlikely(delta < 0)) { + delta = 0; + delta_tick = 0; + } + + data->next_timer_ns = delta; + data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); + + /* Round up the result for half microseconds. */ + timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 + + data->next_timer_ns * + data->correction_factor[data->bucket], + RESOLUTION * DECAY * NSEC_PER_USEC); + /* Use the lowest expected idle interval to pick the idle state. */ + predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns); + } else { + /* + * Because the next timer event is not going to be determined + * in this case, assume that without the tick the closest timer + * will be in distant future and that the closest tick will occur + * after 1/2 of the tick period. 
+ */ + data->next_timer_ns = KTIME_MAX; + delta_tick = TICK_NSEC / 2; + data->bucket = which_bucket(KTIME_MAX, nr_iowaiters); + } if (unlikely(drv->state_count <= 1 || latency_req == 0) || ((data->next_timer_ns < drv->states[1].target_residency_ns || @@ -303,16 +324,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, return 0; } - /* Round up the result for half microseconds. */ - predicted_us = div_u64(data->next_timer_ns * - data->correction_factor[data->bucket] + - (RESOLUTION * DECAY * NSEC_PER_USEC) / 2, - RESOLUTION * DECAY * NSEC_PER_USEC); - /* Use the lowest expected idle interval to pick the idle state. */ - predicted_ns = (u64)min(predicted_us, - get_typical_interval(data, predicted_us)) * - NSEC_PER_USEC; - if (tick_nohz_tick_stopped()) { /* * If the tick is already stopped, the cost of possible short diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 987fc5f3997d..7244f71c59c5 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -140,6 +140,8 @@ #include <linux/sched/topology.h> #include <linux/tick.h> +#include "gov.h" + /* * The number of bits to shift the CPU's capacity by in order to determine * the utilized threshold. @@ -152,7 +154,6 @@ */ #define UTIL_THRESHOLD_SHIFT 6 - /* * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value * is used for decreasing metrics on a regular basis. @@ -186,8 +187,8 @@ struct teo_bin { * @total: Grand total of the "intercepts" and "hits" metrics for all bins. * @next_recent_idx: Index of the next @recent_idx entry to update. * @recent_idx: Indices of bins corresponding to recent "intercepts". + * @tick_hits: Number of "hits" after TICK_NSEC. 
* @util_threshold: Threshold above which the CPU is considered utilized - * @utilized: Whether the last sleep on the CPU happened while utilized */ struct teo_cpu { s64 time_span_ns; @@ -196,8 +197,8 @@ struct teo_cpu { unsigned int total; int next_recent_idx; int recent_idx[NR_RECENT]; + unsigned int tick_hits; unsigned long util_threshold; - bool utilized; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); @@ -228,6 +229,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); int i, idx_timer = 0, idx_duration = 0; + s64 target_residency_ns; u64 measured_ns; if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { @@ -268,7 +270,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * fall into. */ for (i = 0; i < drv->state_count; i++) { - s64 target_residency_ns = drv->states[i].target_residency_ns; struct teo_bin *bin = &cpu_data->state_bins[i]; bin->hits -= bin->hits >> DECAY_SHIFT; @@ -276,6 +277,8 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->total += bin->hits + bin->intercepts; + target_residency_ns = drv->states[i].target_residency_ns; + if (target_residency_ns <= cpu_data->sleep_length_ns) { idx_timer = i; if (target_residency_ns <= measured_ns) @@ -291,6 +294,26 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->state_bins[cpu_data->recent_idx[i]].recent--; /* + * If the deepest state's target residency is below the tick length, + * make a record of it to help teo_select() decide whether or not + * to stop the tick. This effectively adds an extra hits-only bin + * beyond the last state-related one. 
+ */ + if (target_residency_ns < TICK_NSEC) { + cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT; + + cpu_data->total += cpu_data->tick_hits; + + if (TICK_NSEC <= cpu_data->sleep_length_ns) { + idx_timer = drv->state_count; + if (TICK_NSEC <= measured_ns) { + cpu_data->tick_hits += PULSE; + goto end; + } + } + } + + /* * If the measured idle duration falls into the same bin as the sleep * length, this is a "hit", so update the "hits" metric for that bin. * Otherwise, update the "intercepts" metric for the bin fallen into by @@ -305,18 +328,14 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->recent_idx[i] = idx_duration; } +end: cpu_data->total += PULSE; } -static bool teo_time_ok(u64 interval_ns) +static bool teo_state_ok(int i, struct cpuidle_driver *drv) { - return !tick_nohz_tick_stopped() || interval_ns >= TICK_NSEC; -} - -static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv) -{ - return (drv->states[idx].target_residency_ns + - drv->states[idx+1].target_residency_ns) / 2; + return !tick_nohz_tick_stopped() || + drv->states[i].target_residency_ns >= TICK_NSEC; } /** @@ -356,6 +375,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + ktime_t delta_tick = TICK_NSEC / 2; + unsigned int tick_intercept_sum = 0; unsigned int idx_intercept_sum = 0; unsigned int intercept_sum = 0; unsigned int idx_recent_sum = 0; @@ -365,7 +386,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, int constraint_idx = 0; int idx0 = 0, idx = -1; bool alt_intercepts, alt_recent; - ktime_t delta_tick; + bool cpu_utilized; s64 duration_ns; int i; @@ -375,44 +396,48 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } cpu_data->time_span_ns = local_clock(); - - duration_ns = tick_nohz_get_sleep_length(&delta_tick); - 
cpu_data->sleep_length_ns = duration_ns; + /* + * Set the expected sleep length to infinity in case of an early + * return. + */ + cpu_data->sleep_length_ns = KTIME_MAX; /* Check if there is any choice in the first place. */ if (drv->state_count < 2) { idx = 0; - goto end; + goto out_tick; } - if (!dev->states_usage[0].disable) { + + if (!dev->states_usage[0].disable) idx = 0; - if (drv->states[1].target_residency_ns > duration_ns) - goto end; - } - cpu_data->utilized = teo_cpu_is_utilized(dev->cpu, cpu_data); + cpu_utilized = teo_cpu_is_utilized(dev->cpu, cpu_data); /* * If the CPU is being utilized over the threshold and there are only 2 * states to choose from, the metrics need not be considered, so choose * the shallowest non-polling state and exit. */ - if (drv->state_count < 3 && cpu_data->utilized) { - for (i = 0; i < drv->state_count; ++i) { - if (!dev->states_usage[i].disable && - !(drv->states[i].flags & CPUIDLE_FLAG_POLLING)) { - idx = i; - goto end; - } + if (drv->state_count < 3 && cpu_utilized) { + /* + * If state 0 is enabled and it is not a polling one, select it + * right away unless the scheduler tick has been stopped, in + * which case care needs to be taken to leave the CPU in a deep + * enough state in case it is not woken up any time soon after + * all. If state 1 is disabled, though, state 0 must be used + * anyway. + */ + if ((!idx && !(drv->states[0].flags & CPUIDLE_FLAG_POLLING) && + teo_state_ok(0, drv)) || dev->states_usage[1].disable) { + idx = 0; + goto out_tick; } + /* Assume that state 1 is not a polling one and use it. */ + idx = 1; + duration_ns = drv->states[1].target_residency_ns; + goto end; } - /* - * Find the deepest idle state whose target residency does not exceed - * the current sleep length and the deepest idle state not deeper than - * the former whose exit latency does not exceed the current latency - * constraint. Compute the sums of metrics for early wakeup pattern - * detection. 
- */ + /* Compute the sums of metrics for early wakeup pattern detection. */ for (i = 1; i < drv->state_count; i++) { struct teo_bin *prev_bin = &cpu_data->state_bins[i-1]; struct cpuidle_state *s = &drv->states[i]; @@ -428,19 +453,15 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (dev->states_usage[i].disable) continue; - if (idx < 0) { - idx = i; /* first enabled state */ - idx0 = i; - } - - if (s->target_residency_ns > duration_ns) - break; + if (idx < 0) + idx0 = i; /* first enabled state */ idx = i; if (s->exit_latency_ns <= latency_req) constraint_idx = i; + /* Save the sums for the current state. */ idx_intercept_sum = intercept_sum; idx_hit_sum = hit_sum; idx_recent_sum = recent_sum; @@ -449,11 +470,21 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, /* Avoid unnecessary overhead. */ if (idx < 0) { idx = 0; /* No states enabled, must use 0. */ - goto end; - } else if (idx == idx0) { + goto out_tick; + } + + if (idx == idx0) { + /* + * Only one idle state is enabled, so use it, but do not + * allow the tick to be stopped if it is shallow enough. + */ + duration_ns = drv->states[idx].target_residency_ns; goto end; } + tick_intercept_sum = intercept_sum + + cpu_data->state_bins[drv->state_count-1].intercepts; + /* * If the sum of the intercepts metric for all of the idle states * shallower than the current candidate one (idx) is greater than the @@ -461,13 +492,11 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * all of the deeper states, or the sum of the numbers of recent * intercepts over all of the states shallower than the candidate one * is greater than a half of the number of recent events taken into - * account, the CPU is likely to wake up early, so find an alternative - * idle state to select. + * account, a shallower idle state is likely to be a better choice. 
*/ alt_intercepts = 2 * idx_intercept_sum > cpu_data->total - idx_hit_sum; alt_recent = idx_recent_sum > NR_RECENT / 2; if (alt_recent || alt_intercepts) { - s64 first_suitable_span_ns = duration_ns; int first_suitable_idx = idx; /* @@ -476,44 +505,39 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * cases (both with respect to intercepts overall and with * respect to the recent intercepts only) in the past. * - * Take the possible latency constraint and duration limitation - * present if the tick has been stopped already into account. + * Take the possible duration limitation present if the tick + * has been stopped already into account. */ intercept_sum = 0; recent_sum = 0; for (i = idx - 1; i >= 0; i--) { struct teo_bin *bin = &cpu_data->state_bins[i]; - s64 span_ns; intercept_sum += bin->intercepts; recent_sum += bin->recent; - span_ns = teo_middle_of_bin(i, drv); - if ((!alt_recent || 2 * recent_sum > idx_recent_sum) && (!alt_intercepts || 2 * intercept_sum > idx_intercept_sum)) { - if (teo_time_ok(span_ns) && - !dev->states_usage[i].disable) { + /* + * Use the current state unless it is too + * shallow or disabled, in which case take the + * first enabled state that is deep enough. + */ + if (teo_state_ok(i, drv) && + !dev->states_usage[i].disable) idx = i; - duration_ns = span_ns; - } else { - /* - * The current state is too shallow or - * disabled, so take the first enabled - * deeper state with suitable time span. 
- */ + else idx = first_suitable_idx; - duration_ns = first_suitable_span_ns; - } + break; } if (dev->states_usage[i].disable) continue; - if (!teo_time_ok(span_ns)) { + if (!teo_state_ok(i, drv)) { /* * The current state is too shallow, but if an * alternative candidate state has been found, @@ -525,7 +549,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, break; } - first_suitable_span_ns = span_ns; first_suitable_idx = i; } } @@ -539,31 +562,75 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, /* * If the CPU is being utilized over the threshold, choose a shallower - * non-polling state to improve latency + * non-polling state to improve latency, unless the scheduler tick has + * been stopped already and the shallower state's target residency is + * not sufficiently large. */ - if (cpu_data->utilized) - idx = teo_find_shallower_state(drv, dev, idx, duration_ns, true); + if (cpu_utilized) { + i = teo_find_shallower_state(drv, dev, idx, KTIME_MAX, true); + if (teo_state_ok(i, drv)) + idx = i; + } -end: /* - * Don't stop the tick if the selected state is a polling one or if the - * expected idle duration is shorter than the tick period length. + * Skip the timers check if state 0 is the current candidate one, + * because an immediate non-timer wakeup is expected in that case. */ - if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || - duration_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { - *stop_tick = false; + if (!idx) + goto out_tick; - /* - * The tick is not going to be stopped, so if the target - * residency of the state to be returned is not within the time - * till the closest timer including the tick, try to correct - * that. 
- */ - if (idx > idx0 && - drv->states[idx].target_residency_ns > delta_tick) - idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false); + /* + * If state 0 is a polling one, check if the target residency of + * the current candidate state is low enough and skip the timers + * check in that case too. + */ + if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) && + drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) + goto out_tick; + + duration_ns = tick_nohz_get_sleep_length(&delta_tick); + cpu_data->sleep_length_ns = duration_ns; + + /* + * If the closest expected timer is before the target residency of the + * candidate state, a shallower one needs to be found. + */ + if (drv->states[idx].target_residency_ns > duration_ns) { + i = teo_find_shallower_state(drv, dev, idx, duration_ns, false); + if (teo_state_ok(i, drv)) + idx = i; } + /* + * If the selected state's target residency is below the tick length + * and intercepts occurring before the tick length are the majority of + * total wakeup events, do not stop the tick. + */ + if (drv->states[idx].target_residency_ns < TICK_NSEC && + tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8) + duration_ns = TICK_NSEC / 2; + +end: + /* + * Allow the tick to be stopped unless the selected state is a polling + * one or the expected idle duration is shorter than the tick period + * length. + */ + if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) && + duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped()) + return idx; + + /* + * The tick is not going to be stopped, so if the target residency of + * the state to be returned is not within the time till the closest + * timer including the tick, try to correct that. 
+ */ + if (idx > idx0 && + drv->states[idx].target_residency_ns > delta_tick) + idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false); + +out_tick: + *stop_tick = false; return idx; } diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index e36cbb920ec8..474d81831ad3 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -472,10 +472,11 @@ static void devfreq_monitor(struct work_struct *work) * devfreq_monitor_start() - Start load monitoring of devfreq instance * @devfreq: the devfreq instance. * - * Helper function for starting devfreq device load monitoring. By - * default delayed work based monitoring is supported. Function - * to be called from governor in response to DEVFREQ_GOV_START - * event when device is added to devfreq framework. + * Helper function for starting devfreq device load monitoring. By default, + * deferrable timer is used for load monitoring. But the users can change this + * behavior using the "timer" type in devfreq_dev_profile. This function will be + * called by devfreq governor in response to the DEVFREQ_GOV_START event + * generated while adding a device to the devfreq framework. 
*/ void devfreq_monitor_start(struct devfreq *devfreq) { @@ -763,6 +764,7 @@ static void devfreq_dev_release(struct device *dev) dev_pm_opp_put_opp_table(devfreq->opp_table); mutex_destroy(&devfreq->lock); + srcu_cleanup_notifier_head(&devfreq->transition_notifier_list); kfree(devfreq); } diff --git a/drivers/devfreq/imx-bus.c b/drivers/devfreq/imx-bus.c index a727067980fb..86850b7dea09 100644 --- a/drivers/devfreq/imx-bus.c +++ b/drivers/devfreq/imx-bus.c @@ -7,7 +7,7 @@ #include <linux/devfreq.h> #include <linux/device.h> #include <linux/module.h> -#include <linux/of_device.h> +#include <linux/of.h> #include <linux/pm_opp.h> #include <linux/platform_device.h> #include <linux/slab.h> diff --git a/drivers/devfreq/imx8m-ddrc.c b/drivers/devfreq/imx8m-ddrc.c index 16636973eb10..e1348490c8aa 100644 --- a/drivers/devfreq/imx8m-ddrc.c +++ b/drivers/devfreq/imx8m-ddrc.c @@ -3,9 +3,9 @@ * Copyright 2019 NXP */ +#include <linux/mod_devicetable.h> #include <linux/module.h> #include <linux/device.h> -#include <linux/of_device.h> #include <linux/platform_device.h> #include <linux/devfreq.h> #include <linux/pm_opp.h> diff --git a/drivers/devfreq/mtk-cci-devfreq.c b/drivers/devfreq/mtk-cci-devfreq.c index 6354622eda65..83a73f0ccd80 100644 --- a/drivers/devfreq/mtk-cci-devfreq.c +++ b/drivers/devfreq/mtk-cci-devfreq.c @@ -8,7 +8,6 @@ #include <linux/minmax.h> #include <linux/module.h> #include <linux/of.h> -#include <linux/of_device.h> #include <linux/platform_device.h> #include <linux/pm_opp.h> #include <linux/regulator/consumer.h> diff --git a/drivers/devfreq/tegra30-devfreq.c b/drivers/devfreq/tegra30-devfreq.c index 503376b894b6..4a4f0106ab9d 100644 --- a/drivers/devfreq/tegra30-devfreq.c +++ b/drivers/devfreq/tegra30-devfreq.c @@ -13,7 +13,7 @@ #include <linux/io.h> #include <linux/irq.h> #include <linux/module.h> -#include <linux/of_device.h> +#include <linux/of.h> #include <linux/platform_device.h> #include <linux/pm_opp.h> #include <linux/reset.h> diff --git 
a/drivers/powercap/arm_scmi_powercap.c b/drivers/powercap/arm_scmi_powercap.c index 5231f6d52ae3..a081f177e702 100644 --- a/drivers/powercap/arm_scmi_powercap.c +++ b/drivers/powercap/arm_scmi_powercap.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/powercap.h> #include <linux/scmi_protocol.h> +#include <linux/slab.h> #define to_scmi_powercap_zone(z) \ container_of(z, struct scmi_powercap_zone, zone) @@ -19,6 +20,8 @@ static const struct scmi_powercap_proto_ops *powercap_ops; struct scmi_powercap_zone { + bool registered; + bool invalid; unsigned int height; struct device *dev; struct scmi_protocol_handle *ph; @@ -32,6 +35,7 @@ struct scmi_powercap_root { unsigned int num_zones; struct scmi_powercap_zone *spzones; struct list_head *registered_zones; + struct list_head scmi_zones; }; static struct powercap_control_type *scmi_top_pcntrl; @@ -271,12 +275,6 @@ static void scmi_powercap_unregister_all_zones(struct scmi_powercap_root *pr) } } -static inline bool -scmi_powercap_is_zone_registered(struct scmi_powercap_zone *spz) -{ - return !list_empty(&spz->node); -} - static inline unsigned int scmi_powercap_get_zone_height(struct scmi_powercap_zone *spz) { @@ -295,11 +293,46 @@ scmi_powercap_get_parent_zone(struct scmi_powercap_zone *spz) return &spz->spzones[spz->info->parent_id]; } +static int scmi_powercap_register_zone(struct scmi_powercap_root *pr, + struct scmi_powercap_zone *spz, + struct scmi_powercap_zone *parent) +{ + int ret = 0; + struct powercap_zone *z; + + if (spz->invalid) { + list_del(&spz->node); + return -EINVAL; + } + + z = powercap_register_zone(&spz->zone, scmi_top_pcntrl, spz->info->name, + parent ? &parent->zone : NULL, + &zone_ops, 1, &constraint_ops); + if (!IS_ERR(z)) { + spz->height = scmi_powercap_get_zone_height(spz); + spz->registered = true; + list_move(&spz->node, &pr->registered_zones[spz->height]); + dev_dbg(spz->dev, "Registered node %s - parent %s - height:%d\n", + spz->info->name, parent ? 
parent->info->name : "ROOT", + spz->height); + } else { + list_del(&spz->node); + ret = PTR_ERR(z); + dev_err(spz->dev, + "Error registering node:%s - parent:%s - h:%d - ret:%d\n", + spz->info->name, + parent ? parent->info->name : "ROOT", + spz->height, ret); + } + + return ret; +} + /** - * scmi_powercap_register_zone - Register an SCMI powercap zone recursively + * scmi_zones_register - Register SCMI powercap zones starting from parent zones * + * @dev: A reference to the SCMI device * @pr: A reference to the root powercap zones descriptors - * @spz: A reference to the SCMI powercap zone to register * * When registering SCMI powercap zones with the powercap framework we should * take care to always register zones starting from the root ones and to @@ -309,10 +342,10 @@ scmi_powercap_get_parent_zone(struct scmi_powercap_zone *spz) * zones provided by the SCMI platform firmware is built to comply with such * requirement. * - * This function, given an SCMI powercap zone to register, takes care to walk - * the SCMI powercap zones tree up to the root looking recursively for - * unregistered parent zones before registering the provided zone; at the same - * time each registered zone height in such a tree is accounted for and each + * This function, given the set of SCMI powercap zones to register, takes care + * to walk the SCMI powercap zones trees up to the root registering any + * unregistered parent zone before registering the child zones; at the same + * time each registered-zone height in such a tree is accounted for and each * zone, once registered, is stored in the @registered_zones array that is * indexed by zone height: this way will be trivial, at unregister time, to walk * the @registered_zones array backward and unregister all the zones starting @@ -330,57 +363,55 @@ scmi_powercap_get_parent_zone(struct scmi_powercap_zone *spz) * * Return: 0 on Success */ -static int scmi_powercap_register_zone(struct scmi_powercap_root *pr, - struct scmi_powercap_zone 
*spz) +static int scmi_zones_register(struct device *dev, + struct scmi_powercap_root *pr) { int ret = 0; - struct scmi_powercap_zone *parent; - - if (!spz->info) - return ret; + unsigned int sp = 0, reg_zones = 0; + struct scmi_powercap_zone *spz, **zones_stack; - parent = scmi_powercap_get_parent_zone(spz); - if (parent && !scmi_powercap_is_zone_registered(parent)) { - /* - * Bail out if a parent domain was marked as unsupported: - * only domains participating as leaves can be skipped. - */ - if (!parent->info) - return -ENODEV; + zones_stack = kcalloc(pr->num_zones, sizeof(spz), GFP_KERNEL); + if (!zones_stack) + return -ENOMEM; - ret = scmi_powercap_register_zone(pr, parent); - if (ret) - return ret; - } + spz = list_first_entry_or_null(&pr->scmi_zones, + struct scmi_powercap_zone, node); + while (spz) { + struct scmi_powercap_zone *parent; - if (!scmi_powercap_is_zone_registered(spz)) { - struct powercap_zone *z; - - z = powercap_register_zone(&spz->zone, - scmi_top_pcntrl, - spz->info->name, - parent ? &parent->zone : NULL, - &zone_ops, 1, &constraint_ops); - if (!IS_ERR(z)) { - spz->height = scmi_powercap_get_zone_height(spz); - list_add(&spz->node, - &pr->registered_zones[spz->height]); - dev_dbg(spz->dev, - "Registered node %s - parent %s - height:%d\n", - spz->info->name, - parent ? parent->info->name : "ROOT", - spz->height); - ret = 0; + parent = scmi_powercap_get_parent_zone(spz); + if (parent && !parent->registered) { + zones_stack[sp++] = spz; + spz = parent; } else { - ret = PTR_ERR(z); - dev_err(spz->dev, - "Error registering node:%s - parent:%s - h:%d - ret:%d\n", - spz->info->name, - parent ? parent->info->name : "ROOT", - spz->height, ret); + ret = scmi_powercap_register_zone(pr, spz, parent); + if (!ret) { + reg_zones++; + } else if (sp) { + /* Failed to register a non-leaf zone. + * Bail-out. 
+ */ + dev_err(dev, + "Failed to register non-leaf zone - ret:%d\n", + ret); + scmi_powercap_unregister_all_zones(pr); + reg_zones = 0; + goto out; + } + /* Pick next zone to process */ + if (sp) + spz = zones_stack[--sp]; + else + spz = list_first_entry_or_null(&pr->scmi_zones, + struct scmi_powercap_zone, + node); } } +out: + kfree(zones_stack); + dev_info(dev, "Registered %d SCMI Powercap domains !\n", reg_zones); + return ret; } @@ -424,6 +455,8 @@ static int scmi_powercap_probe(struct scmi_device *sdev) if (!pr->registered_zones) return -ENOMEM; + INIT_LIST_HEAD(&pr->scmi_zones); + for (i = 0, spz = pr->spzones; i < pr->num_zones; i++, spz++) { /* * Powercap domains are validate by the protocol layer, i.e. @@ -438,6 +471,7 @@ static int scmi_powercap_probe(struct scmi_device *sdev) INIT_LIST_HEAD(&spz->node); INIT_LIST_HEAD(&pr->registered_zones[i]); + list_add_tail(&spz->node, &pr->scmi_zones); /* * Forcibly skip powercap domains using an abstract scale. * Note that only leaves domains can be skipped, so this could @@ -448,7 +482,7 @@ static int scmi_powercap_probe(struct scmi_device *sdev) dev_warn(dev, "Abstract power scale not supported. Skip %s.\n", spz->info->name); - spz->info = NULL; + spz->invalid = true; continue; } } @@ -457,21 +491,12 @@ static int scmi_powercap_probe(struct scmi_device *sdev) * Scan array of retrieved SCMI powercap domains and register them * recursively starting from the root domains. 
*/ - for (i = 0, spz = pr->spzones; i < pr->num_zones; i++, spz++) { - ret = scmi_powercap_register_zone(pr, spz); - if (ret) { - dev_err(dev, - "Failed to register powercap zone %s - ret:%d\n", - spz->info->name, ret); - scmi_powercap_unregister_all_zones(pr); - return ret; - } - } + ret = scmi_zones_register(dev, pr); + if (ret) + return ret; dev_set_drvdata(dev, pr); - dev_info(dev, "Registered %d SCMI Powercap domains !\n", pr->num_zones); - return ret; } diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 8fac57b28f8a..f173fba828ef 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1485,7 +1485,7 @@ static int rapl_detect_domains(struct rapl_package *rp) } pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name); - rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain), + rp->domains = kcalloc(rp->nr_domains, sizeof(struct rapl_domain), GFP_KERNEL); if (!rp->domains) return -ENOMEM; diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 9a8151a2bdea..7c9b35448563 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -85,8 +85,6 @@ extern void pm_runtime_irq_safe(struct device *dev); extern void __pm_runtime_use_autosuspend(struct device *dev, bool use); extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay); extern u64 pm_runtime_autosuspend_expiration(struct device *dev); -extern void pm_runtime_update_max_time_suspended(struct device *dev, - s64 delta_ns); extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable); extern void pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index 77f4849e3418..6eb9adaef52b 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -194,6 +194,16 @@ static inline void pm_wakeup_dev_event(struct device *dev, 
unsigned int msec, #endif /* !CONFIG_PM_SLEEP */ +static inline bool device_awake_path(struct device *dev) +{ + return device_wakeup_path(dev); +} + +static inline void device_set_awake_path(struct device *dev) +{ + device_set_wakeup_path(dev); +} + static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) { return pm_wakeup_ws_event(ws, msec, false); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 782d3b41c1f3..4244b069442e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -220,6 +220,11 @@ static struct pm_qos_constraints cpu_latency_constraints = { .type = PM_QOS_MIN, }; +static inline bool cpu_latency_qos_value_invalid(s32 value) +{ + return value < 0 && value != PM_QOS_DEFAULT_VALUE; +} + /** * cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit. */ @@ -263,7 +268,7 @@ static void cpu_latency_qos_apply(struct pm_qos_request *req, */ void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) { - if (!req) + if (!req || cpu_latency_qos_value_invalid(value)) return; if (cpu_latency_qos_request_active(req)) { @@ -289,7 +294,7 @@ EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request); */ void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) { - if (!req) + if (!req || cpu_latency_qos_value_invalid(new_value)) return; if (!cpu_latency_qos_request_active(req)) { diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0415d5ecb977..87e9f7e2bdc0 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -404,6 +404,7 @@ struct bm_position { struct mem_zone_bm_rtree *zone; struct rtree_node *node; unsigned long node_pfn; + unsigned long cur_pfn; int node_bit; }; @@ -589,6 +590,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm) bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); bm->cur.node_pfn = 0; + bm->cur.cur_pfn = BM_END_OF_MAP; bm->cur.node_bit = 0; } @@ -799,6 +801,7 @@ node_found: bm->cur.zone = zone; 
bm->cur.node = node; bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; + bm->cur.cur_pfn = pfn; /* Set return values */ *addr = node->data; @@ -850,6 +853,11 @@ static void memory_bm_clear_current(struct memory_bitmap *bm) clear_bit(bit, bm->cur.node->data); } +static unsigned long memory_bm_get_current(struct memory_bitmap *bm) +{ + return bm->cur.cur_pfn; +} + static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; @@ -929,10 +937,12 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) if (bit < bits) { pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; bm->cur.node_bit = bit + 1; + bm->cur.cur_pfn = pfn; return pfn; } } while (rtree_next_node(bm)); + bm->cur.cur_pfn = BM_END_OF_MAP; return BM_END_OF_MAP; } @@ -1423,14 +1433,19 @@ static unsigned int count_data_pages(void) /* * This is needed, because copy_page and memcpy are not usable for copying - * task structs. + * task structs. Returns true if the page was filled with only zeros, + * otherwise false. */ -static inline void do_copy_page(long *dst, long *src) +static inline bool do_copy_page(long *dst, long *src) { + long z = 0; int n; - for (n = PAGE_SIZE / sizeof(long); n; n--) + for (n = PAGE_SIZE / sizeof(long); n; n--) { + z |= *src; *dst++ = *src++; + } + return !z; } /** @@ -1439,17 +1454,21 @@ static inline void do_copy_page(long *dst, long *src) * Check if the page we are going to copy is marked as present in the kernel * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present() - * always returns 'true'. + * always returns 'true'. Returns true if the page was entirely composed of + * zeros, otherwise it will return false. 
*/ -static void safe_copy_page(void *dst, struct page *s_page) +static bool safe_copy_page(void *dst, struct page *s_page) { + bool zeros_only; + if (kernel_page_present(s_page)) { - do_copy_page(dst, page_address(s_page)); + zeros_only = do_copy_page(dst, page_address(s_page)); } else { hibernate_map_page(s_page); - do_copy_page(dst, page_address(s_page)); + zeros_only = do_copy_page(dst, page_address(s_page)); hibernate_unmap_page(s_page); } + return zeros_only; } #ifdef CONFIG_HIGHMEM @@ -1459,17 +1478,18 @@ static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); } -static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { struct page *s_page, *d_page; void *src, *dst; + bool zeros_only; s_page = pfn_to_page(src_pfn); d_page = pfn_to_page(dst_pfn); if (PageHighMem(s_page)) { src = kmap_atomic(s_page); dst = kmap_atomic(d_page); - do_copy_page(dst, src); + zeros_only = do_copy_page(dst, src); kunmap_atomic(dst); kunmap_atomic(src); } else { @@ -1478,30 +1498,39 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) * The page pointed to by src may contain some kernel * data modified by kmap_atomic() */ - safe_copy_page(buffer, s_page); + zeros_only = safe_copy_page(buffer, s_page); dst = kmap_atomic(d_page); copy_page(dst, buffer); kunmap_atomic(dst); } else { - safe_copy_page(page_address(d_page), s_page); + zeros_only = safe_copy_page(page_address(d_page), s_page); } } + return zeros_only; } #else #define page_is_saveable(zone, pfn) saveable_page(zone, pfn) -static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static inline int copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { - safe_copy_page(page_address(pfn_to_page(dst_pfn)), + return safe_copy_page(page_address(pfn_to_page(dst_pfn)), pfn_to_page(src_pfn)); } #endif /* 
CONFIG_HIGHMEM */ -static void copy_data_pages(struct memory_bitmap *copy_bm, - struct memory_bitmap *orig_bm) +/* + * Copy data pages will copy all pages into pages pulled from the copy_bm. + * If a page was entirely filled with zeros it will be marked in the zero_bm. + * + * Returns the number of pages copied. + */ +static unsigned long copy_data_pages(struct memory_bitmap *copy_bm, + struct memory_bitmap *orig_bm, + struct memory_bitmap *zero_bm) { + unsigned long copied_pages = 0; struct zone *zone; - unsigned long pfn; + unsigned long pfn, copy_pfn; for_each_populated_zone(zone) { unsigned long max_zone_pfn; @@ -1514,18 +1543,29 @@ static void copy_data_pages(struct memory_bitmap *copy_bm, } memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); + copy_pfn = memory_bm_next_pfn(copy_bm); for(;;) { pfn = memory_bm_next_pfn(orig_bm); if (unlikely(pfn == BM_END_OF_MAP)) break; - copy_data_page(memory_bm_next_pfn(copy_bm), pfn); + if (copy_data_page(copy_pfn, pfn)) { + memory_bm_set_bit(zero_bm, pfn); + /* Use this copy_pfn for a page that is not full of zeros */ + continue; + } + copied_pages++; + copy_pfn = memory_bm_next_pfn(copy_bm); } + return copied_pages; } /* Total number of image pages */ static unsigned int nr_copy_pages; /* Number of pages needed for saving the original pfns of the image pages */ static unsigned int nr_meta_pages; +/* Number of zero pages */ +static unsigned int nr_zero_pages; + /* * Numbers of normal and highmem page frames allocated for hibernation image * before suspending devices. @@ -1546,6 +1586,9 @@ static struct memory_bitmap orig_bm; */ static struct memory_bitmap copy_bm; +/* Memory bitmap which tracks which saveable pages were zero filled. */ +static struct memory_bitmap zero_bm; + /** * swsusp_free - Free pages allocated for hibernation image. 
* @@ -1590,6 +1633,7 @@ loop: out: nr_copy_pages = 0; nr_meta_pages = 0; + nr_zero_pages = 0; restore_pblist = NULL; buffer = NULL; alloc_normal = 0; @@ -1808,8 +1852,15 @@ int hibernate_preallocate_memory(void) goto err_out; } + error = memory_bm_create(&zero_bm, GFP_IMAGE, PG_ANY); + if (error) { + pr_err("Cannot allocate zero bitmap\n"); + goto err_out; + } + alloc_normal = 0; alloc_highmem = 0; + nr_zero_pages = 0; /* Count the number of saveable data pages. */ save_highmem = count_highmem_pages(); @@ -2089,19 +2140,19 @@ asmlinkage __visible int swsusp_save(void) * Kill them. */ drain_local_pages(NULL); - copy_data_pages(&copy_bm, &orig_bm); + nr_copy_pages = copy_data_pages(&copy_bm, &orig_bm, &zero_bm); /* * End of critical section. From now on, we can write to memory, * but we should not touch disk. This specially means we must _not_ * touch swap space! Except we must write out our image of course. */ - nr_pages += nr_highmem; - nr_copy_pages = nr_pages; + /* We don't actually copy the zero pages */ + nr_zero_pages = nr_pages - nr_copy_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - pr_info("Image created (%d pages copied)\n", nr_pages); + pr_info("Image created (%d pages copied, %d zero pages)\n", nr_copy_pages, nr_zero_pages); return 0; } @@ -2146,15 +2197,22 @@ static int init_header(struct swsusp_info *info) return init_header_complete(info); } +#define ENCODED_PFN_ZERO_FLAG ((unsigned long)1 << (BITS_PER_LONG - 1)) +#define ENCODED_PFN_MASK (~ENCODED_PFN_ZERO_FLAG) + /** * pack_pfns - Prepare PFNs for saving. * @bm: Memory bitmap. * @buf: Memory buffer to store the PFNs in. + * @zero_bm: Memory bitmap containing PFNs of zero pages. * * PFNs corresponding to set bits in @bm are stored in the area of memory - * pointed to by @buf (1 page at a time). 
Pages which were filled with only + * zeros will have the highest bit set in the packed format to distinguish + * them from PFNs which will be contained in the image file. */ -static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) +static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) { int j; @@ -2162,6 +2220,8 @@ static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) buf[j] = memory_bm_next_pfn(bm); if (unlikely(buf[j] == BM_END_OF_MAP)) break; + if (memory_bm_test_bit(zero_bm, buf[j])) + buf[j] |= ENCODED_PFN_ZERO_FLAG; } } @@ -2203,7 +2263,7 @@ int snapshot_read_next(struct snapshot_handle *handle) memory_bm_position_reset(&copy_bm); } else if (handle->cur <= nr_meta_pages) { clear_page(buffer); - pack_pfns(buffer, &orig_bm); + pack_pfns(buffer, &orig_bm, &zero_bm); } else { struct page *page; @@ -2299,24 +2359,35 @@ static int load_header(struct swsusp_info *info) * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap. * @bm: Memory bitmap. * @buf: Area of memory containing the PFNs. + * @zero_bm: Memory bitmap with the zero PFNs marked. * * For each element of the array pointed to by @buf (1 page at a time), set the - * corresponding bit in @bm. + * corresponding bit in @bm. If the page was originally populated with only + * zeros then a corresponding bit will also be set in @zero_bm. 
*/ -static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) +static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) { + unsigned long decoded_pfn; + bool zero; int j; for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { if (unlikely(buf[j] == BM_END_OF_MAP)) break; - if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) { - memory_bm_set_bit(bm, buf[j]); + zero = !!(buf[j] & ENCODED_PFN_ZERO_FLAG); + decoded_pfn = buf[j] & ENCODED_PFN_MASK; + if (pfn_valid(decoded_pfn) && memory_bm_pfn_present(bm, decoded_pfn)) { + memory_bm_set_bit(bm, decoded_pfn); + if (zero) { + memory_bm_set_bit(zero_bm, decoded_pfn); + nr_zero_pages++; + } } else { - if (!pfn_valid(buf[j])) + if (!pfn_valid(decoded_pfn)) pr_err(FW_BUG "Memory map mismatch at 0x%llx after hibernation\n", - (unsigned long long)PFN_PHYS(buf[j])); + (unsigned long long)PFN_PHYS(decoded_pfn)); return -EFAULT; } } @@ -2538,6 +2609,7 @@ static inline void free_highmem_data(void) {} * prepare_image - Make room for loading hibernation image. * @new_bm: Uninitialized memory bitmap structure. * @bm: Memory bitmap with unsafe pages marked. + * @zero_bm: Memory bitmap containing the zero pages. * * Use @bm to mark the pages that will be overwritten in the process of * restoring the system memory state from the suspend image ("unsafe" pages) @@ -2548,10 +2620,15 @@ static inline void free_highmem_data(void) {} * pages will be used for just yet. Instead, we mark them all as allocated and * create a lists of "safe" pages to be used later. On systems with high * memory a list of "safe" highmem pages is created too. + * + * Because it was not known which pages were unsafe when @zero_bm was created, + * make a copy of it and recreate it within safe pages. 
*/ -static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) +static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) { unsigned int nr_pages, nr_highmem; + struct memory_bitmap tmp; struct linked_page *lp; int error; @@ -2568,6 +2645,24 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) duplicate_memory_bitmap(new_bm, bm); memory_bm_free(bm, PG_UNSAFE_KEEP); + + /* Make a copy of zero_bm so it can be created in safe pages */ + error = memory_bm_create(&tmp, GFP_ATOMIC, PG_ANY); + if (error) + goto Free; + + duplicate_memory_bitmap(&tmp, zero_bm); + memory_bm_free(zero_bm, PG_UNSAFE_KEEP); + + /* Recreate zero_bm in safe pages */ + error = memory_bm_create(zero_bm, GFP_ATOMIC, PG_SAFE); + if (error) + goto Free; + + duplicate_memory_bitmap(zero_bm, &tmp); + memory_bm_free(&tmp, PG_UNSAFE_KEEP); + /* At this point zero_bm is in safe pages and it can be used for restoring. */ + if (nr_highmem > 0) { error = prepare_highmem_image(bm, &nr_highmem); if (error) @@ -2582,7 +2677,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) * * nr_copy_pages cannot be less than allocated_unsafe_pages too. 
*/ - nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; + nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages; nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); while (nr_pages > 0) { lp = get_image_page(GFP_ATOMIC, PG_SAFE); @@ -2595,7 +2690,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) nr_pages--; } /* Preallocate memory for the image */ - nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; + nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages; while (nr_pages > 0) { lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); if (!lp) { @@ -2683,8 +2778,9 @@ int snapshot_write_next(struct snapshot_handle *handle) static struct chain_allocator ca; int error = 0; +next: /* Check if we have already loaded the entire image */ - if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) return 0; handle->sync_read = 1; @@ -2709,19 +2805,26 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; + error = memory_bm_create(&zero_bm, GFP_ATOMIC, PG_ANY); + if (error) + return error; + + nr_zero_pages = 0; + hibernate_restore_protection_begin(); } else if (handle->cur <= nr_meta_pages + 1) { - error = unpack_orig_pfns(buffer, &copy_bm); + error = unpack_orig_pfns(buffer, &copy_bm, &zero_bm); if (error) return error; if (handle->cur == nr_meta_pages + 1) { - error = prepare_image(&orig_bm, &copy_bm); + error = prepare_image(&orig_bm, &copy_bm, &zero_bm); if (error) return error; chain_init(&ca, GFP_ATOMIC, PG_SAFE); memory_bm_position_reset(&orig_bm); + memory_bm_position_reset(&zero_bm); restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); handle->sync_read = 0; @@ -2738,6 +2841,14 @@ int snapshot_write_next(struct snapshot_handle *handle) handle->sync_read = 0; } handle->cur++; + + /* Zero pages were not included in the image, 
memset it and move on. */ + if (handle->cur > nr_meta_pages + 1 && + memory_bm_test_bit(&zero_bm, memory_bm_get_current(&orig_bm))) { + memset(handle->buffer, 0, PAGE_SIZE); + goto next; + } + return PAGE_SIZE; } @@ -2754,7 +2865,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle) copy_last_highmem_page(); hibernate_restore_protect_page(handle->buffer); /* Do that only if we have loaded the image entirely */ - if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) { memory_bm_recycle(&orig_bm); free_highmem_data(); } @@ -2763,7 +2874,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle) int snapshot_image_loaded(struct snapshot_handle *handle) { return !(!nr_copy_pages || !last_highmem_page_copied() || - handle->cur <= nr_meta_pages + nr_copy_pages); + handle->cur <= nr_meta_pages + nr_copy_pages + nr_zero_pages); } #ifdef CONFIG_HIGHMEM diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index 59bfa05dec5d..dc531805a570 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -53,7 +53,7 @@ DESTDIR ?= VERSION:= $(shell ./utils/version-gen.sh) LIB_MAJ= 0.0.1 -LIB_MIN= 0 +LIB_MIN= 1 PACKAGE = cpupower PACKAGE_BUGREPORT = linux-pm@vger.kernel.org diff --git a/tools/power/cpupower/lib/cpupower.c b/tools/power/cpupower/lib/cpupower.c index 3f7d0c0c5067..7a2ef691b20e 100644 --- a/tools/power/cpupower/lib/cpupower.c +++ b/tools/power/cpupower/lib/cpupower.c @@ -14,6 +14,13 @@ #include "cpupower.h" #include "cpupower_intern.h" +int is_valid_path(const char *path) +{ + if (access(path, F_OK) == -1) + return 0; + return 1; +} + unsigned int cpupower_read_sysfs(const char *path, char *buf, size_t buflen) { ssize_t numread; diff --git a/tools/power/cpupower/lib/cpupower_intern.h b/tools/power/cpupower/lib/cpupower_intern.h index ac1112b956ec..5fdb8620d41b 100644 --- 
a/tools/power/cpupower/lib/cpupower_intern.h +++ b/tools/power/cpupower/lib/cpupower_intern.h @@ -7,5 +7,6 @@ #define SYSFS_PATH_MAX 255 +int is_valid_path(const char *path); unsigned int cpupower_read_sysfs(const char *path, char *buf, size_t buflen); unsigned int cpupower_write_sysfs(const char *path, char *buf, size_t buflen); diff --git a/tools/power/cpupower/utils/cpuidle-set.c b/tools/power/cpupower/utils/cpuidle-set.c index 46158928f9ad..a551d1d4ac51 100644 --- a/tools/power/cpupower/utils/cpuidle-set.c +++ b/tools/power/cpupower/utils/cpuidle-set.c @@ -41,14 +41,6 @@ int cmd_idle_set(int argc, char **argv) cont = 0; break; case 'd': - if (param) { - param = -1; - cont = 0; - break; - } - param = ret; - idlestate = atoi(optarg); - break; case 'e': if (param) { param = -1; @@ -56,7 +48,13 @@ int cmd_idle_set(int argc, char **argv) break; } param = ret; - idlestate = atoi(optarg); + strtol(optarg, &endptr, 10); + if (*endptr != '\0') { + printf(_("Bad value: %s, Integer expected\n"), optarg); + exit(EXIT_FAILURE); + } else { + idlestate = atoi(optarg); + } break; case 'D': if (param) { diff --git a/tools/power/cpupower/utils/cpupower-set.c b/tools/power/cpupower/utils/cpupower-set.c index 180d5ba877e6..0677b58374ab 100644 --- a/tools/power/cpupower/utils/cpupower-set.c +++ b/tools/power/cpupower/utils/cpupower-set.c @@ -18,6 +18,9 @@ static struct option set_opts[] = { {"perf-bias", required_argument, NULL, 'b'}, + {"epp", required_argument, NULL, 'e'}, + {"amd-pstate-mode", required_argument, NULL, 'm'}, + {"turbo-boost", required_argument, NULL, 't'}, { }, }; @@ -37,11 +40,15 @@ int cmd_set(int argc, char **argv) union { struct { int perf_bias:1; + int epp:1; + int mode:1; + int turbo_boost:1; }; int params; } params; - int perf_bias = 0; + int perf_bias = 0, turbo_boost = 1; int ret = 0; + char epp[30], mode[20]; ret = uname(&uts); if (!ret && (!strcmp(uts.machine, "ppc64le") || @@ -55,7 +62,7 @@ int cmd_set(int argc, char **argv) params.params = 0; /* 
parameter parsing */ - while ((ret = getopt_long(argc, argv, "b:", + while ((ret = getopt_long(argc, argv, "b:e:m:", set_opts, NULL)) != -1) { switch (ret) { case 'b': @@ -69,6 +76,38 @@ int cmd_set(int argc, char **argv) } params.perf_bias = 1; break; + case 'e': + if (params.epp) + print_wrong_arg_exit(); + if (sscanf(optarg, "%29s", epp) != 1) { + print_wrong_arg_exit(); + return -EINVAL; + } + params.epp = 1; + break; + case 'm': + if (cpupower_cpu_info.vendor != X86_VENDOR_AMD) + print_wrong_arg_exit(); + if (params.mode) + print_wrong_arg_exit(); + if (sscanf(optarg, "%19s", mode) != 1) { + print_wrong_arg_exit(); + return -EINVAL; + } + params.mode = 1; + break; + case 't': + if (params.turbo_boost) + print_wrong_arg_exit(); + turbo_boost = atoi(optarg); + if (turbo_boost < 0 || turbo_boost > 1) { + printf("--turbo-boost param out of range [0-1]\n"); + print_wrong_arg_exit(); + } + params.turbo_boost = 1; + break; + + default: print_wrong_arg_exit(); } @@ -77,6 +116,18 @@ int cmd_set(int argc, char **argv) if (!params.params) print_wrong_arg_exit(); + if (params.mode) { + ret = cpupower_set_amd_pstate_mode(mode); + if (ret) + fprintf(stderr, "Error setting mode\n"); + } + + if (params.turbo_boost) { + ret = cpupower_set_turbo_boost(turbo_boost); + if (ret) + fprintf(stderr, "Error setting turbo-boost\n"); + } + /* Default is: set all CPUs */ if (bitmask_isallclear(cpus_chosen)) bitmask_setall(cpus_chosen); @@ -102,6 +153,16 @@ int cmd_set(int argc, char **argv) break; } } + + if (params.epp) { + ret = cpupower_set_epp(cpu, epp); + if (ret) { + fprintf(stderr, + "Error setting epp value on CPU %d\n", cpu); + break; + } + } + } return ret; } diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index 96e4bede078b..95749b8ee475 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -116,6 +116,10 @@ extern int cpupower_intel_set_perf_bias(unsigned int 
cpu, unsigned int val); extern int cpupower_intel_get_perf_bias(unsigned int cpu); extern unsigned long long msr_intel_get_turbo_ratio(unsigned int cpu); +extern int cpupower_set_epp(unsigned int cpu, char *epp); +extern int cpupower_set_amd_pstate_mode(char *mode); +extern int cpupower_set_turbo_boost(int turbo_boost); + /* Read/Write msr ****************************/ /* PCI stuff ****************************/ @@ -173,6 +177,13 @@ static inline int cpupower_intel_get_perf_bias(unsigned int cpu) static inline unsigned long long msr_intel_get_turbo_ratio(unsigned int cpu) { return 0; }; +static inline int cpupower_set_epp(unsigned int cpu, char *epp) +{ return -1; }; +static inline int cpupower_set_amd_pstate_mode(char *mode) +{ return -1; }; +static inline int cpupower_set_turbo_boost(int turbo_boost) +{ return -1; }; + /* Read/Write msr ****************************/ static inline int cpufreq_has_boost_support(unsigned int cpu, int *support, diff --git a/tools/power/cpupower/utils/helpers/misc.c b/tools/power/cpupower/utils/helpers/misc.c index 9547b29254a7..76e461ff4f74 100644 --- a/tools/power/cpupower/utils/helpers/misc.c +++ b/tools/power/cpupower/utils/helpers/misc.c @@ -87,6 +87,61 @@ int cpupower_intel_set_perf_bias(unsigned int cpu, unsigned int val) return 0; } +int cpupower_set_epp(unsigned int cpu, char *epp) +{ + char path[SYSFS_PATH_MAX]; + char linebuf[30] = {}; + + snprintf(path, sizeof(path), + PATH_TO_CPU "cpu%u/cpufreq/energy_performance_preference", cpu); + + if (!is_valid_path(path)) + return -1; + + snprintf(linebuf, sizeof(linebuf), "%s", epp); + + if (cpupower_write_sysfs(path, linebuf, 30) <= 0) + return -1; + + return 0; +} + +int cpupower_set_amd_pstate_mode(char *mode) +{ + char path[SYSFS_PATH_MAX]; + char linebuf[20] = {}; + + snprintf(path, sizeof(path), PATH_TO_CPU "amd_pstate/status"); + + if (!is_valid_path(path)) + return -1; + + snprintf(linebuf, sizeof(linebuf), "%s\n", mode); + + if (cpupower_write_sysfs(path, linebuf, 20) <= 0) 
+ return -1; + + return 0; +} + +int cpupower_set_turbo_boost(int turbo_boost) +{ + char path[SYSFS_PATH_MAX]; + char linebuf[2] = {}; + + snprintf(path, sizeof(path), PATH_TO_CPU "cpufreq/boost"); + + if (!is_valid_path(path)) + return -1; + + snprintf(linebuf, sizeof(linebuf), "%d", turbo_boost); + + if (cpupower_write_sysfs(path, linebuf, 2) <= 0) + return -1; + + return 0; +} + bool cpupower_amd_pstate_enabled(void) { char *driver = cpufreq_get_driver(0); @@ -95,7 +150,7 @@ bool cpupower_amd_pstate_enabled(void) if (!driver) return ret; - if (!strcmp(driver, "amd-pstate")) + if (!strncmp(driver, "amd", 3)) ret = true; cpufreq_put_driver(driver); |