diff options
authorLinus Torvalds <>2018-05-11 19:49:02 +0300
committerLinus Torvalds <>2018-05-11 19:49:02 +0300
commit41e3e1082367221e99a59c8968a583706123ae04 (patch)
parente03dc5d3d427ed0114258ba00b16277e705e2c0d (diff)
parentef050374e1eedec45bd260e0ac9eb98f699267d2 (diff)
Merge tag 'pm-4.17-rc5' of git://
Pull power management fixes from Rafael Wysocki: "These fix two PCI power management regressions from the 4.13 cycle and one cpufreq schedutil governor bug introduced during the 4.12 cycle, drop a stale comment from the schedutil code and fix two mistakes in docs. Specifics: - Restore device_may_wakeup() check in pci_enable_wake() removed inadvertently during the 4.13 cycle to prevent systems from drawing excessive power when suspended or off, among other things (Rafael Wysocki). - Fix pci_dev_run_wake() to properly handle devices that only can signal PME# when in the D3cold power state (Kai Heng Feng). - Fix the schedutil cpufreq governor to avoid using UINT_MAX as the new CPU frequency in some cases due to a missing check (Rafael Wysocki). - Remove a stale comment regarding worker kthreads from the schedutil cpufreq governor (Juri Lelli). - Fix a copy-paste mistake in the intel_pstate driver documentation (Juri Lelli). - Fix a typo in the system sleep states documentation (Jonathan Neuschäfer)" * tag 'pm-4.17-rc5' of git:// PCI / PM: Check device_may_wakeup() in pci_enable_wake() PCI / PM: Always check PME wakeup capability for runtime wakeup support cpufreq: schedutil: Avoid using invalid next_freq cpufreq: schedutil: remove stale comment PM: docs: intel_pstate: fix Active Mode w/o HWP paragraph PM: docs: sleep-states: Fix a typo ("includig")
4 files changed, 31 insertions, 26 deletions
diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst
index d2b6fda3d67b..ab2fe0eda1d7 100644
--- a/Documentation/admin-guide/pm/intel_pstate.rst
+++ b/Documentation/admin-guide/pm/intel_pstate.rst
@@ -145,7 +145,7 @@ feature enabled.]
In this mode ``intel_pstate`` registers utilization update callbacks with the
CPU scheduler in order to run a P-state selection algorithm, either
-``powersave`` or ``performance``, depending on the ``scaling_cur_freq`` policy
+``powersave`` or ``performance``, depending on the ``scaling_governor`` policy
setting in ``sysfs``. The current CPU frequency information to be made
available from the ``scaling_cur_freq`` policy attribute in ``sysfs`` is
periodically updated by those utilization update callbacks too.
diff --git a/Documentation/admin-guide/pm/sleep-states.rst b/Documentation/admin-guide/pm/sleep-states.rst
index 1e5c0f00cb2f..dbf5acd49f35 100644
--- a/Documentation/admin-guide/pm/sleep-states.rst
+++ b/Documentation/admin-guide/pm/sleep-states.rst
@@ -15,7 +15,7 @@ Sleep States That Can Be Supported
Depending on its configuration and the capabilities of the platform it runs on,
-the Linux kernel can support up to four system sleep states, includig
+the Linux kernel can support up to four system sleep states, including
hibernation and up to three variants of system suspend. The sleep states that
can be supported by the kernel are listed below.
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index a04197ce767d..dbfe7c4f3776 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1910,7 +1910,7 @@ void pci_pme_active(struct pci_dev *dev, bool enable)
- * pci_enable_wake - enable PCI device as wakeup event source
+ * __pci_enable_wake - enable PCI device as wakeup event source
* @dev: PCI device affected
* @state: PCI state from which device will issue wakeup events
* @enable: True to enable event generation; false to disable
@@ -1928,7 +1928,7 @@ EXPORT_SYMBOL(pci_pme_active);
* Error code depending on the platform is returned if both the platform and
* the native mechanism fail to enable the generation of wake-up events
-int pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable)
+static int __pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable)
int ret = 0;
@@ -1969,6 +1969,23 @@ int pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable)
return ret;
+ * pci_enable_wake - change wakeup settings for a PCI device
+ * @pci_dev: Target device
+ * @state: PCI state from which device will issue wakeup events
+ * @enable: Whether or not to enable event generation
+ *
+ * If @enable is set, check device_may_wakeup() for the device before calling
+ * __pci_enable_wake() for it.
+ */
+int pci_enable_wake(struct pci_dev *pci_dev, pci_power_t state, bool enable)
+ if (enable && !device_may_wakeup(&pci_dev->dev))
+ return -EINVAL;
+ return __pci_enable_wake(pci_dev, state, enable);
@@ -1981,9 +1998,9 @@ EXPORT_SYMBOL(pci_enable_wake);
* should not be called twice in a row to enable wake-up due to PCI PM vs ACPI
* ordering constraints.
- * This function only returns error code if the device is not capable of
- * generating PME# from both D3_hot and D3_cold, and the platform is unable to
- * enable wake-up power for it.
+ * This function only returns error code if the device is not allowed to wake
+ * up the system from sleep or it is not capable of generating PME# from both
+ * D3_hot and D3_cold and the platform is unable to enable wake-up power for it.
int pci_wake_from_d3(struct pci_dev *dev, bool enable)
@@ -2114,7 +2131,7 @@ int pci_finish_runtime_suspend(struct pci_dev *dev)
dev->runtime_d3cold = target_state == PCI_D3cold;
- pci_enable_wake(dev, target_state, pci_dev_run_wake(dev));
+ __pci_enable_wake(dev, target_state, pci_dev_run_wake(dev));
error = pci_set_power_state(dev, target_state);
@@ -2138,16 +2155,16 @@ bool pci_dev_run_wake(struct pci_dev *dev)
struct pci_bus *bus = dev->bus;
- if (device_can_wakeup(&dev->dev))
- return true;
if (!dev->pme_support)
return false;
/* PME-capable in principle, but not from the target power state */
- if (!pci_pme_capable(dev, pci_target_state(dev, false)))
+ if (!pci_pme_capable(dev, pci_target_state(dev, true)))
return false;
+ if (device_can_wakeup(&dev->dev))
+ return true;
while (bus->parent) {
struct pci_dev *bridge = bus->self;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index d2c6083304b4..e13df951aca7 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -305,7 +305,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
- if (busy && next_f < sg_policy->next_freq) {
+ if (busy && next_f < sg_policy->next_freq &&
+ sg_policy->next_freq != UINT_MAX) {
next_f = sg_policy->next_freq;
/* Reset cached freq as next_freq has changed */
@@ -396,19 +397,6 @@ static void sugov_irq_work(struct irq_work *irq_work)
sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
- /*
- * For RT tasks, the schedutil governor shoots the frequency to maximum.
- * Special care must be taken to ensure that this kthread doesn't result
- * in the same behavior.
- *
- * This is (mostly) guaranteed by the work_in_progress flag. The flag is
- * updated only at the end of the sugov_work() function and before that
- * the schedutil governor rejects all other frequency scaling requests.
- *
- * There is a very rare case though, where the RT thread yields right
- * after the work_in_progress flag is cleared. The effects of that are
- * neglected for now.
- */
kthread_queue_work(&sg_policy->worker, &sg_policy->work);