summaryrefslogtreecommitdiff
path: root/arch/x86/kernel/smpboot.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-06-27 00:45:53 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2023-06-27 00:45:53 +0300
commit88afbb21d4b36fee6acaa167641f9f0fc122f01b (patch)
tree09666f26faa124138506c7bf4970bdb43bbe8fdc /arch/x86/kernel/smpboot.c
parentcd336f6562d3d7646a9cf071b902db200a1dd77b (diff)
parent45e34c8af58f23db4474e2bfe79183efec09a18b (diff)
downloadlinux-88afbb21d4b36fee6acaa167641f9f0fc122f01b.tar.xz
Merge tag 'x86-core-2023-06-26' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 core updates from Thomas Gleixner: "A set of fixes for kexec(), reboot and shutdown issues: - Ensure that the WBINVD in stop_this_cpu() has been completed before the control CPU proceedes. stop_this_cpu() is used for kexec(), reboot and shutdown to park the APs in a HLT loop. The control CPU sends an IPI to the APs and waits for their CPU online bits to be cleared. Once they all are marked "offline" it proceeds. But stop_this_cpu() clears the CPU online bit before issuing WBINVD, which means there is no guarantee that the AP has reached the HLT loop. This was reported to cause intermittent reboot/shutdown failures due to some dubious interaction with the firmware. This is not only a problem of WBINVD. The code to actually "stop" the CPU which runs between clearing the online bit and reaching the HLT loop can cause large enough delays on its own (think virtualization). That's especially dangerous for kexec() as kexec() expects that all APs are in a safe state and not executing code while the boot CPU jumps to the new kernel. There are more issues vs kexec() which are addressed separately. Cure this by implementing an explicit synchronization point right before the AP reaches HLT. This guarantees that the AP has completed the full stop proceedure. - Fix the condition for WBINVD in stop_this_cpu(). The WBINVD in stop_this_cpu() is required for ensuring that when switching to or from memory encryption no dirty data is left in the cache lines which might cause a write back in the wrong more later. This checks CPUID directly because the feature bit might have been cleared due to a command line option. But that CPUID check accesses leaf 0x8000001f::EAX unconditionally. Intel CPUs return the content of the highest supported leaf when a non-existing leaf is read, while AMD CPUs return all zeros for unsupported leafs. So the result of the test on Intel CPUs is lottery and on AMD its just correct by chance. While harmless it's incorrect and causes the conditional wbinvd() to be issued where not required, which caused the above issue to be unearthed. - Make kexec() robust against AP code execution Ashok observed triple faults when doing kexec() on a system which had been booted with "nosmt". It turned out that the SMT siblings which had been brought up partially are parked in mwait_play_dead() to enable power savings. mwait_play_dead() is monitoring the thread flags of the AP's idle task, which has been chosen as it's unlikely to be written to. But kexec() can overwrite the previous kernel text and data including page tables etc. When it overwrites the cache lines monitored by an AP that AP resumes execution after the MWAIT on eventually overwritten text, stack and page tables, which obviously might end up in a triple fault easily. Make this more robust in several steps: 1) Use an explicit per CPU cache line for monitoring. 2) Write a command to these cache lines to kick APs out of MWAIT before proceeding with kexec(), shutdown or reboot. The APs confirm the wakeup by writing status back and then enter a HLT loop. 3) If the system uses INIT/INIT/STARTUP for AP bringup, park the APs in INIT state. HLT is not a guarantee that an AP won't wake up and resume execution. HLT is woken up by NMI and SMI. SMI puts the CPU back into HLT (+/- firmware bugs), but NMI is delivered to the CPU which executes the NMI handler. Same issue as the MWAIT scenario described above. Sending an INIT/INIT sequence to the APs puts them into wait for STARTUP state, which is safe against NMI. There is still an issue remaining which can't be fixed: #MCE If the AP sits in HLT and receives a broadcast #MCE it will try to handle it with the obvious consequences. INIT/INIT clears CR4.MCE in the AP which will cause a broadcast #MCE to shut down the machine. So there is a choice between fire (HLT) and frying pan (INIT). Frying pan has been chosen as it's at least preventing the NMI issue. On systems which are not using INIT/INIT/STARTUP there is not much which can be done right now, but at least the obvious and easy to trigger MWAIT issue has been addressed" * tag 'x86-core-2023-06-26' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/smp: Put CPUs into INIT on shutdown if possible x86/smp: Split sending INIT IPI out into a helper function x86/smp: Cure kexec() vs. mwait_play_dead() breakage x86/smp: Use dedicated cache-line for mwait_play_dead() x86/smp: Remove pointless wmb()s from native_stop_other_cpus() x86/smp: Dont access non-existing CPUID leaf x86/smp: Make stop_other_cpus() more robust
Diffstat (limited to 'arch/x86/kernel/smpboot.c')
-rw-r--r--arch/x86/kernel/smpboot.c151
1 files changed, 113 insertions, 38 deletions
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8de80608fd01..8779a7ed3e87 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -53,6 +53,7 @@
#include <linux/tboot.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
+#include <linux/kexec.h>
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
@@ -109,6 +110,20 @@ struct cpumask __cpu_primary_thread_mask __read_mostly;
/* Representing CPUs for which sibling maps can be computed */
static cpumask_var_t cpu_sibling_setup_mask;
+struct mwait_cpu_dead {
+ unsigned int control;
+ unsigned int status;
+};
+
+#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF
+#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD
+
+/*
+ * Cache line aligned data for mwait_play_dead(). Separate on purpose so
+ * that it's unlikely to be touched by other CPUs.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
+
/* Logical package management. We might want to allocate that dynamically */
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
@@ -166,6 +181,10 @@ static void ap_starting(void)
{
int cpuid = smp_processor_id();
+ /* Mop up eventual mwait_play_dead() wreckage */
+ this_cpu_write(mwait_cpu_dead.status, 0);
+ this_cpu_write(mwait_cpu_dead.control, 0);
+
/*
* If woken up by an INIT in an 82489DX configuration the alive
* synchronization guarantees that the CPU does not reach this
@@ -828,47 +847,40 @@ static void __init smp_quirk_init_udelay(void)
/*
* Wake up AP by INIT, INIT, STARTUP sequence.
*/
-static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
+static void send_init_sequence(int phys_apicid)
{
- unsigned long send_status = 0, accept_status = 0;
- int maxlvt, num_starts, j;
-
- preempt_disable();
- maxlvt = lapic_get_maxlvt();
+ int maxlvt = lapic_get_maxlvt();
- /*
- * Be paranoid about clearing APIC errors.
- */
+ /* Be paranoid about clearing APIC errors. */
if (APIC_INTEGRATED(boot_cpu_apic_version)) {
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ /* Due to the Pentium erratum 3AP. */
+ if (maxlvt > 3)
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
- pr_debug("Asserting INIT\n");
-
- /*
- * Turn INIT on target chip
- */
- /*
- * Send IPI
- */
- apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
- phys_apicid);
-
- pr_debug("Waiting for send to finish...\n");
- send_status = safe_apic_wait_icr_idle();
+ /* Assert INIT on the target CPU */
+ apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid);
+ safe_apic_wait_icr_idle();
udelay(init_udelay);
- pr_debug("Deasserting INIT\n");
-
- /* Target chip */
- /* Send IPI */
+ /* Deassert INIT on the target CPU */
apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
+ safe_apic_wait_icr_idle();
+}
- pr_debug("Waiting for send to finish...\n");
- send_status = safe_apic_wait_icr_idle();
+/*
+ * Wake up AP by INIT, INIT, STARTUP sequence.
+ */
+static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
+{
+ unsigned long send_status = 0, accept_status = 0;
+ int num_starts, j, maxlvt;
+
+ preempt_disable();
+ maxlvt = lapic_get_maxlvt();
+ send_init_sequence(phys_apicid);
mb();
@@ -1330,6 +1342,25 @@ void arch_thaw_secondary_cpus_end(void)
cache_aps_init();
}
+bool smp_park_other_cpus_in_init(void)
+{
+ unsigned int cpu, this_cpu = smp_processor_id();
+ unsigned int apicid;
+
+ if (apic->wakeup_secondary_cpu_64 || apic->wakeup_secondary_cpu)
+ return false;
+
+ for_each_present_cpu(cpu) {
+ if (cpu == this_cpu)
+ continue;
+ apicid = apic->cpu_present_to_apicid(cpu);
+ if (apicid == BAD_APICID)
+ continue;
+ send_init_sequence(apicid);
+ }
+ return true;
+}
+
/*
* Early setup to make printk work.
*/
@@ -1595,10 +1626,10 @@ void play_dead_common(void)
*/
static inline void mwait_play_dead(void)
{
+ struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
unsigned int eax, ebx, ecx, edx;
unsigned int highest_cstate = 0;
unsigned int highest_subcstate = 0;
- void *mwait_ptr;
int i;
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
@@ -1633,12 +1664,9 @@ static inline void mwait_play_dead(void)
(highest_subcstate - 1);
}
- /*
- * This should be a memory location in a cache line which is
- * unlikely to be touched by other processors. The actual
- * content is immaterial as it is not actually modified in any way.
- */
- mwait_ptr = &current_thread_info()->flags;
+ /* Set up state for the kexec() hack below */
+ md->status = CPUDEAD_MWAIT_WAIT;
+ md->control = CPUDEAD_MWAIT_WAIT;
wbinvd();
@@ -1651,11 +1679,58 @@ static inline void mwait_play_dead(void)
* case where we return around the loop.
*/
mb();
- clflush(mwait_ptr);
+ clflush(md);
mb();
- __monitor(mwait_ptr, 0, 0);
+ __monitor(md, 0, 0);
mb();
__mwait(eax, 0);
+
+ if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+ /*
+ * Kexec is about to happen. Don't go back into mwait() as
+ * the kexec kernel might overwrite text and data including
+ * page tables and stack. So mwait() would resume when the
+ * monitor cache line is written to and then the CPU goes
+ * south due to overwritten text, page tables and stack.
+ *
+ * Note: This does _NOT_ protect against a stray MCE, NMI,
+ * SMI. They will resume execution at the instruction
+ * following the HLT instruction and run into the problem
+ * which this is trying to prevent.
+ */
+ WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+ while(1)
+ native_halt();
+ }
+ }
+}
+
+/*
+ * Kick all "offline" CPUs out of mwait on kexec(). See comment in
+ * mwait_play_dead().
+ */
+void smp_kick_mwait_play_dead(void)
+{
+ u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
+ struct mwait_cpu_dead *md;
+ unsigned int cpu, i;
+
+ for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
+ md = per_cpu_ptr(&mwait_cpu_dead, cpu);
+
+ /* Does it sit in mwait_play_dead() ? */
+ if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
+ continue;
+
+ /* Wait up to 5ms */
+ for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
+ /* Bring it out of mwait */
+ WRITE_ONCE(md->control, newstate);
+ udelay(5);
+ }
+
+ if (READ_ONCE(md->status) != newstate)
+ pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
}
}