summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/acpi/sleep.c9
-rw-r--r--arch/x86/kernel/alternative.c470
-rw-r--r--arch/x86/kernel/amd_nb.c50
-rw-r--r--arch/x86/kernel/apic/apic.c40
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c318
-rw-r--r--arch/x86/kernel/callthunks.c4
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/cpu/bugs.c51
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c21
-rw-r--r--arch/x86/kernel/cpu/common.c132
-rw-r--r--arch/x86/kernel/cpu/cpu.h1
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c6
-rw-r--r--arch/x86/kernel/cpu/mce/core.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c171
-rw-r--r--arch/x86/kernel/cpu/topology.c5
-rw-r--r--arch/x86/kernel/dumpstack.c7
-rw-r--r--arch/x86/kernel/fpu/context.h2
-rw-r--r--arch/x86/kernel/fpu/core.c2
-rw-r--r--arch/x86/kernel/fpu/init.c8
-rw-r--r--arch/x86/kernel/head_32.S14
-rw-r--r--arch/x86/kernel/head_64.S103
-rw-r--r--arch/x86/kernel/irq.c7
-rw-r--r--arch/x86/kernel/process.c28
-rw-r--r--arch/x86/kernel/sev-shared.c103
-rw-r--r--arch/x86/kernel/sev.c258
-rw-r--r--arch/x86/kernel/signal.c4
-rw-r--r--arch/x86/kernel/smp.c107
-rw-r--r--arch/x86/kernel/smpboot.c665
-rw-r--r--arch/x86/kernel/topology.c98
-rw-r--r--arch/x86/kernel/tsc.c20
-rw-r--r--arch/x86/kernel/tsc_sync.c36
-rw-r--r--arch/x86/kernel/unwind_orc.c3
-rw-r--r--arch/x86/kernel/vmlinux.lds.S4
-rw-r--r--arch/x86/kernel/x86_init.c1
35 files changed, 1588 insertions, 1171 deletions
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 1328c221af30..6dfecb27b846 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -16,6 +16,7 @@
#include <asm/cacheflush.h>
#include <asm/realmode.h>
#include <asm/hypervisor.h>
+#include <asm/smp.h>
#include <linux/ftrace.h>
#include "../../realmode/rm/wakeup.h"
@@ -127,7 +128,13 @@ int x86_acpi_suspend_lowlevel(void)
* value is in the actual %rsp register.
*/
current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
- smpboot_control = smp_processor_id();
+ /*
+ * Ensure the CPU knows which one it is when it comes back, if
+ * it isn't in parallel mode and expected to work that out for
+ * itself.
+ */
+ if (!(smpboot_control & STARTUP_PARALLEL_MASK))
+ smpboot_control = smp_processor_id();
#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0L;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index f615e0cb6d93..a7e1ec50ad29 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -37,11 +37,23 @@ EXPORT_SYMBOL_GPL(alternatives_patched);
#define MAX_PATCH_LEN (255-1)
-static int __initdata_or_module debug_alternative;
+#define DA_ALL (~0)
+#define DA_ALT 0x01
+#define DA_RET 0x02
+#define DA_RETPOLINE 0x04
+#define DA_ENDBR 0x08
+#define DA_SMP 0x10
+
+static unsigned int __initdata_or_module debug_alternative;
static int __init debug_alt(char *str)
{
- debug_alternative = 1;
+ if (str && *str == '=')
+ str++;
+
+ if (!str || kstrtouint(str, 0, &debug_alternative))
+ debug_alternative = DA_ALL;
+
return 1;
}
__setup("debug-alternative", debug_alt);
@@ -55,15 +67,15 @@ static int __init setup_noreplace_smp(char *str)
}
__setup("noreplace-smp", setup_noreplace_smp);
-#define DPRINTK(fmt, args...) \
+#define DPRINTK(type, fmt, args...) \
do { \
- if (debug_alternative) \
+ if (debug_alternative & DA_##type) \
printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
} while (0)
-#define DUMP_BYTES(buf, len, fmt, args...) \
+#define DUMP_BYTES(type, buf, len, fmt, args...) \
do { \
- if (unlikely(debug_alternative)) { \
+ if (unlikely(debug_alternative & DA_##type)) { \
int j; \
\
if (!(len)) \
@@ -86,6 +98,11 @@ static const unsigned char x86nops[] =
BYTES_NOP6,
BYTES_NOP7,
BYTES_NOP8,
+#ifdef CONFIG_64BIT
+ BYTES_NOP9,
+ BYTES_NOP10,
+ BYTES_NOP11,
+#endif
};
const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
@@ -99,19 +116,44 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
x86nops + 1 + 2 + 3 + 4 + 5,
x86nops + 1 + 2 + 3 + 4 + 5 + 6,
x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+#ifdef CONFIG_64BIT
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
+#endif
};
-/* Use this to add nops to a buffer, then text_poke the whole buffer. */
-static void __init_or_module add_nops(void *insns, unsigned int len)
+/*
+ * Fill the buffer with a single effective instruction of size @len.
+ *
+ * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
+ * for every single-byte NOP, try to generate the maximally available NOP of
+ * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
+ * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
+ * *jump* over instead of executing long and daft NOPs.
+ */
+static void __init_or_module add_nop(u8 *instr, unsigned int len)
{
- while (len > 0) {
- unsigned int noplen = len;
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
- memcpy(insns, x86_nops[noplen], noplen);
- insns += noplen;
- len -= noplen;
+ u8 *target = instr + len;
+
+ if (!len)
+ return;
+
+ if (len <= ASM_NOP_MAX) {
+ memcpy(instr, x86_nops[len], len);
+ return;
}
+
+ if (len < 128) {
+ __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
+ instr += JMP8_INSN_SIZE;
+ } else {
+ __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
+ instr += JMP32_INSN_SIZE;
+ }
+
+ for (;instr < target; instr++)
+ *instr = INT3_INSN_OPCODE;
}
extern s32 __retpoline_sites[], __retpoline_sites_end[];
@@ -123,133 +165,223 @@ extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);
/*
- * Are we looking at a near JMP with a 1 or 4-byte displacement.
+ * Matches NOP and NOPL, not any of the other possible NOPs.
*/
-static inline bool is_jmp(const u8 opcode)
+static bool insn_is_nop(struct insn *insn)
{
- return opcode == 0xeb || opcode == 0xe9;
+ /* Anything NOP, but no REP NOP */
+ if (insn->opcode.bytes[0] == 0x90 &&
+ (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
+ return true;
+
+ /* NOPL */
+ if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
+ return true;
+
+ /* TODO: more nops */
+
+ return false;
}
-static void __init_or_module
-recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
+/*
+ * Find the offset of the first non-NOP instruction starting at @offset
+ * but no further than @len.
+ */
+static int skip_nops(u8 *instr, int offset, int len)
{
- u8 *next_rip, *tgt_rip;
- s32 n_dspl, o_dspl;
- int repl_len;
+ struct insn insn;
- if (a->replacementlen != 5)
- return;
+ for (; offset < len; offset += insn.length) {
+ if (insn_decode_kernel(&insn, &instr[offset]))
+ break;
- o_dspl = *(s32 *)(insn_buff + 1);
+ if (!insn_is_nop(&insn))
+ break;
+ }
- /* next_rip of the replacement JMP */
- next_rip = repl_insn + a->replacementlen;
- /* target rip of the replacement JMP */
- tgt_rip = next_rip + o_dspl;
- n_dspl = tgt_rip - orig_insn;
+ return offset;
+}
- DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
+/*
+ * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
+ * to the end of the NOP sequence into a single NOP.
+ */
+static bool __init_or_module
+__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
+{
+ int i = *next - insn->length;
- if (tgt_rip - orig_insn >= 0) {
- if (n_dspl - 2 <= 127)
- goto two_byte_jmp;
- else
- goto five_byte_jmp;
- /* negative offset */
- } else {
- if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
- goto two_byte_jmp;
- else
- goto five_byte_jmp;
+ switch (insn->opcode.bytes[0]) {
+ case JMP8_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ *prev = i;
+ *target = *next + insn->immediate.value;
+ return false;
}
-two_byte_jmp:
- n_dspl -= 2;
+ if (insn_is_nop(insn)) {
+ int nop = i;
- insn_buff[0] = 0xeb;
- insn_buff[1] = (s8)n_dspl;
- add_nops(insn_buff + 2, 3);
+ *next = skip_nops(instr, *next, len);
+ if (*target && *next == *target)
+ nop = *prev;
- repl_len = 2;
- goto done;
+ add_nop(instr + nop, *next - nop);
+ DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next);
+ return true;
+ }
+
+ *target = 0;
+ return false;
+}
-five_byte_jmp:
- n_dspl -= 5;
+/*
+ * "noinline" to cause control flow change and thus invalidate I$ and
+ * cause refetch after modification.
+ */
+static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
+{
+ int prev, target = 0;
- insn_buff[0] = 0xe9;
- *(s32 *)&insn_buff[1] = n_dspl;
+ for (int next, i = 0; i < len; i = next) {
+ struct insn insn;
- repl_len = 5;
+ if (insn_decode_kernel(&insn, &instr[i]))
+ return;
-done:
+ next = i + insn.length;
- DPRINTK("final displ: 0x%08x, JMP 0x%lx",
- n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+ __optimize_nops(instr, len, &insn, &next, &prev, &target);
+ }
}
/*
- * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
+ * In this context, "source" is where the instructions are placed in the
+ * section .altinstr_replacement, for example during kernel build by the
+ * toolchain.
+ * "Destination" is where the instructions are being patched in by this
+ * machinery.
*
- * @instr: instruction byte stream
- * @instrlen: length of the above
- * @off: offset within @instr where the first NOP has been detected
+ * The source offset is:
*
- * Return: number of NOPs found (and replaced).
+ * src_imm = target - src_next_ip (1)
+ *
+ * and the target offset is:
+ *
+ * dst_imm = target - dst_next_ip (2)
+ *
+ * so rework (1) as an expression for target like:
+ *
+ * target = src_imm + src_next_ip (1a)
+ *
+ * and substitute in (2) to get:
+ *
+ * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
+ *
+ * Now, since the instruction stream is 'identical' at src and dst (it
+ * is being copied after all) it can be stated that:
+ *
+ * src_next_ip = src + ip_offset
+ * dst_next_ip = dst + ip_offset (4)
+ *
+ * Substitute (4) in (3) and observe ip_offset being cancelled out to
+ * obtain:
+ *
+ * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
+ * = src_imm + src - dst + ip_offset - ip_offset
+ * = src_imm + src - dst (5)
+ *
+ * IOW, only the relative displacement of the code block matters.
*/
-static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
-{
- unsigned long flags;
- int i = off, nnops;
- while (i < instrlen) {
- if (instr[i] != 0x90)
- break;
+#define apply_reloc_n(n_, p_, d_) \
+ do { \
+ s32 v = *(s##n_ *)(p_); \
+ v += (d_); \
+ BUG_ON((v >> 31) != (v >> (n_-1))); \
+ *(s##n_ *)(p_) = (s##n_)v; \
+ } while (0)
+
- i++;
+static __always_inline
+void apply_reloc(int n, void *ptr, uintptr_t diff)
+{
+ switch (n) {
+ case 1: apply_reloc_n(8, ptr, diff); break;
+ case 2: apply_reloc_n(16, ptr, diff); break;
+ case 4: apply_reloc_n(32, ptr, diff); break;
+ default: BUG();
}
+}
- nnops = i - off;
+static __always_inline
+bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
+{
+ u8 *target = src + offset;
+ /*
+ * If the target is inside the patched block, it's relative to the
+ * block itself and does not need relocation.
+ */
+ return (target < src || target > src + src_len);
+}
- if (nnops <= 1)
- return nnops;
+static void __init_or_module noinline
+apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
+{
+ int prev, target = 0;
- local_irq_save(flags);
- add_nops(instr + off, nnops);
- local_irq_restore(flags);
+ for (int next, i = 0; i < len; i = next) {
+ struct insn insn;
- DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
+ if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
+ return;
- return nnops;
-}
+ next = i + insn.length;
-/*
- * "noinline" to cause control flow change and thus invalidate I$ and
- * cause refetch after modification.
- */
-static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
-{
- struct insn insn;
- int i = 0;
+ if (__optimize_nops(buf, len, &insn, &next, &prev, &target))
+ continue;
- /*
- * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
- * ones.
- */
- for (;;) {
- if (insn_decode_kernel(&insn, &instr[i]))
- return;
+ switch (insn.opcode.bytes[0]) {
+ case 0x0f:
+ if (insn.opcode.bytes[1] < 0x80 ||
+ insn.opcode.bytes[1] > 0x8f)
+ break;
- /*
- * See if this and any potentially following NOPs can be
- * optimized.
- */
- if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
- i += optimize_nops_range(instr, len, i);
- else
- i += insn.length;
+ fallthrough; /* Jcc.d32 */
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ case JMP8_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case CALL_INSN_OPCODE:
+ if (need_reloc(next + insn.immediate.value, src, src_len)) {
+ apply_reloc(insn.immediate.nbytes,
+ buf + i + insn_offset_immediate(&insn),
+ src - dest);
+ }
- if (i >= len)
- return;
+ /*
+ * Where possible, convert JMP.d32 into JMP.d8.
+ */
+ if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
+ s32 imm = insn.immediate.value;
+ imm += src - dest;
+ imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
+ if ((imm >> 31) == (imm >> 7)) {
+ buf[i+0] = JMP8_INSN_OPCODE;
+ buf[i+1] = (s8)imm;
+
+ memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
+ }
+ }
+ break;
+ }
+
+ if (insn_rip_relative(&insn)) {
+ if (need_reloc(next + insn.displacement.value, src, src_len)) {
+ apply_reloc(insn.displacement.nbytes,
+ buf + i + insn_offset_displacement(&insn),
+ src - dest);
+ }
+ }
}
}
@@ -270,7 +402,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
u8 *instr, *replacement;
u8 insn_buff[MAX_PATCH_LEN];
- DPRINTK("alt table %px, -> %px", start, end);
+ DPRINTK(ALT, "alt table %px, -> %px", start, end);
/*
* The scan order should be from start to end. A later scanned
* alternative code can overwrite previously scanned alternative code.
@@ -294,47 +426,31 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* - feature not present but ALT_FLAG_NOT is set to mean,
* patch if feature is *NOT* present.
*/
- if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT))
- goto next;
+ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
+ optimize_nops(instr, a->instrlen);
+ continue;
+ }
- DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
+ DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
(a->flags & ALT_FLAG_NOT) ? "!" : "",
a->cpuid >> 5,
a->cpuid & 0x1f,
instr, instr, a->instrlen,
replacement, a->replacementlen);
- DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
- DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
-
memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
- /*
- * 0xe8 is a relative jump; fix the offset.
- *
- * Instruction length is checked before the opcode to avoid
- * accessing uninitialized bytes for zero-length replacements.
- */
- if (a->replacementlen == 5 && *insn_buff == 0xe8) {
- *(s32 *)(insn_buff + 1) += replacement - instr;
- DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
- *(s32 *)(insn_buff + 1),
- (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
- }
-
- if (a->replacementlen && is_jmp(replacement[0]))
- recompute_jump(a, instr, replacement, insn_buff);
-
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
- DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
+ apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);
- text_poke_early(instr, insn_buff, insn_buff_sz);
+ DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
+ DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
+ DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
-next:
- optimize_nops(instr, a->instrlen);
+ text_poke_early(instr, insn_buff, insn_buff_sz);
}
}
@@ -555,15 +671,15 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
continue;
}
- DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
+ DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
addr, addr, insn.length,
addr + insn.length + insn.immediate.value);
len = patch_retpoline(addr, &insn, bytes);
if (len == insn.length) {
optimize_nops(bytes, len);
- DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr);
- DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
+ DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
text_poke_early(addr, bytes, len);
}
}
@@ -590,13 +706,12 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes)
{
int i = 0;
+ /* Patch the custom return thunks... */
if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
- if (x86_return_thunk == __x86_return_thunk)
- return -1;
-
i = JMP32_INSN_SIZE;
__text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
} else {
+ /* ... or patch them out if not needed. */
bytes[i++] = RET_INSN_OPCODE;
}
@@ -609,6 +724,14 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
{
s32 *s;
+ /*
+ * Do not patch out the default return thunks if those needed are the
+ * ones generated by the compiler.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_RETHUNK) &&
+ (x86_return_thunk == __x86_return_thunk))
+ return;
+
for (s = start; s < end; s++) {
void *dest = NULL, *addr = (void *)s + *s;
struct insn insn;
@@ -630,14 +753,14 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
addr, dest, 5, addr))
continue;
- DPRINTK("return thunk at: %pS (%px) len: %d to: %pS",
+ DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
addr, addr, insn.length,
addr + insn.length + insn.immediate.value);
len = patch_return(addr, &insn, bytes);
if (len == insn.length) {
- DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr);
- DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
+ DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
text_poke_early(addr, bytes, len);
}
}
@@ -655,7 +778,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
#ifdef CONFIG_X86_KERNEL_IBT
-static void poison_endbr(void *addr, bool warn)
+static void __init_or_module poison_endbr(void *addr, bool warn)
{
u32 endbr, poison = gen_endbr_poison();
@@ -667,13 +790,13 @@ static void poison_endbr(void *addr, bool warn)
return;
}
- DPRINTK("ENDBR at: %pS (%px)", addr, addr);
+ DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
/*
* When we have IBT, the lack of ENDBR will trigger #CP
*/
- DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
- DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
+ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
+ DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
text_poke_early(addr, &poison, 4);
}
@@ -1148,7 +1271,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
smp->locks_end = locks_end;
smp->text = text;
smp->text_end = text_end;
- DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
+ DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
@@ -1225,6 +1348,20 @@ int alternatives_text_reserved(void *start, void *end)
#endif /* CONFIG_SMP */
#ifdef CONFIG_PARAVIRT
+
+/* Use this to add nops to a buffer, then text_poke the whole buffer. */
+static void __init_or_module add_nops(void *insns, unsigned int len)
+{
+ while (len > 0) {
+ unsigned int noplen = len;
+ if (noplen > ASM_NOP_MAX)
+ noplen = ASM_NOP_MAX;
+ memcpy(insns, x86_nops[noplen], noplen);
+ insns += noplen;
+ len -= noplen;
+ }
+}
+
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
struct paravirt_patch_site *end)
{
@@ -1332,6 +1469,35 @@ static noinline void __init int3_selftest(void)
unregister_die_notifier(&int3_exception_nb);
}
+static __initdata int __alt_reloc_selftest_addr;
+
+__visible noinline void __init __alt_reloc_selftest(void *arg)
+{
+ WARN_ON(arg != &__alt_reloc_selftest_addr);
+}
+
+static noinline void __init alt_reloc_selftest(void)
+{
+ /*
+ * Tests apply_relocation().
+ *
+ * This has a relative immediate (CALL) in a place other than the first
+ * instruction and additionally on x86_64 we get a RIP-relative LEA:
+ *
+ * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c
+ * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4
+ *
+ * Getting this wrong will either crash and burn or tickle the WARN
+ * above.
+ */
+ asm_inline volatile (
+ ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
+ : /* output */
+ : [mem] "m" (__alt_reloc_selftest_addr)
+ : _ASM_ARG1
+ );
+}
+
void __init alternative_instructions(void)
{
int3_selftest();
@@ -1419,6 +1585,8 @@ void __init alternative_instructions(void)
restart_nmi();
alternatives_patched = 1;
+
+ alt_reloc_selftest();
}
/**
@@ -1954,6 +2122,16 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
atomic_set_release(&bp_desc.refs, 1);
/*
+ * Function tracing can enable thousands of places that need to be
+ * updated. This can take quite some time, and with full kernel debugging
+ * enabled, this could cause the softlockup watchdog to trigger.
+ * This function gets called every 256 entries added to be patched.
+ * Call cond_resched() here to make sure that other tasks can get scheduled
+ * while processing all the functions being patched.
+ */
+ cond_resched();
+
+ /*
* Corresponding read barrier in int3 notifier for making sure the
* nr_entries and handler are correctly ordered wrt. patching.
*/
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 7e331e8f3692..035a3db5330b 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -15,28 +15,31 @@
#include <linux/pci_ids.h>
#include <asm/amd_nb.h>
-#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
-#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
-#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480
-#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630
-#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5
-#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4
-#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8
-#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8
-#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
-#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
-#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
-#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
-#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
-#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728
-#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
-#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1
-#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5
-#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d
-#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
-#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4
-#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4
-#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc
+#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
+#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
+#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480
+#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630
+#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5
+#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4
+#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5
+#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8
+#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8
+#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb
+
+#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
+#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
+#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
+#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
+#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
+#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728
+#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
+#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1
+#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d
+#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
+#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4
+#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4
+#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc
+#define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4
/* Protect the PCI config register pairs used for SMN. */
static DEFINE_MUTEX(smn_mutex);
@@ -53,6 +56,7 @@ static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) },
{}
};
@@ -81,6 +85,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) },
{}
};
@@ -101,6 +106,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) },
{}
};
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 770557110051..af49e24b46a4 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -101,6 +101,9 @@ static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP;
*/
static bool virt_ext_dest_id __ro_after_init;
+/* For parallel bootup. */
+unsigned long apic_mmio_base __ro_after_init;
+
/*
* Map cpu index to physical APIC ID
*/
@@ -2163,6 +2166,7 @@ void __init register_lapic_address(unsigned long address)
if (!x2apic_mode) {
set_fixmap_nocache(FIX_APIC_BASE, address);
+ apic_mmio_base = APIC_BASE;
apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
APIC_BASE, address);
}
@@ -2376,7 +2380,7 @@ static int nr_logical_cpuids = 1;
/*
* Used to store mapping between logical CPU IDs and APIC IDs.
*/
-static int cpuid_to_apicid[] = {
+int cpuid_to_apicid[] = {
[0 ... NR_CPUS - 1] = -1,
};
@@ -2386,20 +2390,31 @@ bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
}
#ifdef CONFIG_SMP
-/**
- * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
- * @apicid: APIC ID to check
+static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
+{
+ /* Isolate the SMT bit(s) in the APICID and check for 0 */
+ u32 mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
+
+ if (smp_num_siblings == 1 || !(apicid & mask))
+ cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
+}
+
+/*
+ * Due to the utter mess of CPUID evaluation smp_num_siblings is not valid
+ * during early boot. Initialize the primary thread mask before SMP
+ * bringup.
*/
-bool apic_id_is_primary_thread(unsigned int apicid)
+static int __init smp_init_primary_thread_mask(void)
{
- u32 mask;
+ unsigned int cpu;
- if (smp_num_siblings == 1)
- return true;
- /* Isolate the SMT bit(s) in the APICID and check for 0 */
- mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
- return !(apicid & mask);
+ for (cpu = 0; cpu < nr_logical_cpuids; cpu++)
+ cpu_mark_primary_thread(cpu, cpuid_to_apicid[cpu]);
+ return 0;
}
+early_initcall(smp_init_primary_thread_mask);
+#else
+static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
#endif
/*
@@ -2544,6 +2559,9 @@ int generic_processor_info(int apicid, int version)
set_cpu_present(cpu, true);
num_processors++;
+ if (system_state != SYSTEM_BOOTING)
+ cpu_mark_primary_thread(cpu, apicid);
+
return cpu;
}
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 6bde05a86b4e..896bc41cb2ba 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -97,7 +97,10 @@ static void init_x2apic_ldr(void)
static int x2apic_phys_probe(void)
{
- if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys()))
+ if (!x2apic_mode)
+ return 0;
+
+ if (x2apic_phys || x2apic_fadt_phys())
return 1;
return apic == &apic_x2apic_phys;
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 482855227964..d9384d5b4b8e 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -546,7 +546,6 @@ unsigned long sn_rtc_cycles_per_second;
EXPORT_SYMBOL(sn_rtc_cycles_per_second);
/* The following values are used for the per node hub info struct */
-static __initdata unsigned short *_node_to_pnode;
static __initdata unsigned short _min_socket, _max_socket;
static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len;
static __initdata struct uv_gam_range_entry *uv_gre_table;
@@ -554,6 +553,7 @@ static __initdata struct uv_gam_parameters *uv_gp_table;
static __initdata unsigned short *_socket_to_node;
static __initdata unsigned short *_socket_to_pnode;
static __initdata unsigned short *_pnode_to_socket;
+static __initdata unsigned short *_node_to_socket;
static __initdata struct uv_gam_range_s *_gr_table;
@@ -617,7 +617,8 @@ static __init void build_uv_gr_table(void)
bytes = _gr_table_len * sizeof(struct uv_gam_range_s);
grt = kzalloc(bytes, GFP_KERNEL);
- BUG_ON(!grt);
+ if (WARN_ON_ONCE(!grt))
+ return;
_gr_table = grt;
for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
@@ -1022,7 +1023,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index,
switch (index) {
case UVY_MMIOH0:
mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0;
- nasid_mask = UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK;
+ nasid_mask = UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK;
n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH;
min_nasid = min_pnode;
max_nasid = max_pnode;
@@ -1030,7 +1031,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index,
break;
case UVY_MMIOH1:
mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1;
- nasid_mask = UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK;
+ nasid_mask = UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK;
n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH;
min_nasid = min_pnode;
max_nasid = max_pnode;
@@ -1038,7 +1039,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index,
break;
case UVX_MMIOH0:
mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0;
- nasid_mask = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK;
+ nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK;
n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH;
min_nasid = min_pnode * 2;
max_nasid = max_pnode * 2;
@@ -1046,7 +1047,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index,
break;
case UVX_MMIOH1:
mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1;
- nasid_mask = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK;
+ nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK;
n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH;
min_nasid = min_pnode * 2;
max_nasid = max_pnode * 2;
@@ -1072,8 +1073,9 @@ static void __init calc_mmioh_map(enum mmioh_arch index,
/* Invalid NASID check */
if (nasid < min_nasid || max_nasid < nasid) {
- pr_err("UV:%s:Invalid NASID:%x (range:%x..%x)\n",
- __func__, index, min_nasid, max_nasid);
+ /* Not an error: unused table entries get "poison" values */
+ pr_debug("UV:%s:Invalid NASID(%x):%x (range:%x..%x)\n",
+ __func__, index, nasid, min_nasid, max_nasid);
nasid = -1;
}
@@ -1292,6 +1294,7 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi)
hi->nasid_shift = uv_cpuid.nasid_shift;
hi->min_pnode = _min_pnode;
hi->min_socket = _min_socket;
+ hi->node_to_socket = _node_to_socket;
hi->pnode_to_socket = _pnode_to_socket;
hi->socket_to_node = _socket_to_node;
hi->socket_to_pnode = _socket_to_pnode;
@@ -1348,7 +1351,7 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr;
unsigned long lgre = 0, gend = 0;
int index = 0;
- int sock_min = 999999, pnode_min = 99999;
+ int sock_min = INT_MAX, pnode_min = INT_MAX;
int sock_max = -1, pnode_max = -1;
uv_gre_table = gre;
@@ -1459,11 +1462,37 @@ static int __init decode_uv_systab(void)
return 0;
}
+/*
+ * Given a bitmask 'bits' representing presnt blades, numbered
+ * starting at 'base', masking off unused high bits of blade number
+ * with 'mask', update the minimum and maximum blade numbers that we
+ * have found. (Masking with 'mask' necessary because of BIOS
+ * treatment of system partitioning when creating this table we are
+ * interpreting.)
+ */
+static inline void blade_update_min_max(unsigned long bits, int base, int mask, int *min, int *max)
+{
+ int first, last;
+
+ if (!bits)
+ return;
+ first = (base + __ffs(bits)) & mask;
+ last = (base + __fls(bits)) & mask;
+
+ if (*min > first)
+ *min = first;
+ if (*max < last)
+ *max = last;
+}
+
/* Set up physical blade translations from UVH_NODE_PRESENT_TABLE */
static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info)
{
unsigned long np;
int i, uv_pb = 0;
+ int sock_min = INT_MAX, sock_max = -1, s_mask;
+
+ s_mask = (1 << uv_cpuid.n_skt) - 1;
if (UVH_NODE_PRESENT_TABLE) {
pr_info("UV: NODE_PRESENT_DEPTH = %d\n",
@@ -1471,35 +1500,82 @@ static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info)
for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np);
- uv_pb += hweight64(np);
+ blade_update_min_max(np, i * 64, s_mask, &sock_min, &sock_max);
}
}
if (UVH_NODE_PRESENT_0) {
np = uv_read_local_mmr(UVH_NODE_PRESENT_0);
pr_info("UV: NODE_PRESENT_0 = 0x%016lx\n", np);
- uv_pb += hweight64(np);
+ blade_update_min_max(np, 0, s_mask, &sock_min, &sock_max);
}
if (UVH_NODE_PRESENT_1) {
np = uv_read_local_mmr(UVH_NODE_PRESENT_1);
pr_info("UV: NODE_PRESENT_1 = 0x%016lx\n", np);
- uv_pb += hweight64(np);
+ blade_update_min_max(np, 64, s_mask, &sock_min, &sock_max);
+ }
+
+ /* Only update if we actually found some bits indicating blades present */
+ if (sock_max >= sock_min) {
+ _min_socket = sock_min;
+ _max_socket = sock_max;
+ uv_pb = sock_max - sock_min + 1;
}
if (uv_possible_blades != uv_pb)
uv_possible_blades = uv_pb;
- pr_info("UV: number nodes/possible blades %d\n", uv_pb);
+ pr_info("UV: number nodes/possible blades %d (%d - %d)\n",
+ uv_pb, sock_min, sock_max);
+}
+
+static int __init alloc_conv_table(int num_elem, unsigned short **table)
+{
+ int i;
+ size_t bytes;
+
+ bytes = num_elem * sizeof(*table[0]);
+ *table = kmalloc(bytes, GFP_KERNEL);
+ if (WARN_ON_ONCE(!*table))
+ return -ENOMEM;
+ for (i = 0; i < num_elem; i++)
+ ((unsigned short *)*table)[i] = SOCK_EMPTY;
+ return 0;
}
+/* Remove conversion table if it's 1:1 */
+#define FREE_1_TO_1_TABLE(tbl, min, max, max2) free_1_to_1_table(&tbl, #tbl, min, max, max2)
+
+static void __init free_1_to_1_table(unsigned short **tp, char *tname, int min, int max, int max2)
+{
+ int i;
+ unsigned short *table = *tp;
+
+ if (table == NULL)
+ return;
+ if (max != max2)
+ return;
+ for (i = 0; i < max; i++) {
+ if (i != table[i])
+ return;
+ }
+ kfree(table);
+ *tp = NULL;
+ pr_info("UV: %s is 1:1, conversion table removed\n", tname);
+}
+
+/*
+ * Build Socket Tables
+ * If the number of nodes is >1 per socket, socket to node table will
+ * contain lowest node number on that socket.
+ */
static void __init build_socket_tables(void)
{
struct uv_gam_range_entry *gre = uv_gre_table;
- int num, nump;
+ int nums, numn, nump;
int cpu, i, lnid;
int minsock = _min_socket;
int maxsock = _max_socket;
int minpnode = _min_pnode;
int maxpnode = _max_pnode;
- size_t bytes;
if (!gre) {
if (is_uv2_hub() || is_uv3_hub()) {
@@ -1507,39 +1583,36 @@ static void __init build_socket_tables(void)
return;
}
pr_err("UV: Error: UVsystab address translations not available!\n");
- BUG();
+ WARN_ON_ONCE(!gre);
+ return;
}
- /* Build socket id -> node id, pnode */
- num = maxsock - minsock + 1;
- bytes = num * sizeof(_socket_to_node[0]);
- _socket_to_node = kmalloc(bytes, GFP_KERNEL);
- _socket_to_pnode = kmalloc(bytes, GFP_KERNEL);
-
+ numn = num_possible_nodes();
nump = maxpnode - minpnode + 1;
- bytes = nump * sizeof(_pnode_to_socket[0]);
- _pnode_to_socket = kmalloc(bytes, GFP_KERNEL);
- BUG_ON(!_socket_to_node || !_socket_to_pnode || !_pnode_to_socket);
-
- for (i = 0; i < num; i++)
- _socket_to_node[i] = _socket_to_pnode[i] = SOCK_EMPTY;
-
- for (i = 0; i < nump; i++)
- _pnode_to_socket[i] = SOCK_EMPTY;
+ nums = maxsock - minsock + 1;
+
+ /* Allocate and clear tables */
+ if ((alloc_conv_table(nump, &_pnode_to_socket) < 0)
+ || (alloc_conv_table(nums, &_socket_to_pnode) < 0)
+ || (alloc_conv_table(numn, &_node_to_socket) < 0)
+ || (alloc_conv_table(nums, &_socket_to_node) < 0)) {
+ kfree(_pnode_to_socket);
+ kfree(_socket_to_pnode);
+ kfree(_node_to_socket);
+ return;
+ }
/* Fill in pnode/node/addr conversion list values: */
- pr_info("UV: GAM Building socket/pnode conversion tables\n");
for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
continue;
i = gre->sockid - minsock;
- /* Duplicate: */
- if (_socket_to_pnode[i] != SOCK_EMPTY)
- continue;
- _socket_to_pnode[i] = gre->pnode;
+ if (_socket_to_pnode[i] == SOCK_EMPTY)
+ _socket_to_pnode[i] = gre->pnode;
i = gre->pnode - minpnode;
- _pnode_to_socket[i] = gre->sockid;
+ if (_pnode_to_socket[i] == SOCK_EMPTY)
+ _pnode_to_socket[i] = gre->sockid;
pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
gre->sockid, gre->type, gre->nasid,
@@ -1549,66 +1622,39 @@ static void __init build_socket_tables(void)
/* Set socket -> node values: */
lnid = NUMA_NO_NODE;
- for_each_present_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
int nid = cpu_to_node(cpu);
int apicid, sockid;
if (lnid == nid)
continue;
lnid = nid;
+
apicid = per_cpu(x86_cpu_to_apicid, cpu);
sockid = apicid >> uv_cpuid.socketid_shift;
- _socket_to_node[sockid - minsock] = nid;
- pr_info("UV: sid:%02x: apicid:%04x node:%2d\n",
- sockid, apicid, nid);
- }
- /* Set up physical blade to pnode translation from GAM Range Table: */
- bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]);
- _node_to_pnode = kmalloc(bytes, GFP_KERNEL);
- BUG_ON(!_node_to_pnode);
+ if (_socket_to_node[sockid - minsock] == SOCK_EMPTY)
+ _socket_to_node[sockid - minsock] = nid;
- for (lnid = 0; lnid < num_possible_nodes(); lnid++) {
- unsigned short sockid;
+ if (_node_to_socket[nid] == SOCK_EMPTY)
+ _node_to_socket[nid] = sockid;
- for (sockid = minsock; sockid <= maxsock; sockid++) {
- if (lnid == _socket_to_node[sockid - minsock]) {
- _node_to_pnode[lnid] = _socket_to_pnode[sockid - minsock];
- break;
- }
- }
- if (sockid > maxsock) {
- pr_err("UV: socket for node %d not found!\n", lnid);
- BUG();
- }
+ pr_info("UV: sid:%02x: apicid:%04x socket:%02d node:%03x s2n:%03x\n",
+ sockid,
+ apicid,
+ _node_to_socket[nid],
+ nid,
+ _socket_to_node[sockid - minsock]);
}
/*
- * If socket id == pnode or socket id == node for all nodes,
+ * If e.g. socket id == pnode for all pnodes,
* system runs faster by removing corresponding conversion table.
*/
- pr_info("UV: Checking socket->node/pnode for identity maps\n");
- if (minsock == 0) {
- for (i = 0; i < num; i++)
- if (_socket_to_node[i] == SOCK_EMPTY || i != _socket_to_node[i])
- break;
- if (i >= num) {
- kfree(_socket_to_node);
- _socket_to_node = NULL;
- pr_info("UV: 1:1 socket_to_node table removed\n");
- }
- }
- if (minsock == minpnode) {
- for (i = 0; i < num; i++)
- if (_socket_to_pnode[i] != SOCK_EMPTY &&
- _socket_to_pnode[i] != i + minpnode)
- break;
- if (i >= num) {
- kfree(_socket_to_pnode);
- _socket_to_pnode = NULL;
- pr_info("UV: 1:1 socket_to_pnode table removed\n");
- }
- }
+ FREE_1_TO_1_TABLE(_socket_to_node, _min_socket, nums, numn);
+ FREE_1_TO_1_TABLE(_node_to_socket, _min_socket, nums, numn);
+ FREE_1_TO_1_TABLE(_socket_to_pnode, _min_pnode, nums, nump);
+ FREE_1_TO_1_TABLE(_pnode_to_socket, _min_pnode, nums, nump);
}
/* Check which reboot to use */
@@ -1692,12 +1738,13 @@ static __init int uv_system_init_hubless(void)
static void __init uv_system_init_hub(void)
{
struct uv_hub_info_s hub_info = {0};
- int bytes, cpu, nodeid;
- unsigned short min_pnode = 9999, max_pnode = 0;
+ int bytes, cpu, nodeid, bid;
+ unsigned short min_pnode = USHRT_MAX, max_pnode = 0;
char *hub = is_uv5_hub() ? "UV500" :
is_uv4_hub() ? "UV400" :
is_uv3_hub() ? "UV300" :
is_uv2_hub() ? "UV2000/3000" : NULL;
+ struct uv_hub_info_s **uv_hub_info_list_blade;
if (!hub) {
pr_err("UV: Unknown/unsupported UV hub\n");
@@ -1720,9 +1767,12 @@ static void __init uv_system_init_hub(void)
build_uv_gr_table();
set_block_size();
uv_init_hub_info(&hub_info);
- uv_possible_blades = num_possible_nodes();
- if (!_node_to_pnode)
+ /* If UV2 or UV3 may need to get # blades from HW */
+ if (is_uv(UV2|UV3) && !uv_gre_table)
boot_init_possible_blades(&hub_info);
+ else
+ /* min/max sockets set in decode_gam_rng_tbl */
+ uv_possible_blades = (_max_socket - _min_socket) + 1;
/* uv_num_possible_blades() is really the hub count: */
pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus());
@@ -1731,79 +1781,98 @@ static void __init uv_system_init_hub(void)
hub_info.coherency_domain_number = sn_coherency_id;
uv_rtc_init();
+ /*
+ * __uv_hub_info_list[] is indexed by node, but there is only
+ * one hub_info structure per blade. First, allocate one
+ * structure per blade. Further down we create a per-node
+ * table (__uv_hub_info_list[]) pointing to hub_info
+ * structures for the correct blade.
+ */
+
bytes = sizeof(void *) * uv_num_possible_blades();
- __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL);
- BUG_ON(!__uv_hub_info_list);
+ uv_hub_info_list_blade = kzalloc(bytes, GFP_KERNEL);
+ if (WARN_ON_ONCE(!uv_hub_info_list_blade))
+ return;
bytes = sizeof(struct uv_hub_info_s);
- for_each_node(nodeid) {
+ for_each_possible_blade(bid) {
struct uv_hub_info_s *new_hub;
- if (__uv_hub_info_list[nodeid]) {
- pr_err("UV: Node %d UV HUB already initialized!?\n", nodeid);
- BUG();
+ /* Allocate & fill new per hub info list */
+ new_hub = (bid == 0) ? &uv_hub_info_node0
+ : kzalloc_node(bytes, GFP_KERNEL, uv_blade_to_node(bid));
+ if (WARN_ON_ONCE(!new_hub)) {
+ /* do not kfree() bid 0, which is statically allocated */
+ while (--bid > 0)
+ kfree(uv_hub_info_list_blade[bid]);
+ kfree(uv_hub_info_list_blade);
+ return;
}
- /* Allocate new per hub info list */
- new_hub = (nodeid == 0) ? &uv_hub_info_node0 : kzalloc_node(bytes, GFP_KERNEL, nodeid);
- BUG_ON(!new_hub);
- __uv_hub_info_list[nodeid] = new_hub;
- new_hub = uv_hub_info_list(nodeid);
- BUG_ON(!new_hub);
+ uv_hub_info_list_blade[bid] = new_hub;
*new_hub = hub_info;
/* Use information from GAM table if available: */
- if (_node_to_pnode)
- new_hub->pnode = _node_to_pnode[nodeid];
+ if (uv_gre_table)
+ new_hub->pnode = uv_blade_to_pnode(bid);
else /* Or fill in during CPU loop: */
new_hub->pnode = 0xffff;
- new_hub->numa_blade_id = uv_node_to_blade_id(nodeid);
+ new_hub->numa_blade_id = bid;
new_hub->memory_nid = NUMA_NO_NODE;
new_hub->nr_possible_cpus = 0;
new_hub->nr_online_cpus = 0;
}
+ /*
+ * Now populate __uv_hub_info_list[] for each node with the
+ * pointer to the struct for the blade it resides on.
+ */
+
+ bytes = sizeof(void *) * num_possible_nodes();
+ __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL);
+ if (WARN_ON_ONCE(!__uv_hub_info_list)) {
+ for_each_possible_blade(bid)
+ /* bid 0 is statically allocated */
+ if (bid != 0)
+ kfree(uv_hub_info_list_blade[bid]);
+ kfree(uv_hub_info_list_blade);
+ return;
+ }
+
+ for_each_node(nodeid)
+ __uv_hub_info_list[nodeid] = uv_hub_info_list_blade[uv_node_to_blade_id(nodeid)];
+
/* Initialize per CPU info: */
for_each_possible_cpu(cpu) {
- int apicid = per_cpu(x86_cpu_to_apicid, cpu);
- int numa_node_id;
+ int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+ unsigned short bid;
unsigned short pnode;
- nodeid = cpu_to_node(cpu);
- numa_node_id = numa_cpu_node(cpu);
pnode = uv_apicid_to_pnode(apicid);
+ bid = uv_pnode_to_socket(pnode) - _min_socket;
- uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid);
+ uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list_blade[bid];
uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++;
if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE)
uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu);
- /* Init memoryless node: */
- if (nodeid != numa_node_id &&
- uv_hub_info_list(numa_node_id)->pnode == 0xffff)
- uv_hub_info_list(numa_node_id)->pnode = pnode;
- else if (uv_cpu_hub_info(cpu)->pnode == 0xffff)
+ if (uv_cpu_hub_info(cpu)->pnode == 0xffff)
uv_cpu_hub_info(cpu)->pnode = pnode;
}
- for_each_node(nodeid) {
- unsigned short pnode = uv_hub_info_list(nodeid)->pnode;
+ for_each_possible_blade(bid) {
+ unsigned short pnode = uv_hub_info_list_blade[bid]->pnode;
- /* Add pnode info for pre-GAM list nodes without CPUs: */
- if (pnode == 0xffff) {
- unsigned long paddr;
+ if (pnode == 0xffff)
+ continue;
- paddr = node_start_pfn(nodeid) << PAGE_SHIFT;
- pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr));
- uv_hub_info_list(nodeid)->pnode = pnode;
- }
min_pnode = min(pnode, min_pnode);
max_pnode = max(pnode, max_pnode);
- pr_info("UV: UVHUB node:%2d pn:%02x nrcpus:%d\n",
- nodeid,
- uv_hub_info_list(nodeid)->pnode,
- uv_hub_info_list(nodeid)->nr_possible_cpus);
+ pr_info("UV: HUB:%2d pn:%02x nrcpus:%d\n",
+ bid,
+ uv_hub_info_list_blade[bid]->pnode,
+ uv_hub_info_list_blade[bid]->nr_possible_cpus);
}
pr_info("UV: min_pnode:%02x max_pnode:%02x\n", min_pnode, max_pnode);
@@ -1811,6 +1880,9 @@ static void __init uv_system_init_hub(void)
map_mmr_high(max_pnode);
map_mmioh_high(min_pnode, max_pnode);
+ kfree(uv_hub_info_list_blade);
+ uv_hub_info_list_blade = NULL;
+
uv_nmi_setup();
uv_cpu_init();
uv_setup_proc_files(0);
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index 22ab13966427..8bb937331acb 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -133,8 +133,8 @@ static bool skip_addr(void *dest)
/* Accounts directly */
if (dest == ret_from_fork)
return true;
-#ifdef CONFIG_HOTPLUG_CPU
- if (dest == start_cpu0)
+#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT)
+ if (dest == soft_restart_cpu)
return true;
#endif
#ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d7e3ceaf75c1..4350f6bfc064 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -27,7 +27,7 @@ obj-y += cpuid-deps.o
obj-y += umwait.o
obj-$(CONFIG_PROC_FS) += proc.o
-obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
+obj-y += capflags.o powerflags.o
obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o
ifdef CONFIG_CPU_SUP_INTEL
@@ -54,7 +54,6 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
obj-$(CONFIG_ACRN_GUEST) += acrn.o
-ifdef CONFIG_X86_FEATURE_NAMES
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^
@@ -63,5 +62,4 @@ vmxfeature = $(src)/../../include/asm/vmxfeatures.h
$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
-endif
targets += capflags.c
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 182af64387d0..9e2a91830f72 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -9,7 +9,6 @@
* - Andrew D. Balsa (code cleanup).
*/
#include <linux/init.h>
-#include <linux/utsname.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/nospec.h>
@@ -27,8 +26,6 @@
#include <asm/msr.h>
#include <asm/vmx.h>
#include <asm/paravirt.h>
-#include <asm/alternative.h>
-#include <asm/set_memory.h>
#include <asm/intel-family.h>
#include <asm/e820/api.h>
#include <asm/hypervisor.h>
@@ -125,21 +122,8 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear);
EXPORT_SYMBOL_GPL(mmio_stale_data_clear);
-void __init check_bugs(void)
+void __init cpu_select_mitigations(void)
{
- identify_boot_cpu();
-
- /*
- * identify_boot_cpu() initialized SMT support information, let the
- * core code know.
- */
- cpu_smt_check_topology();
-
- if (!IS_ENABLED(CONFIG_SMP)) {
- pr_info("CPU: ");
- print_cpu_info(&boot_cpu_data);
- }
-
/*
* Read the SPEC_CTRL MSR to account for reserved bits which may
* have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
@@ -176,39 +160,6 @@ void __init check_bugs(void)
md_clear_select_mitigation();
srbds_select_mitigation();
l1d_flush_select_mitigation();
-
- arch_smt_update();
-
-#ifdef CONFIG_X86_32
- /*
- * Check whether we are able to run this kernel safely on SMP.
- *
- * - i386 is no longer supported.
- * - In order to run on anything without a TSC, we need to be
- * compiled for a i486.
- */
- if (boot_cpu_data.x86 < 4)
- panic("Kernel requires i486+ for 'invlpg' and other features");
-
- init_utsname()->machine[1] =
- '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
- alternative_instructions();
-
- fpu__init_check_bugs();
-#else /* CONFIG_X86_64 */
- alternative_instructions();
-
- /*
- * Make sure the first 2MB area is not mapped by huge pages
- * There are typically fixed size MTRRs in there and overlapping
- * MTRRs into large pages causes slow downs.
- *
- * Right now we don't do that with gbpages because there seems
- * very little benefit for that case.
- */
- if (!direct_gbpages)
- set_memory_4k((unsigned long)__va(0), 1);
-#endif
}
/*
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 4063e8991211..8f86eacf69f7 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -39,6 +39,8 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
/* Shared L2 cache maps */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
+static cpumask_var_t cpu_cacheinfo_mask;
+
/* Kernel controls MTRR and/or PAT MSRs. */
unsigned int memory_caching_control __ro_after_init;
@@ -1172,8 +1174,10 @@ void cache_bp_restore(void)
cache_cpu_init();
}
-static int cache_ap_init(unsigned int cpu)
+static int cache_ap_online(unsigned int cpu)
{
+ cpumask_set_cpu(cpu, cpu_cacheinfo_mask);
+
if (!memory_caching_control || get_cache_aps_delayed_init())
return 0;
@@ -1191,11 +1195,17 @@ static int cache_ap_init(unsigned int cpu)
* lock to prevent MTRR entry changes
*/
stop_machine_from_inactive_cpu(cache_rendezvous_handler, NULL,
- cpu_callout_mask);
+ cpu_cacheinfo_mask);
return 0;
}
+static int cache_ap_offline(unsigned int cpu)
+{
+ cpumask_clear_cpu(cpu, cpu_cacheinfo_mask);
+ return 0;
+}
+
/*
* Delayed cache initialization for all AP's
*/
@@ -1210,9 +1220,12 @@ void cache_aps_init(void)
static int __init cache_ap_register(void)
{
+ zalloc_cpumask_var(&cpu_cacheinfo_mask, GFP_KERNEL);
+ cpumask_set_cpu(smp_processor_id(), cpu_cacheinfo_mask);
+
cpuhp_setup_state_nocalls(CPUHP_AP_CACHECTRL_STARTING,
"x86/cachectrl:starting",
- cache_ap_init, NULL);
+ cache_ap_online, cache_ap_offline);
return 0;
}
-core_initcall(cache_ap_register);
+early_initcall(cache_ap_register);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 80710a68ef7d..52683fddafaf 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -18,12 +18,16 @@
#include <linux/init.h>
#include <linux/kprobes.h>
#include <linux/kgdb.h>
+#include <linux/mem_encrypt.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/io.h>
#include <linux/syscore_ops.h>
#include <linux/pgtable.h>
#include <linux/stackprotector.h>
+#include <linux/utsname.h>
+#include <asm/alternative.h>
#include <asm/cmdline.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
@@ -59,7 +63,7 @@
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#include <asm/uv/uv.h>
-#include <asm/sigframe.h>
+#include <asm/set_memory.h>
#include <asm/traps.h>
#include <asm/sev.h>
@@ -67,14 +71,6 @@
u32 elf_hwcap2 __read_mostly;
-/* all of these masks are initialized in setup_cpu_local_masks() */
-cpumask_var_t cpu_initialized_mask;
-cpumask_var_t cpu_callout_mask;
-cpumask_var_t cpu_callin_mask;
-
-/* representing cpus for which sibling maps can be computed */
-cpumask_var_t cpu_sibling_setup_mask;
-
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);
@@ -169,15 +165,6 @@ clear_ppin:
clear_cpu_cap(c, info->feature);
}
-/* correctly size the local cpu masks */
-void __init setup_cpu_local_masks(void)
-{
- alloc_bootmem_cpumask_var(&cpu_initialized_mask);
- alloc_bootmem_cpumask_var(&cpu_callin_mask);
- alloc_bootmem_cpumask_var(&cpu_callout_mask);
- alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
-}
-
static void default_init(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
@@ -1502,12 +1489,10 @@ static void __init cpu_parse_early_param(void)
if (!kstrtouint(opt, 10, &bit)) {
if (bit < NCAPINTS * 32) {
-#ifdef CONFIG_X86_FEATURE_NAMES
/* empty-string, i.e., ""-defined feature flags */
if (!x86_cap_flags[bit])
pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit));
else
-#endif
pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
setup_clear_cpu_cap(bit);
@@ -1520,7 +1505,6 @@ static void __init cpu_parse_early_param(void)
continue;
}
-#ifdef CONFIG_X86_FEATURE_NAMES
for (bit = 0; bit < 32 * NCAPINTS; bit++) {
if (!x86_cap_flag(bit))
continue;
@@ -1537,7 +1521,6 @@ static void __init cpu_parse_early_param(void)
if (!found)
pr_cont(" (unknown: %s)", opt);
-#endif
}
pr_cont("\n");
@@ -1600,10 +1583,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
sld_setup(c);
- fpu__init_system(c);
-
- init_sigframe_size();
-
#ifdef CONFIG_X86_32
/*
* Regardless of whether PCID is enumerated, the SDM says
@@ -2123,19 +2102,6 @@ static void dbg_restore_debug_regs(void)
#define dbg_restore_debug_regs()
#endif /* ! CONFIG_KGDB */
-static void wait_for_master_cpu(int cpu)
-{
-#ifdef CONFIG_SMP
- /*
- * wait for ACK from master CPU before continuing
- * with AP initialization
- */
- WARN_ON(cpumask_test_and_set_cpu(cpu, cpu_initialized_mask));
- while (!cpumask_test_cpu(cpu, cpu_callout_mask))
- cpu_relax();
-#endif
-}
-
static inline void setup_getcpu(int cpu)
{
unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
@@ -2158,11 +2124,7 @@ static inline void setup_getcpu(int cpu)
}
#ifdef CONFIG_X86_64
-static inline void ucode_cpu_init(int cpu)
-{
- if (cpu)
- load_ucode_ap();
-}
+static inline void ucode_cpu_init(int cpu) { }
static inline void tss_setup_ist(struct tss_struct *tss)
{
@@ -2239,8 +2201,6 @@ void cpu_init(void)
struct task_struct *cur = current;
int cpu = raw_smp_processor_id();
- wait_for_master_cpu(cpu);
-
ucode_cpu_init(cpu);
#ifdef CONFIG_NUMA
@@ -2285,26 +2245,12 @@ void cpu_init(void)
doublefault_init_cpu_tss();
- fpu__init_cpu();
-
if (is_uv_system())
uv_cpu_init();
load_fixmap_gdt(cpu);
}
-#ifdef CONFIG_SMP
-void cpu_init_secondary(void)
-{
- /*
- * Relies on the BP having set-up the IDT tables, which are loaded
- * on this CPU in cpu_init_exception_handling().
- */
- cpu_init_exception_handling();
- cpu_init();
-}
-#endif
-
#ifdef CONFIG_MICROCODE_LATE_LOADING
/**
* store_cpu_caps() - Store a snapshot of CPU capabilities
@@ -2362,3 +2308,69 @@ void arch_smt_update(void)
/* Check whether IPI broadcasting can be enabled */
apic_smt_update();
}
+
+void __init arch_cpu_finalize_init(void)
+{
+ identify_boot_cpu();
+
+ /*
+ * identify_boot_cpu() initialized SMT support information, let the
+ * core code know.
+ */
+ cpu_smt_check_topology();
+
+ if (!IS_ENABLED(CONFIG_SMP)) {
+ pr_info("CPU: ");
+ print_cpu_info(&boot_cpu_data);
+ }
+
+ cpu_select_mitigations();
+
+ arch_smt_update();
+
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ /*
+ * Check whether this is a real i386 which is not longer
+ * supported and fixup the utsname.
+ */
+ if (boot_cpu_data.x86 < 4)
+ panic("Kernel requires i486+ for 'invlpg' and other features");
+
+ init_utsname()->machine[1] =
+ '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+ }
+
+ /*
+ * Must be before alternatives because it might set or clear
+ * feature bits.
+ */
+ fpu__init_system();
+ fpu__init_cpu();
+
+ alternative_instructions();
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /*
+ * Make sure the first 2MB area is not mapped by huge pages
+ * There are typically fixed size MTRRs in there and overlapping
+ * MTRRs into large pages causes slow downs.
+ *
+ * Right now we don't do that with gbpages because there seems
+ * very little benefit for that case.
+ */
+ if (!direct_gbpages)
+ set_memory_4k((unsigned long)__va(0), 1);
+ } else {
+ fpu__init_check_bugs();
+ }
+
+ /*
+ * This needs to be called before any devices perform DMA
+ * operations that might use the SWIOTLB bounce buffers. It will
+ * mark the bounce buffers as decrypted so that their usage will
+ * not cause "plain-text" data to be decrypted when accessed. It
+ * must be called after late_time_init() so that Hyper-V x86/x64
+ * hypercalls work when the SWIOTLB bounce buffers are decrypted.
+ */
+ mem_encrypt_init();
+}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index f97b0fe13da8..1c44630d4789 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -79,6 +79,7 @@ extern void detect_ht(struct cpuinfo_x86 *c);
extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
+void cpu_select_mitigations(void);
extern void x86_spec_ctrl_setup_ap(void);
extern void update_srbds_msr(void);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 0b971f974096..5e74610b39e7 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -715,11 +715,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
bool amd_mce_is_memory_error(struct mce *m)
{
+ enum smca_bank_types bank_type;
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
+ bank_type = smca_get_bank_type(m->extcpu, m->bank);
if (mce_flags.smca)
- return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0;
+ return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0;
return m->bank == 4 && xec == 0x8;
}
@@ -1050,7 +1052,7 @@ static const char *get_name(unsigned int cpu, unsigned int bank, struct threshol
if (bank_type >= N_SMCA_BANK_TYPES)
return NULL;
- if (b && bank_type == SMCA_UMC) {
+ if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) {
if (b->block < ARRAY_SIZE(smca_umc_block_names))
return smca_umc_block_names[b->block];
return NULL;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 2eec60f50057..22dfcb2adcd7 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1533,7 +1533,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
- if (kill_current_task)
+ if (!mce_usable_address(&m))
queue_task_work(&m, msg, kill_me_now);
else
queue_task_work(&m, msg, kill_me_maybe);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 6ad33f355861..725344048f85 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -726,11 +726,15 @@ unlock:
static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
{
struct task_struct *p, *t;
+ pid_t pid;
rcu_read_lock();
for_each_process_thread(p, t) {
- if (is_closid_match(t, r) || is_rmid_match(t, r))
- seq_printf(s, "%d\n", t->pid);
+ if (is_closid_match(t, r) || is_rmid_match(t, r)) {
+ pid = task_pid_vnr(t);
+ if (pid)
+ seq_printf(s, "%d\n", pid);
+ }
}
rcu_read_unlock();
}
@@ -2301,6 +2305,26 @@ static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
}
}
+static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
+{
+ atomic_inc(&rdtgrp->waitcount);
+ kernfs_break_active_protection(kn);
+}
+
+static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
+{
+ if (atomic_dec_and_test(&rdtgrp->waitcount) &&
+ (rdtgrp->flags & RDT_DELETED)) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
+ rdtgroup_pseudo_lock_remove(rdtgrp);
+ kernfs_unbreak_active_protection(kn);
+ rdtgroup_remove(rdtgrp);
+ } else {
+ kernfs_unbreak_active_protection(kn);
+ }
+}
+
struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
{
struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
@@ -2308,8 +2332,7 @@ struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
if (!rdtgrp)
return NULL;
- atomic_inc(&rdtgrp->waitcount);
- kernfs_break_active_protection(kn);
+ rdtgroup_kn_get(rdtgrp, kn);
mutex_lock(&rdtgroup_mutex);
@@ -2328,17 +2351,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
return;
mutex_unlock(&rdtgroup_mutex);
-
- if (atomic_dec_and_test(&rdtgrp->waitcount) &&
- (rdtgrp->flags & RDT_DELETED)) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
- rdtgroup_pseudo_lock_remove(rdtgrp);
- kernfs_unbreak_active_protection(kn);
- rdtgroup_remove(rdtgrp);
- } else {
- kernfs_unbreak_active_protection(kn);
- }
+ rdtgroup_kn_put(rdtgrp, kn);
}
static int mkdir_mondata_all(struct kernfs_node *parent_kn,
@@ -3505,6 +3518,133 @@ out:
return ret;
}
+/**
+ * mongrp_reparent() - replace parent CTRL_MON group of a MON group
+ * @rdtgrp: the MON group whose parent should be replaced
+ * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp
+ * @cpus: cpumask provided by the caller for use during this call
+ *
+ * Replaces the parent CTRL_MON group for a MON group, resulting in all member
+ * tasks' CLOSID immediately changing to that of the new parent group.
+ * Monitoring data for the group is unaffected by this operation.
+ */
+static void mongrp_reparent(struct rdtgroup *rdtgrp,
+ struct rdtgroup *new_prdtgrp,
+ cpumask_var_t cpus)
+{
+ struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
+
+ WARN_ON(rdtgrp->type != RDTMON_GROUP);
+ WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
+
+ /* Nothing to do when simply renaming a MON group. */
+ if (prdtgrp == new_prdtgrp)
+ return;
+
+ WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
+ list_move_tail(&rdtgrp->mon.crdtgrp_list,
+ &new_prdtgrp->mon.crdtgrp_list);
+
+ rdtgrp->mon.parent = new_prdtgrp;
+ rdtgrp->closid = new_prdtgrp->closid;
+
+ /* Propagate updated closid to all tasks in this group. */
+ rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
+
+ update_closid_rmid(cpus, NULL);
+}
+
+static int rdtgroup_rename(struct kernfs_node *kn,
+ struct kernfs_node *new_parent, const char *new_name)
+{
+ struct rdtgroup *new_prdtgrp;
+ struct rdtgroup *rdtgrp;
+ cpumask_var_t tmpmask;
+ int ret;
+
+ rdtgrp = kernfs_to_rdtgroup(kn);
+ new_prdtgrp = kernfs_to_rdtgroup(new_parent);
+ if (!rdtgrp || !new_prdtgrp)
+ return -ENOENT;
+
+ /* Release both kernfs active_refs before obtaining rdtgroup mutex. */
+ rdtgroup_kn_get(rdtgrp, kn);
+ rdtgroup_kn_get(new_prdtgrp, new_parent);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ /*
+ * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
+ * either kernfs_node is a file.
+ */
+ if (kernfs_type(kn) != KERNFS_DIR ||
+ kernfs_type(new_parent) != KERNFS_DIR) {
+ rdt_last_cmd_puts("Source and destination must be directories");
+ ret = -EPERM;
+ goto out;
+ }
+
+ if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (rdtgrp->type != RDTMON_GROUP || !kn->parent ||
+ !is_mon_groups(kn->parent, kn->name)) {
+ rdt_last_cmd_puts("Source must be a MON group\n");
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (!is_mon_groups(new_parent, new_name)) {
+ rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
+ ret = -EPERM;
+ goto out;
+ }
+
+ /*
+ * If the MON group is monitoring CPUs, the CPUs must be assigned to the
+ * current parent CTRL_MON group and therefore cannot be assigned to
+ * the new parent, making the move illegal.
+ */
+ if (!cpumask_empty(&rdtgrp->cpu_mask) &&
+ rdtgrp->mon.parent != new_prdtgrp) {
+ rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
+ ret = -EPERM;
+ goto out;
+ }
+
+ /*
+ * Allocate the cpumask for use in mongrp_reparent() to avoid the
+ * possibility of failing to allocate it after kernfs_rename() has
+ * succeeded.
+ */
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Perform all input validation and allocations needed to ensure
+ * mongrp_reparent() will succeed before calling kernfs_rename(),
+ * otherwise it would be necessary to revert this call if
+ * mongrp_reparent() failed.
+ */
+ ret = kernfs_rename(kn, new_parent, new_name);
+ if (!ret)
+ mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
+
+ free_cpumask_var(tmpmask);
+
+out:
+ mutex_unlock(&rdtgroup_mutex);
+ rdtgroup_kn_put(rdtgrp, kn);
+ rdtgroup_kn_put(new_prdtgrp, new_parent);
+ return ret;
+}
+
static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
{
if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
@@ -3522,6 +3662,7 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
.mkdir = rdtgroup_mkdir,
.rmdir = rdtgroup_rmdir,
+ .rename = rdtgroup_rename,
.show_options = rdtgroup_show_options,
};
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 5e868b62a7c4..0270925fe013 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -79,7 +79,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c)
* initial apic id, which also represents 32-bit extended x2apic id.
*/
c->initial_apicid = edx;
- smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx));
#endif
return 0;
}
@@ -109,7 +109,8 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
*/
cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
c->initial_apicid = edx;
- core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx));
core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
die_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 0bf6779187dd..f18ca44c904b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -195,7 +195,6 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk("%sCall Trace:\n", log_lvl);
unwind_start(&state, task, regs, stack);
- stack = stack ? : get_stack_pointer(task, regs);
regs = unwind_get_entry_regs(&state, &partial);
/*
@@ -214,9 +213,13 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - hardirq stack
* - entry stack
*/
- for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ for (stack = stack ?: get_stack_pointer(task, regs);
+ stack;
+ stack = stack_info.next_sp) {
const char *stack_name;
+ stack = PTR_ALIGN(stack, sizeof(long));
+
if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
/*
* We weren't on a valid stack. It's possible that
diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h
index 9fcfa5c4dad7..af5cbdd9bd29 100644
--- a/arch/x86/kernel/fpu/context.h
+++ b/arch/x86/kernel/fpu/context.h
@@ -57,7 +57,7 @@ static inline void fpregs_restore_userregs(void)
struct fpu *fpu = &current->thread.fpu;
int cpu = smp_processor_id();
- if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_IO_WORKER)))
+ if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER)))
return;
if (!fpregs_state_valid(fpu, cpu)) {
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index caf33486dc5e..1015af1ae562 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -426,7 +426,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)
this_cpu_write(in_kernel_fpu, true);
- if (!(current->flags & (PF_KTHREAD | PF_IO_WORKER)) &&
+ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
!test_thread_flag(TIF_NEED_FPU_LOAD)) {
set_thread_flag(TIF_NEED_FPU_LOAD);
save_fpregs_to_fpstate(&current->thread.fpu);
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 851eb13edc01..998a08f17e33 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -53,7 +53,7 @@ void fpu__init_cpu(void)
fpu__init_cpu_xstate();
}
-static bool fpu__probe_without_cpuid(void)
+static bool __init fpu__probe_without_cpuid(void)
{
unsigned long cr0;
u16 fsw, fcw;
@@ -71,7 +71,7 @@ static bool fpu__probe_without_cpuid(void)
return fsw == 0 && (fcw & 0x103f) == 0x003f;
}
-static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
+static void __init fpu__init_system_early_generic(void)
{
if (!boot_cpu_has(X86_FEATURE_CPUID) &&
!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
@@ -211,10 +211,10 @@ static void __init fpu__init_system_xstate_size_legacy(void)
* Called on the boot CPU once per system bootup, to set up the initial
* FPU state that is later cloned into all processes:
*/
-void __init fpu__init_system(struct cpuinfo_x86 *c)
+void __init fpu__init_system(void)
{
fpstate_reset(&current->thread.fpu);
- fpu__init_system_early_generic(c);
+ fpu__init_system_early_generic();
/*
* The FPU has to be operational for some of the
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 67c8ed99144b..c9318993f959 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -138,20 +138,6 @@ SYM_CODE_START(startup_32)
jmp .Ldefault_entry
SYM_CODE_END(startup_32)
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
- * up already except stack. We just set up stack here. Then call
- * start_secondary().
- */
-SYM_FUNC_START(start_cpu0)
- movl initial_stack, %ecx
- movl %ecx, %esp
- call *(initial_code)
-1: jmp 1b
-SYM_FUNC_END(start_cpu0)
-#endif
-
/*
* Non-boot CPU entry point; entered from trampoline.S
* We can't lgdt here, because lgdt itself uses a data segment, but
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index a5df3e994f04..c5b9289837dc 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -24,7 +24,9 @@
#include "../entry/calling.h"
#include <asm/export.h>
#include <asm/nospec-branch.h>
+#include <asm/apicdef.h>
#include <asm/fixmap.h>
+#include <asm/smp.h>
/*
* We are not able to switch in one step to the final KERNEL ADDRESS SPACE
@@ -77,6 +79,15 @@ SYM_CODE_START_NOALIGN(startup_64)
call startup_64_setup_env
popq %rsi
+ /* Now switch to __KERNEL_CS so IRET works reliably */
+ pushq $__KERNEL_CS
+ leaq .Lon_kernel_cs(%rip), %rax
+ pushq %rax
+ lretq
+
+.Lon_kernel_cs:
+ UNWIND_HINT_END_OF_STACK
+
#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
* Activate SEV/SME memory encryption if supported/enabled. This needs to
@@ -90,15 +101,6 @@ SYM_CODE_START_NOALIGN(startup_64)
popq %rsi
#endif
- /* Now switch to __KERNEL_CS so IRET works reliably */
- pushq $__KERNEL_CS
- leaq .Lon_kernel_cs(%rip), %rax
- pushq %rax
- lretq
-
-.Lon_kernel_cs:
- UNWIND_HINT_END_OF_STACK
-
/* Sanitize CPU configuration */
call verify_cpu
@@ -234,8 +236,67 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
ANNOTATE_NOENDBR // above
#ifdef CONFIG_SMP
+ /*
+ * For parallel boot, the APIC ID is read from the APIC, and then
+ * used to look up the CPU number. For booting a single CPU, the
+ * CPU number is encoded in smpboot_control.
+ *
+ * Bit 31 STARTUP_READ_APICID (Read APICID from APIC)
+ * Bit 0-23 CPU# if STARTUP_xx flags are not set
+ */
movl smpboot_control(%rip), %ecx
+ testl $STARTUP_READ_APICID, %ecx
+ jnz .Lread_apicid
+ /*
+ * No control bit set, single CPU bringup. CPU number is provided
+ * in bit 0-23. This is also the boot CPU case (CPU number 0).
+ */
+ andl $(~STARTUP_PARALLEL_MASK), %ecx
+ jmp .Lsetup_cpu
+.Lread_apicid:
+ /* Check whether X2APIC mode is already enabled */
+ mov $MSR_IA32_APICBASE, %ecx
+ rdmsr
+ testl $X2APIC_ENABLE, %eax
+ jnz .Lread_apicid_msr
+
+ /* Read the APIC ID from the fix-mapped MMIO space. */
+ movq apic_mmio_base(%rip), %rcx
+ addq $APIC_ID, %rcx
+ movl (%rcx), %eax
+ shr $24, %eax
+ jmp .Llookup_AP
+
+.Lread_apicid_msr:
+ mov $APIC_X2APIC_ID_MSR, %ecx
+ rdmsr
+
+.Llookup_AP:
+ /* EAX contains the APIC ID of the current CPU */
+ xorq %rcx, %rcx
+ leaq cpuid_to_apicid(%rip), %rbx
+
+.Lfind_cpunr:
+ cmpl (%rbx,%rcx,4), %eax
+ jz .Lsetup_cpu
+ inc %ecx
+#ifdef CONFIG_FORCE_NR_CPUS
+ cmpl $NR_CPUS, %ecx
+#else
+ cmpl nr_cpu_ids(%rip), %ecx
+#endif
+ jb .Lfind_cpunr
+
+ /* APIC ID not found in the table. Drop the trampoline lock and bail. */
+ movq trampoline_lock(%rip), %rax
+ movl $0, (%rax)
+
+1: cli
+ hlt
+ jmp 1b
+
+.Lsetup_cpu:
/* Get the per cpu offset for the given CPU# which is in ECX */
movq __per_cpu_offset(,%rcx,8), %rdx
#else
@@ -252,6 +313,16 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
movq TASK_threadsp(%rax), %rsp
/*
+ * Now that this CPU is running on its own stack, drop the realmode
+ * protection. For the boot CPU the pointer is NULL!
+ */
+ movq trampoline_lock(%rip), %rax
+ testq %rax, %rax
+ jz .Lsetup_gdt
+ movl $0, (%rax)
+
+.Lsetup_gdt:
+ /*
* We must switch to a new descriptor in kernel space for the GDT
* because soon the kernel won't have access anymore to the userspace
* addresses where we're currently running on. We have to do that here
@@ -375,13 +446,13 @@ SYM_CODE_END(secondary_startup_64)
#include "verify_cpu.S"
#include "sev_verify_cbit.S"
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT)
/*
- * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
- * up already except stack. We just set up stack here. Then call
- * start_secondary() via .Ljump_to_C_code.
+ * Entry point for soft restart of a CPU. Invoked from xxx_play_dead() for
+ * restarting the boot CPU or for restarting SEV guest CPUs after CPU hot
+ * unplug. Everything is set up already except the stack.
*/
-SYM_CODE_START(start_cpu0)
+SYM_CODE_START(soft_restart_cpu)
ANNOTATE_NOENDBR
UNWIND_HINT_END_OF_STACK
@@ -390,7 +461,7 @@ SYM_CODE_START(start_cpu0)
movq TASK_threadsp(%rcx), %rsp
jmp .Ljump_to_C_code
-SYM_CODE_END(start_cpu0)
+SYM_CODE_END(soft_restart_cpu)
#endif
#ifdef CONFIG_AMD_MEM_ENCRYPT
@@ -433,6 +504,8 @@ SYM_DATA(initial_code, .quad x86_64_start_kernel)
#ifdef CONFIG_AMD_MEM_ENCRYPT
SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb)
#endif
+
+SYM_DATA(trampoline_lock, .quad 0);
__FINITDATA
__INIT
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 766ffe3ba313..9f668d2f3d11 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -211,6 +211,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_X86_MCE_THRESHOLD
sum += irq_stats(cpu)->irq_threshold_count;
#endif
+#ifdef CONFIG_X86_HV_CALLBACK_VECTOR
+ sum += irq_stats(cpu)->irq_hv_callback_count;
+#endif
+#if IS_ENABLED(CONFIG_HYPERV)
+ sum += irq_stats(cpu)->irq_hv_reenlightenment_count;
+ sum += irq_stats(cpu)->hyperv_stimer0_count;
+#endif
#ifdef CONFIG_X86_MCE
sum += per_cpu(mce_exception_count, cpu);
sum += per_cpu(mce_poll_count, cpu);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index dac41a0072ea..ff9b80a0e3e3 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -759,15 +759,26 @@ bool xen_set_default_idle(void)
}
#endif
+struct cpumask cpus_stop_mask;
+
void __noreturn stop_this_cpu(void *dummy)
{
+ struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
+ unsigned int cpu = smp_processor_id();
+
local_irq_disable();
+
/*
- * Remove this CPU:
+ * Remove this CPU from the online mask and disable it
+ * unconditionally. This might be redundant in case that the reboot
+ * vector was handled late and stop_other_cpus() sent an NMI.
+ *
+ * According to SDM and APM NMIs can be accepted even after soft
+ * disabling the local APIC.
*/
- set_cpu_online(smp_processor_id(), false);
+ set_cpu_online(cpu, false);
disable_local_APIC();
- mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
+ mcheck_cpu_clear(c);
/*
* Use wbinvd on processors that support SME. This provides support
@@ -781,8 +792,17 @@ void __noreturn stop_this_cpu(void *dummy)
* Test the CPUID bit directly because the machine might've cleared
* X86_FEATURE_SME due to cmdline options.
*/
- if (cpuid_eax(0x8000001f) & BIT(0))
+ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
native_wbinvd();
+
+ /*
+ * This brings a cache line back and dirties it, but
+ * native_stop_other_cpus() will overwrite cpus_stop_mask after it
+ * observed that all CPUs reported stop. This write will invalidate
+ * the related cache line on this CPU.
+ */
+ cpumask_clear_cpu(cpu, &cpus_stop_mask);
+
for (;;) {
/*
* Use native_halt() so that memory contents don't change
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 3a5b0c9c4fcc..2eabccde94fb 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -12,6 +12,9 @@
#ifndef __BOOT_COMPRESSED
#define error(v) pr_err(v)
#define has_cpuflag(f) boot_cpu_has(f)
+#else
+#undef WARN
+#define WARN(condition, format...) (!!(condition))
#endif
/* I/O parameters for CPUID-related helpers */
@@ -991,3 +994,103 @@ static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
cpuid_ext_range_max = fn->eax;
}
}
+
+static void pvalidate_pages(struct snp_psc_desc *desc)
+{
+ struct psc_entry *e;
+ unsigned long vaddr;
+ unsigned int size;
+ unsigned int i;
+ bool validate;
+ int rc;
+
+ for (i = 0; i <= desc->hdr.end_entry; i++) {
+ e = &desc->entries[i];
+
+ vaddr = (unsigned long)pfn_to_kaddr(e->gfn);
+ size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K;
+ validate = e->operation == SNP_PAGE_STATE_PRIVATE;
+
+ rc = pvalidate(vaddr, size, validate);
+ if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) {
+ unsigned long vaddr_end = vaddr + PMD_SIZE;
+
+ for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) {
+ rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
+ if (rc)
+ break;
+ }
+ }
+
+ if (rc) {
+ WARN(1, "Failed to validate address 0x%lx ret %d", vaddr, rc);
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+ }
+ }
+}
+
+static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc)
+{
+ int cur_entry, end_entry, ret = 0;
+ struct snp_psc_desc *data;
+ struct es_em_ctxt ctxt;
+
+ vc_ghcb_invalidate(ghcb);
+
+ /* Copy the input desc into GHCB shared buffer */
+ data = (struct snp_psc_desc *)ghcb->shared_buffer;
+ memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc)));
+
+ /*
+ * As per the GHCB specification, the hypervisor can resume the guest
+ * before processing all the entries. Check whether all the entries
+ * are processed. If not, then keep retrying. Note, the hypervisor
+ * will update the data memory directly to indicate the status, so
+ * reference the data->hdr everywhere.
+ *
+ * The strategy here is to wait for the hypervisor to change the page
+ * state in the RMP table before guest accesses the memory pages. If the
+ * page state change was not successful, then later memory access will
+ * result in a crash.
+ */
+ cur_entry = data->hdr.cur_entry;
+ end_entry = data->hdr.end_entry;
+
+ while (data->hdr.cur_entry <= data->hdr.end_entry) {
+ ghcb_set_sw_scratch(ghcb, (u64)__pa(data));
+
+ /* This will advance the shared buffer data points to. */
+ ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0);
+
+ /*
+ * Page State Change VMGEXIT can pass error code through
+ * exit_info_2.
+ */
+ if (WARN(ret || ghcb->save.sw_exit_info_2,
+ "SNP: PSC failed ret=%d exit_info_2=%llx\n",
+ ret, ghcb->save.sw_exit_info_2)) {
+ ret = 1;
+ goto out;
+ }
+
+ /* Verify that reserved bit is not set */
+ if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Sanity check that entry processing is not going backwards.
+ * This will happen only if hypervisor is tricking us.
+ */
+ if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry,
+"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n",
+ end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) {
+ ret = 1;
+ goto out;
+ }
+ }
+
+out:
+ return ret;
+}
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index b031244d6d2d..a0af3908ed49 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -119,7 +119,19 @@ static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
struct sev_config {
__u64 debug : 1,
- __reserved : 63;
+
+ /*
+ * A flag used by __set_pages_state() that indicates when the
+ * per-CPU GHCB has been created and registered and thus can be
+ * used by the BSP instead of the early boot GHCB.
+ *
+ * For APs, the per-CPU GHCB is created before they are started
+ * and registered upon startup, so this flag can be used globally
+ * for the BSP and APs.
+ */
+ ghcbs_initialized : 1,
+
+ __reserved : 62;
};
static struct sev_config sev_cfg __read_mostly;
@@ -645,32 +657,26 @@ static u64 __init get_jump_table_addr(void)
return ret;
}
-static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate)
-{
- unsigned long vaddr_end;
- int rc;
-
- vaddr = vaddr & PAGE_MASK;
- vaddr_end = vaddr + (npages << PAGE_SHIFT);
-
- while (vaddr < vaddr_end) {
- rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
- if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc))
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
-
- vaddr = vaddr + PAGE_SIZE;
- }
-}
-
-static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op)
+static void early_set_pages_state(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages, enum psc_op op)
{
unsigned long paddr_end;
u64 val;
+ int ret;
+
+ vaddr = vaddr & PAGE_MASK;
paddr = paddr & PAGE_MASK;
paddr_end = paddr + (npages << PAGE_SHIFT);
while (paddr < paddr_end) {
+ if (op == SNP_PAGE_STATE_SHARED) {
+ /* Page validation must be rescinded before changing to shared */
+ ret = pvalidate(vaddr, RMP_PG_SIZE_4K, false);
+ if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
+ goto e_term;
+ }
+
/*
* Use the MSR protocol because this function can be called before
* the GHCB is established.
@@ -691,7 +697,15 @@ static void __init early_set_pages_state(unsigned long paddr, unsigned int npage
paddr, GHCB_MSR_PSC_RESP_VAL(val)))
goto e_term;
- paddr = paddr + PAGE_SIZE;
+ if (op == SNP_PAGE_STATE_PRIVATE) {
+ /* Page validation must be performed after changing to private */
+ ret = pvalidate(vaddr, RMP_PG_SIZE_4K, true);
+ if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
+ goto e_term;
+ }
+
+ vaddr += PAGE_SIZE;
+ paddr += PAGE_SIZE;
}
return;
@@ -701,7 +715,7 @@ e_term:
}
void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
- unsigned int npages)
+ unsigned long npages)
{
/*
* This can be invoked in early boot while running identity mapped, so
@@ -716,14 +730,11 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd
* Ask the hypervisor to mark the memory pages as private in the RMP
* table.
*/
- early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE);
-
- /* Validate the memory pages after they've been added in the RMP table. */
- pvalidate_pages(vaddr, npages, true);
+ early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
}
void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
- unsigned int npages)
+ unsigned long npages)
{
/*
* This can be invoked in early boot while running identity mapped, so
@@ -734,11 +745,8 @@ void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr
if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
return;
- /* Invalidate the memory pages before they are marked shared in the RMP table. */
- pvalidate_pages(vaddr, npages, false);
-
/* Ask hypervisor to mark the memory pages shared in the RMP table. */
- early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED);
+ early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED);
}
void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op)
@@ -756,96 +764,16 @@ void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op
WARN(1, "invalid memory op %d\n", op);
}
-static int vmgexit_psc(struct snp_psc_desc *desc)
+static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
+ unsigned long vaddr_end, int op)
{
- int cur_entry, end_entry, ret = 0;
- struct snp_psc_desc *data;
struct ghcb_state state;
- struct es_em_ctxt ctxt;
- unsigned long flags;
- struct ghcb *ghcb;
-
- /*
- * __sev_get_ghcb() needs to run with IRQs disabled because it is using
- * a per-CPU GHCB.
- */
- local_irq_save(flags);
-
- ghcb = __sev_get_ghcb(&state);
- if (!ghcb) {
- ret = 1;
- goto out_unlock;
- }
-
- /* Copy the input desc into GHCB shared buffer */
- data = (struct snp_psc_desc *)ghcb->shared_buffer;
- memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc)));
-
- /*
- * As per the GHCB specification, the hypervisor can resume the guest
- * before processing all the entries. Check whether all the entries
- * are processed. If not, then keep retrying. Note, the hypervisor
- * will update the data memory directly to indicate the status, so
- * reference the data->hdr everywhere.
- *
- * The strategy here is to wait for the hypervisor to change the page
- * state in the RMP table before guest accesses the memory pages. If the
- * page state change was not successful, then later memory access will
- * result in a crash.
- */
- cur_entry = data->hdr.cur_entry;
- end_entry = data->hdr.end_entry;
-
- while (data->hdr.cur_entry <= data->hdr.end_entry) {
- ghcb_set_sw_scratch(ghcb, (u64)__pa(data));
-
- /* This will advance the shared buffer data points to. */
- ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0);
-
- /*
- * Page State Change VMGEXIT can pass error code through
- * exit_info_2.
- */
- if (WARN(ret || ghcb->save.sw_exit_info_2,
- "SNP: PSC failed ret=%d exit_info_2=%llx\n",
- ret, ghcb->save.sw_exit_info_2)) {
- ret = 1;
- goto out;
- }
-
- /* Verify that reserved bit is not set */
- if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) {
- ret = 1;
- goto out;
- }
-
- /*
- * Sanity check that entry processing is not going backwards.
- * This will happen only if hypervisor is tricking us.
- */
- if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry,
-"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n",
- end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) {
- ret = 1;
- goto out;
- }
- }
-
-out:
- __sev_put_ghcb(&state);
-
-out_unlock:
- local_irq_restore(flags);
-
- return ret;
-}
-
-static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
- unsigned long vaddr_end, int op)
-{
+ bool use_large_entry;
struct psc_hdr *hdr;
struct psc_entry *e;
+ unsigned long flags;
unsigned long pfn;
+ struct ghcb *ghcb;
int i;
hdr = &data->hdr;
@@ -854,74 +782,104 @@ static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
memset(data, 0, sizeof(*data));
i = 0;
- while (vaddr < vaddr_end) {
- if (is_vmalloc_addr((void *)vaddr))
+ while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) {
+ hdr->end_entry = i;
+
+ if (is_vmalloc_addr((void *)vaddr)) {
pfn = vmalloc_to_pfn((void *)vaddr);
- else
+ use_large_entry = false;
+ } else {
pfn = __pa(vaddr) >> PAGE_SHIFT;
+ use_large_entry = true;
+ }
e->gfn = pfn;
e->operation = op;
- hdr->end_entry = i;
- /*
- * Current SNP implementation doesn't keep track of the RMP page
- * size so use 4K for simplicity.
- */
- e->pagesize = RMP_PG_SIZE_4K;
+ if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) &&
+ (vaddr_end - vaddr) >= PMD_SIZE) {
+ e->pagesize = RMP_PG_SIZE_2M;
+ vaddr += PMD_SIZE;
+ } else {
+ e->pagesize = RMP_PG_SIZE_4K;
+ vaddr += PAGE_SIZE;
+ }
- vaddr = vaddr + PAGE_SIZE;
e++;
i++;
}
- if (vmgexit_psc(data))
+ /* Page validation must be rescinded before changing to shared */
+ if (op == SNP_PAGE_STATE_SHARED)
+ pvalidate_pages(data);
+
+ local_irq_save(flags);
+
+ if (sev_cfg.ghcbs_initialized)
+ ghcb = __sev_get_ghcb(&state);
+ else
+ ghcb = boot_ghcb;
+
+ /* Invoke the hypervisor to perform the page state changes */
+ if (!ghcb || vmgexit_psc(ghcb, data))
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
+
+ if (sev_cfg.ghcbs_initialized)
+ __sev_put_ghcb(&state);
+
+ local_irq_restore(flags);
+
+ /* Page validation must be performed after changing to private */
+ if (op == SNP_PAGE_STATE_PRIVATE)
+ pvalidate_pages(data);
+
+ return vaddr;
}
-static void set_pages_state(unsigned long vaddr, unsigned int npages, int op)
+static void set_pages_state(unsigned long vaddr, unsigned long npages, int op)
{
- unsigned long vaddr_end, next_vaddr;
- struct snp_psc_desc *desc;
+ struct snp_psc_desc desc;
+ unsigned long vaddr_end;
- desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT);
- if (!desc)
- panic("SNP: failed to allocate memory for PSC descriptor\n");
+ /* Use the MSR protocol when a GHCB is not available. */
+ if (!boot_ghcb)
+ return early_set_pages_state(vaddr, __pa(vaddr), npages, op);
vaddr = vaddr & PAGE_MASK;
vaddr_end = vaddr + (npages << PAGE_SHIFT);
- while (vaddr < vaddr_end) {
- /* Calculate the last vaddr that fits in one struct snp_psc_desc. */
- next_vaddr = min_t(unsigned long, vaddr_end,
- (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr);
-
- __set_pages_state(desc, vaddr, next_vaddr, op);
-
- vaddr = next_vaddr;
- }
-
- kfree(desc);
+ while (vaddr < vaddr_end)
+ vaddr = __set_pages_state(&desc, vaddr, vaddr_end, op);
}
-void snp_set_memory_shared(unsigned long vaddr, unsigned int npages)
+void snp_set_memory_shared(unsigned long vaddr, unsigned long npages)
{
if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
return;
- pvalidate_pages(vaddr, npages, false);
-
set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED);
}
-void snp_set_memory_private(unsigned long vaddr, unsigned int npages)
+void snp_set_memory_private(unsigned long vaddr, unsigned long npages)
{
if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
return;
set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
+}
+
+void snp_accept_memory(phys_addr_t start, phys_addr_t end)
+{
+ unsigned long vaddr;
+ unsigned int npages;
- pvalidate_pages(vaddr, npages, true);
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ vaddr = (unsigned long)__va(start);
+ npages = (end - start) >> PAGE_SHIFT;
+
+ set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
}
static int snp_set_vmsa(void *va, bool vmsa)
@@ -1267,6 +1225,8 @@ void setup_ghcb(void)
if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
snp_register_per_cpu_ghcb();
+ sev_cfg.ghcbs_initialized = true;
+
return;
}
@@ -1328,7 +1288,7 @@ static void sev_es_play_dead(void)
* If we get here, the VCPU was woken up again. Jump to CPU
* startup code to get it back online.
*/
- start_cpu0();
+ soft_restart_cpu();
}
#else /* CONFIG_HOTPLUG_CPU */
#define sev_es_play_dead native_play_dead
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 004cb30b7419..cfeec3ee877e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -182,7 +182,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
static unsigned long __ro_after_init max_frame_size;
static unsigned int __ro_after_init fpu_default_state_size;
-void __init init_sigframe_size(void)
+static int __init init_sigframe_size(void)
{
fpu_default_state_size = fpu__get_fpstate_size();
@@ -194,7 +194,9 @@ void __init init_sigframe_size(void)
max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT);
pr_info("max sigframe size: %lu\n", max_frame_size);
+ return 0;
}
+early_initcall(init_sigframe_size);
unsigned long get_sigframe_size(void)
{
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 375b33ecafa2..7eb18ca7bd45 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -21,12 +21,14 @@
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/gfp.h>
+#include <linux/kexec.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/apic.h>
+#include <asm/cpu.h>
#include <asm/idtentry.h>
#include <asm/nmi.h>
#include <asm/mce.h>
@@ -129,7 +131,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
}
/*
- * this function calls the 'stop' function on all other CPUs in the system.
+ * Disable virtualization, APIC etc. and park the CPU in a HLT loop
*/
DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
{
@@ -146,61 +148,96 @@ static int register_stop_handler(void)
static void native_stop_other_cpus(int wait)
{
- unsigned long flags;
- unsigned long timeout;
+ unsigned int cpu = smp_processor_id();
+ unsigned long flags, timeout;
if (reboot_force)
return;
- /*
- * Use an own vector here because smp_call_function
- * does lots of things not suitable in a panic situation.
- */
+ /* Only proceed if this is the first CPU to reach this code */
+ if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
+ return;
+
+ /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
+ if (kexec_in_progress)
+ smp_kick_mwait_play_dead();
/*
- * We start by using the REBOOT_VECTOR irq.
- * The irq is treated as a sync point to allow critical
- * regions of code on other cpus to release their spin locks
- * and re-enable irqs. Jumping straight to an NMI might
- * accidentally cause deadlocks with further shutdown/panic
- * code. By syncing, we give the cpus up to one second to
- * finish their work before we force them off with the NMI.
+ * 1) Send an IPI on the reboot vector to all other CPUs.
+ *
+ * The other CPUs should react on it after leaving critical
+ * sections and re-enabling interrupts. They might still hold
+ * locks, but there is nothing which can be done about that.
+ *
+ * 2) Wait for all other CPUs to report that they reached the
+ * HLT loop in stop_this_cpu()
+ *
+ * 3) If the system uses INIT/STARTUP for CPU bringup, then
+ * send all present CPUs an INIT vector, which brings them
+ * completely out of the way.
+ *
+ * 4) If #3 is not possible and #2 timed out send an NMI to the
+ * CPUs which did not yet report
+ *
+ * 5) Wait for all other CPUs to report that they reached the
+ * HLT loop in stop_this_cpu()
+ *
+ * #4 can obviously race against a CPU reaching the HLT loop late.
+ * That CPU will have reported already and the "have all CPUs
+ * reached HLT" condition will be true despite the fact that the
+ * other CPU is still handling the NMI. Again, there is no
+ * protection against that as "disabled" APICs still respond to
+ * NMIs.
*/
- if (num_online_cpus() > 1) {
- /* did someone beat us here? */
- if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
- return;
-
- /* sync above data before sending IRQ */
- wmb();
+ cpumask_copy(&cpus_stop_mask, cpu_online_mask);
+ cpumask_clear_cpu(cpu, &cpus_stop_mask);
+ if (!cpumask_empty(&cpus_stop_mask)) {
apic_send_IPI_allbutself(REBOOT_VECTOR);
/*
* Don't wait longer than a second for IPI completion. The
* wait request is not checked here because that would
- * prevent an NMI shutdown attempt in case that not all
+ * prevent an NMI/INIT shutdown in case that not all
* CPUs reach shutdown state.
*/
timeout = USEC_PER_SEC;
- while (num_online_cpus() > 1 && timeout--)
+ while (!cpumask_empty(&cpus_stop_mask) && timeout--)
udelay(1);
}
- /* if the REBOOT_VECTOR didn't work, try with the NMI */
- if (num_online_cpus() > 1) {
+ /*
+ * Park all other CPUs in INIT including "offline" CPUs, if
+ * possible. That's a safe place where they can't resume execution
+ * of HLT and then execute the HLT loop from overwritten text or
+ * page tables.
+ *
+ * The only downside is a broadcast MCE, but up to the point where
+ * the kexec() kernel brought all APs online again an MCE will just
+ * make HLT resume and handle the MCE. The machine crashes and burns
+ * due to overwritten text, page tables and data. So there is a
+ * choice between fire and frying pan. The result is pretty much
+ * the same. Chose frying pan until x86 provides a sane mechanism
+ * to park a CPU.
+ */
+ if (smp_park_other_cpus_in_init())
+ goto done;
+
+ /*
+ * If park with INIT was not possible and the REBOOT_VECTOR didn't
+ * take all secondary CPUs offline, try with the NMI.
+ */
+ if (!cpumask_empty(&cpus_stop_mask)) {
/*
* If NMI IPI is enabled, try to register the stop handler
* and send the IPI. In any case try to wait for the other
* CPUs to stop.
*/
if (!smp_no_nmi_ipi && !register_stop_handler()) {
- /* Sync above data before sending IRQ */
- wmb();
-
pr_emerg("Shutting down cpus with NMI\n");
- apic_send_IPI_allbutself(NMI_VECTOR);
+ for_each_cpu(cpu, &cpus_stop_mask)
+ apic->send_IPI(cpu, NMI_VECTOR);
}
/*
* Don't wait longer than 10 ms if the caller didn't
@@ -208,14 +245,21 @@ static void native_stop_other_cpus(int wait)
* one or more CPUs do not reach shutdown state.
*/
timeout = USEC_PER_MSEC * 10;
- while (num_online_cpus() > 1 && (wait || timeout--))
+ while (!cpumask_empty(&cpus_stop_mask) && (wait || timeout--))
udelay(1);
}
+done:
local_irq_save(flags);
disable_local_APIC();
mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
local_irq_restore(flags);
+
+ /*
+ * Ensure that the cpus_stop_mask cache lines are invalidated on
+ * the other CPUs. See comment vs. SME in stop_this_cpu().
+ */
+ cpumask_clear(&cpus_stop_mask);
}
/*
@@ -268,8 +312,7 @@ struct smp_ops smp_ops = {
#endif
.smp_send_reschedule = native_smp_send_reschedule,
- .cpu_up = native_cpu_up,
- .cpu_die = native_cpu_die,
+ .kick_ap_alive = native_kick_ap,
.cpu_disable = native_cpu_disable,
.play_dead = native_play_dead,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 352f0ce1ece4..8779a7ed3e87 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -53,10 +53,13 @@
#include <linux/tboot.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
+#include <linux/kexec.h>
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
#include <linux/stackprotector.h>
+#include <linux/cpuhotplug.h>
+#include <linux/mc146818rtc.h>
#include <asm/acpi.h>
#include <asm/cacheinfo.h>
@@ -74,7 +77,7 @@
#include <asm/fpu/api.h>
#include <asm/setup.h>
#include <asm/uv/uv.h>
-#include <linux/mc146818rtc.h>
+#include <asm/microcode.h>
#include <asm/i8259.h>
#include <asm/misc.h>
#include <asm/qspinlock.h>
@@ -101,6 +104,26 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map);
DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
+/* CPUs which are the primary SMT threads */
+struct cpumask __cpu_primary_thread_mask __read_mostly;
+
+/* Representing CPUs for which sibling maps can be computed */
+static cpumask_var_t cpu_sibling_setup_mask;
+
+struct mwait_cpu_dead {
+ unsigned int control;
+ unsigned int status;
+};
+
+#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF
+#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD
+
+/*
+ * Cache line aligned data for mwait_play_dead(). Separate on purpose so
+ * that it's unlikely to be touched by other CPUs.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
+
/* Logical package management. We might want to allocate that dynamically */
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
@@ -121,7 +144,6 @@ int arch_update_cpu_topology(void)
return retval;
}
-
static unsigned int smpboot_warm_reset_vector_count;
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
@@ -154,66 +176,63 @@ static inline void smpboot_restore_warm_reset_vector(void)
}
-/*
- * Report back to the Boot Processor during boot time or to the caller processor
- * during CPU online.
- */
-static void smp_callin(void)
+/* Run the next set of setup steps for the upcoming CPU */
+static void ap_starting(void)
{
- int cpuid;
+ int cpuid = smp_processor_id();
- /*
- * If waken up by an INIT in an 82489DX configuration
- * cpu_callout_mask guarantees we don't get here before
- * an INIT_deassert IPI reaches our local APIC, so it is
- * now safe to touch our local APIC.
- */
- cpuid = smp_processor_id();
+ /* Mop up eventual mwait_play_dead() wreckage */
+ this_cpu_write(mwait_cpu_dead.status, 0);
+ this_cpu_write(mwait_cpu_dead.control, 0);
/*
- * the boot CPU has finished the init stage and is spinning
- * on callin_map until we finish. We are free to set up this
- * CPU, first the APIC. (this is probably redundant on most
- * boards)
+ * If woken up by an INIT in an 82489DX configuration the alive
+ * synchronization guarantees that the CPU does not reach this
+ * point before an INIT_deassert IPI reaches the local APIC, so it
+ * is now safe to touch the local APIC.
+ *
+ * Set up this CPU, first the APIC, which is probably redundant on
+ * most boards.
*/
apic_ap_setup();
- /*
- * Save our processor parameters. Note: this information
- * is needed for clock calibration.
- */
+ /* Save the processor parameters. */
smp_store_cpu_info(cpuid);
/*
* The topology information must be up to date before
- * calibrate_delay() and notify_cpu_starting().
+ * notify_cpu_starting().
*/
- set_cpu_sibling_map(raw_smp_processor_id());
+ set_cpu_sibling_map(cpuid);
ap_init_aperfmperf();
- /*
- * Get our bogomips.
- * Update loops_per_jiffy in cpu_data. Previous call to
- * smp_store_cpu_info() stored a value that is close but not as
- * accurate as the value just calculated.
- */
- calibrate_delay();
- cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
pr_debug("Stack at about %p\n", &cpuid);
wmb();
+ /*
+ * This runs the AP through all the cpuhp states to its target
+ * state CPUHP_ONLINE.
+ */
notify_cpu_starting(cpuid);
+}
+static void ap_calibrate_delay(void)
+{
/*
- * Allow the master to continue.
+ * Calibrate the delay loop and update loops_per_jiffy in cpu_data.
+ * smp_store_cpu_info() stored a value that is close but not as
+ * accurate as the value just calculated.
+ *
+ * As this is invoked after the TSC synchronization check,
+ * calibrate_delay_is_known() will skip the calibration routine
+ * when TSC is synchronized across sockets.
*/
- cpumask_set_cpu(cpuid, cpu_callin_mask);
+ calibrate_delay();
+ cpu_data(smp_processor_id()).loops_per_jiffy = loops_per_jiffy;
}
-static int cpu0_logical_apicid;
-static int enable_start_cpu0;
/*
* Activate a secondary processor.
*/
@@ -226,24 +245,63 @@ static void notrace start_secondary(void *unused)
*/
cr4_init();
-#ifdef CONFIG_X86_32
- /* switch away from the initial page table */
- load_cr3(swapper_pg_dir);
- __flush_tlb_all();
-#endif
- cpu_init_secondary();
+ /*
+ * 32-bit specific. 64-bit reaches this code with the correct page
+ * table established. Yet another historical divergence.
+ */
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ /* switch away from the initial page table */
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
+ }
+
+ cpu_init_exception_handling();
+
+ /*
+ * 32-bit systems load the microcode from the ASM startup code for
+ * historical reasons.
+ *
+ * On 64-bit systems load it before reaching the AP alive
+ * synchronization point below so it is not part of the full per
+ * CPU serialized bringup part when "parallel" bringup is enabled.
+ *
+ * That's even safe when hyperthreading is enabled in the CPU as
+ * the core code starts the primary threads first and leaves the
+ * secondary threads waiting for SIPI. Loading microcode on
+ * physical cores concurrently is a safe operation.
+ *
+ * This covers both the Intel specific issue that concurrent
+ * microcode loading on SMT siblings must be prohibited and the
+ * vendor independent issue`that microcode loading which changes
+ * CPUID, MSRs etc. must be strictly serialized to maintain
+ * software state correctness.
+ */
+ if (IS_ENABLED(CONFIG_X86_64))
+ load_ucode_ap();
+
+ /*
+ * Synchronization point with the hotplug core. Sets this CPUs
+ * synchronization state to ALIVE and spin-waits for the control CPU to
+ * release this CPU for further bringup.
+ */
+ cpuhp_ap_sync_alive();
+
+ cpu_init();
+ fpu__init_cpu();
rcu_cpu_starting(raw_smp_processor_id());
x86_cpuinit.early_percpu_clock_init();
- smp_callin();
- enable_start_cpu0 = 0;
+ ap_starting();
+
+ /* Check TSC synchronization with the control CPU. */
+ check_tsc_sync_target();
- /* otherwise gcc will move up smp_processor_id before the cpu_init */
- barrier();
/*
- * Check TSC synchronization with the boot CPU:
+ * Calibrate the delay loop after the TSC synchronization check.
+ * This allows to skip the calibration when TSC is synchronized
+ * across sockets.
*/
- check_tsc_sync_target();
+ ap_calibrate_delay();
speculative_store_bypass_ht_init();
@@ -257,7 +315,6 @@ static void notrace start_secondary(void *unused)
set_cpu_online(smp_processor_id(), true);
lapic_online();
unlock_vector_lock();
- cpu_set_state_online(smp_processor_id());
x86_platform.nmi_init();
/* enable local interrupts */
@@ -270,15 +327,6 @@ static void notrace start_secondary(void *unused)
}
/**
- * topology_is_primary_thread - Check whether CPU is the primary SMT thread
- * @cpu: CPU to check
- */
-bool topology_is_primary_thread(unsigned int cpu)
-{
- return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu));
-}
-
-/**
* topology_smt_supported - Check whether SMT is supported by the CPUs
*/
bool topology_smt_supported(void)
@@ -288,6 +336,7 @@ bool topology_smt_supported(void)
/**
* topology_phys_to_logical_pkg - Map a physical package id to a logical
+ * @phys_pkg: The physical package id to map
*
* Returns logical package id or -1 if not found
*/
@@ -304,15 +353,17 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg)
return -1;
}
EXPORT_SYMBOL(topology_phys_to_logical_pkg);
+
/**
* topology_phys_to_logical_die - Map a physical die id to logical
+ * @die_id: The physical die id to map
+ * @cur_cpu: The CPU for which the mapping is done
*
* Returns logical die id or -1 if not found
*/
-int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu)
+static int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu)
{
- int cpu;
- int proc_id = cpu_data(cur_cpu).phys_proc_id;
+ int cpu, proc_id = cpu_data(cur_cpu).phys_proc_id;
for_each_possible_cpu(cpu) {
struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -323,7 +374,6 @@ int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu)
}
return -1;
}
-EXPORT_SYMBOL(topology_phys_to_logical_die);
/**
* topology_update_package_map - Update the physical to logical package map
@@ -398,7 +448,7 @@ void smp_store_cpu_info(int id)
c->cpu_index = id;
/*
* During boot time, CPU0 has this setup already. Save the info when
- * bringing up AP or offlined CPU0.
+ * bringing up an AP.
*/
identify_secondary_cpu(c);
c->initialized = true;
@@ -706,9 +756,9 @@ static void impress_friends(void)
* Allow the user to impress friends.
*/
pr_debug("Before bogomips\n");
- for_each_possible_cpu(cpu)
- if (cpumask_test_cpu(cpu, cpu_callout_mask))
- bogosum += cpu_data(cpu).loops_per_jiffy;
+ for_each_online_cpu(cpu)
+ bogosum += cpu_data(cpu).loops_per_jiffy;
+
pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
num_online_cpus(),
bogosum/(500000/HZ),
@@ -795,86 +845,42 @@ static void __init smp_quirk_init_udelay(void)
}
/*
- * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
- * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
- * won't ... remember to clear down the APIC, etc later.
+ * Wake up AP by INIT, INIT, STARTUP sequence.
*/
-int
-wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
+static void send_init_sequence(int phys_apicid)
{
- u32 dm = apic->dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
- unsigned long send_status, accept_status = 0;
- int maxlvt;
+ int maxlvt = lapic_get_maxlvt();
- /* Target chip */
- /* Boot on the stack */
- /* Kick the second */
- apic_icr_write(APIC_DM_NMI | dm, apicid);
-
- pr_debug("Waiting for send to finish...\n");
- send_status = safe_apic_wait_icr_idle();
-
- /*
- * Give the other CPU some time to accept the IPI.
- */
- udelay(200);
+ /* Be paranoid about clearing APIC errors. */
if (APIC_INTEGRATED(boot_cpu_apic_version)) {
- maxlvt = lapic_get_maxlvt();
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ /* Due to the Pentium erratum 3AP. */
+ if (maxlvt > 3)
apic_write(APIC_ESR, 0);
- accept_status = (apic_read(APIC_ESR) & 0xEF);
+ apic_read(APIC_ESR);
}
- pr_debug("NMI sent\n");
- if (send_status)
- pr_err("APIC never delivered???\n");
- if (accept_status)
- pr_err("APIC delivery error (%lx)\n", accept_status);
+ /* Assert INIT on the target CPU */
+ apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid);
+ safe_apic_wait_icr_idle();
- return (send_status | accept_status);
+ udelay(init_udelay);
+
+ /* Deassert INIT on the target CPU */
+ apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
+ safe_apic_wait_icr_idle();
}
-static int
-wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
+/*
+ * Wake up AP by INIT, INIT, STARTUP sequence.
+ */
+static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
{
unsigned long send_status = 0, accept_status = 0;
- int maxlvt, num_starts, j;
+ int num_starts, j, maxlvt;
+ preempt_disable();
maxlvt = lapic_get_maxlvt();
-
- /*
- * Be paranoid about clearing APIC errors.
- */
- if (APIC_INTEGRATED(boot_cpu_apic_version)) {
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- }
-
- pr_debug("Asserting INIT\n");
-
- /*
- * Turn INIT on target chip
- */
- /*
- * Send IPI
- */
- apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
- phys_apicid);
-
- pr_debug("Waiting for send to finish...\n");
- send_status = safe_apic_wait_icr_idle();
-
- udelay(init_udelay);
-
- pr_debug("Deasserting INIT\n");
-
- /* Target chip */
- /* Send IPI */
- apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
-
- pr_debug("Waiting for send to finish...\n");
- send_status = safe_apic_wait_icr_idle();
+ send_init_sequence(phys_apicid);
mb();
@@ -945,15 +951,16 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
if (accept_status)
pr_err("APIC delivery error (%lx)\n", accept_status);
+ preempt_enable();
return (send_status | accept_status);
}
/* reduce the number of lines printed when booting a large cpu count system */
static void announce_cpu(int cpu, int apicid)
{
+ static int width, node_width, first = 1;
static int current_node = NUMA_NO_NODE;
int node = early_cpu_to_node(cpu);
- static int width, node_width;
if (!width)
width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */
@@ -961,10 +968,10 @@ static void announce_cpu(int cpu, int apicid)
if (!node_width)
node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */
- if (cpu == 1)
- printk(KERN_INFO "x86: Booting SMP configuration:\n");
-
if (system_state < SYSTEM_RUNNING) {
+ if (first)
+ pr_info("x86: Booting SMP configuration:\n");
+
if (node != current_node) {
if (current_node > (-1))
pr_cont("\n");
@@ -975,77 +982,16 @@ static void announce_cpu(int cpu, int apicid)
}
/* Add padding for the BSP */
- if (cpu == 1)
+ if (first)
pr_cont("%*s", width + 1, " ");
+ first = 0;
pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
-
} else
pr_info("Booting Node %d Processor %d APIC 0x%x\n",
node, cpu, apicid);
}
-static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs)
-{
- int cpu;
-
- cpu = smp_processor_id();
- if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0)
- return NMI_HANDLED;
-
- return NMI_DONE;
-}
-
-/*
- * Wake up AP by INIT, INIT, STARTUP sequence.
- *
- * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS
- * boot-strap code which is not a desired behavior for waking up BSP. To
- * void the boot-strap code, wake up CPU0 by NMI instead.
- *
- * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined
- * (i.e. physically hot removed and then hot added), NMI won't wake it up.
- * We'll change this code in the future to wake up hard offlined CPU0 if
- * real platform and request are available.
- */
-static int
-wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
- int *cpu0_nmi_registered)
-{
- int id;
- int boot_error;
-
- preempt_disable();
-
- /*
- * Wake up AP by INIT, INIT, STARTUP sequence.
- */
- if (cpu) {
- boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
- goto out;
- }
-
- /*
- * Wake up BSP by nmi.
- *
- * Register a NMI handler to help wake up CPU0.
- */
- boot_error = register_nmi_handler(NMI_LOCAL,
- wakeup_cpu0_nmi, 0, "wake_cpu0");
-
- if (!boot_error) {
- enable_start_cpu0 = 1;
- *cpu0_nmi_registered = 1;
- id = apic->dest_mode_logical ? cpu0_logical_apicid : apicid;
- boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
- }
-
-out:
- preempt_enable();
-
- return boot_error;
-}
-
int common_cpu_up(unsigned int cpu, struct task_struct *idle)
{
int ret;
@@ -1071,17 +1017,13 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
* (ie clustered apic addressing mode), this is a LOGICAL apic ID.
- * Returns zero if CPU booted OK, else error code from
+ * Returns zero if startup was successfully sent, else error code from
* ->wakeup_secondary_cpu.
*/
-static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
- int *cpu0_nmi_registered)
+static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
{
- /* start_ip had better be page-aligned! */
unsigned long start_ip = real_mode_header->trampoline_start;
-
- unsigned long boot_error = 0;
- unsigned long timeout;
+ int ret;
#ifdef CONFIG_X86_64
/* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
@@ -1094,7 +1036,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
if (IS_ENABLED(CONFIG_X86_32)) {
early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
initial_stack = idle->thread.sp;
- } else {
+ } else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) {
smpboot_control = cpu;
}
@@ -1108,7 +1050,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
* This grunge runs the startup process for
* the targeted processor.
*/
-
if (x86_platform.legacy.warm_reset) {
pr_debug("Setting warm reset code and vector.\n");
@@ -1123,13 +1064,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
}
}
- /*
- * AP might wait on cpu_callout_mask in cpu_init() with
- * cpu_initialized_mask set if previous attempt to online
- * it timed-out. Clear cpu_initialized_mask so that after
- * INIT/SIPI it could start with a clean state.
- */
- cpumask_clear_cpu(cpu, cpu_initialized_mask);
smp_mb();
/*
@@ -1137,66 +1071,25 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
* - Use a method from the APIC driver if one defined, with wakeup
* straight to 64-bit mode preferred over wakeup to RM.
* Otherwise,
- * - Use an INIT boot APIC message for APs or NMI for BSP.
+ * - Use an INIT boot APIC message
*/
if (apic->wakeup_secondary_cpu_64)
- boot_error = apic->wakeup_secondary_cpu_64(apicid, start_ip);
+ ret = apic->wakeup_secondary_cpu_64(apicid, start_ip);
else if (apic->wakeup_secondary_cpu)
- boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
+ ret = apic->wakeup_secondary_cpu(apicid, start_ip);
else
- boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
- cpu0_nmi_registered);
-
- if (!boot_error) {
- /*
- * Wait 10s total for first sign of life from AP
- */
- boot_error = -1;
- timeout = jiffies + 10*HZ;
- while (time_before(jiffies, timeout)) {
- if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
- /*
- * Tell AP to proceed with initialization
- */
- cpumask_set_cpu(cpu, cpu_callout_mask);
- boot_error = 0;
- break;
- }
- schedule();
- }
- }
-
- if (!boot_error) {
- /*
- * Wait till AP completes initial initialization
- */
- while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
- /*
- * Allow other tasks to run while we wait for the
- * AP to come online. This also gives a chance
- * for the MTRR work(triggered by the AP coming online)
- * to be completed in the stop machine context.
- */
- schedule();
- }
- }
-
- if (x86_platform.legacy.warm_reset) {
- /*
- * Cleanup possible dangling ends...
- */
- smpboot_restore_warm_reset_vector();
- }
+ ret = wakeup_secondary_cpu_via_init(apicid, start_ip);
- return boot_error;
+ /* If the wakeup mechanism failed, cleanup the warm reset vector */
+ if (ret)
+ arch_cpuhp_cleanup_kick_cpu(cpu);
+ return ret;
}
-int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
+int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
{
int apicid = apic->cpu_present_to_apicid(cpu);
- int cpu0_nmi_registered = 0;
- unsigned long flags;
- int err, ret = 0;
+ int err;
lockdep_assert_irqs_enabled();
@@ -1210,24 +1103,11 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
}
/*
- * Already booted CPU?
- */
- if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
- pr_debug("do_boot_cpu %d Already started\n", cpu);
- return -ENOSYS;
- }
-
- /*
* Save current MTRR state in case it was changed since early boot
* (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
*/
mtrr_save_state();
- /* x86 CPUs take themselves offline, so delayed offline is OK. */
- err = cpu_check_up_prepare(cpu);
- if (err && err != -EBUSY)
- return err;
-
/* the FPU context is blank, nobody can own it */
per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
@@ -1235,41 +1115,44 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
if (err)
return err;
- err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
- if (err) {
+ err = do_boot_cpu(apicid, cpu, tidle);
+ if (err)
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
- ret = -EIO;
- goto unreg_nmi;
- }
- /*
- * Check TSC synchronization with the AP (keep irqs disabled
- * while doing so):
- */
- local_irq_save(flags);
- check_tsc_sync_source(cpu);
- local_irq_restore(flags);
+ return err;
+}
- while (!cpu_online(cpu)) {
- cpu_relax();
- touch_nmi_watchdog();
- }
+int arch_cpuhp_kick_ap_alive(unsigned int cpu, struct task_struct *tidle)
+{
+ return smp_ops.kick_ap_alive(cpu, tidle);
+}
-unreg_nmi:
- /*
- * Clean up the nmi handler. Do this after the callin and callout sync
- * to avoid impact of possible long unregister time.
- */
- if (cpu0_nmi_registered)
- unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
+void arch_cpuhp_cleanup_kick_cpu(unsigned int cpu)
+{
+ /* Cleanup possible dangling ends... */
+ if (smp_ops.kick_ap_alive == native_kick_ap && x86_platform.legacy.warm_reset)
+ smpboot_restore_warm_reset_vector();
+}
- return ret;
+void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu)
+{
+ if (smp_ops.cleanup_dead_cpu)
+ smp_ops.cleanup_dead_cpu(cpu);
+
+ if (system_state == SYSTEM_RUNNING)
+ pr_info("CPU %u is now offline\n", cpu);
+}
+
+void arch_cpuhp_sync_state_poll(void)
+{
+ if (smp_ops.poll_sync_state)
+ smp_ops.poll_sync_state();
}
/**
- * arch_disable_smp_support() - disables SMP support for x86 at runtime
+ * arch_disable_smp_support() - Disables SMP support for x86 at boottime
*/
-void arch_disable_smp_support(void)
+void __init arch_disable_smp_support(void)
{
disable_ioapic_support();
}
@@ -1361,14 +1244,6 @@ static void __init smp_cpu_index_default(void)
}
}
-static void __init smp_get_logical_apicid(void)
-{
- if (x2apic_mode)
- cpu0_logical_apicid = apic_read(APIC_LDR);
- else
- cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
-}
-
void __init smp_prepare_cpus_common(void)
{
unsigned int i;
@@ -1379,7 +1254,6 @@ void __init smp_prepare_cpus_common(void)
* Setup boot CPU information
*/
smp_store_boot_cpu_info(); /* Final full version of the data */
- cpumask_copy(cpu_callin_mask, cpumask_of(0));
mb();
for_each_possible_cpu(i) {
@@ -1402,6 +1276,21 @@ void __init smp_prepare_cpus_common(void)
set_cpu_sibling_map(0);
}
+#ifdef CONFIG_X86_64
+/* Establish whether parallel bringup can be supported. */
+bool __init arch_cpuhp_init_parallel_bringup(void)
+{
+ if (!x86_cpuinit.parallel_bringup) {
+ pr_info("Parallel CPU startup disabled by the platform\n");
+ return false;
+ }
+
+ smpboot_control = STARTUP_READ_APICID;
+ pr_debug("Parallel CPU startup enabled: 0x%08x\n", smpboot_control);
+ return true;
+}
+#endif
+
/*
* Prepare for SMP bootup.
* @max_cpus: configured maximum number of CPUs, It is a legacy parameter
@@ -1431,8 +1320,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
/* Setup local timer */
x86_init.timers.setup_percpu_clockev();
- smp_get_logical_apicid();
-
pr_info("CPU0: ");
print_cpu_info(&cpu_data(0));
@@ -1455,6 +1342,25 @@ void arch_thaw_secondary_cpus_end(void)
cache_aps_init();
}
+bool smp_park_other_cpus_in_init(void)
+{
+ unsigned int cpu, this_cpu = smp_processor_id();
+ unsigned int apicid;
+
+ if (apic->wakeup_secondary_cpu_64 || apic->wakeup_secondary_cpu)
+ return false;
+
+ for_each_present_cpu(cpu) {
+ if (cpu == this_cpu)
+ continue;
+ apicid = apic->cpu_present_to_apicid(cpu);
+ if (apicid == BAD_APICID)
+ continue;
+ send_init_sequence(apicid);
+ }
+ return true;
+}
+
/*
* Early setup to make printk work.
*/
@@ -1466,9 +1372,6 @@ void __init native_smp_prepare_boot_cpu(void)
if (!IS_ENABLED(CONFIG_SMP))
switch_gdt_and_percpu_base(me);
- /* already set me in cpu_online_mask in boot_cpu_init() */
- cpumask_set_cpu(me, cpu_callout_mask);
- cpu_set_state_online(me);
native_pv_lock_init();
}
@@ -1592,6 +1495,12 @@ __init void prefill_possible_map(void)
set_cpu_possible(i, true);
}
+/* correctly size the local cpu masks */
+void __init setup_cpu_local_masks(void)
+{
+ alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+}
+
#ifdef CONFIG_HOTPLUG_CPU
/* Recompute SMT state for all CPUs on offline */
@@ -1650,10 +1559,6 @@ static void remove_siblinginfo(int cpu)
static void remove_cpu_from_maps(int cpu)
{
set_cpu_online(cpu, false);
- cpumask_clear_cpu(cpu, cpu_callout_mask);
- cpumask_clear_cpu(cpu, cpu_callin_mask);
- /* was set by cpu_init() */
- cpumask_clear_cpu(cpu, cpu_initialized_mask);
numa_remove_cpu(cpu);
}
@@ -1704,64 +1609,27 @@ int native_cpu_disable(void)
return 0;
}
-int common_cpu_die(unsigned int cpu)
-{
- int ret = 0;
-
- /* We don't do anything here: idle task is faking death itself. */
-
- /* They ack this in play_dead() by setting CPU_DEAD */
- if (cpu_wait_death(cpu, 5)) {
- if (system_state == SYSTEM_RUNNING)
- pr_info("CPU %u is now offline\n", cpu);
- } else {
- pr_err("CPU %u didn't die...\n", cpu);
- ret = -1;
- }
-
- return ret;
-}
-
-void native_cpu_die(unsigned int cpu)
-{
- common_cpu_die(cpu);
-}
-
void play_dead_common(void)
{
idle_task_exit();
- /* Ack it */
- (void)cpu_report_death();
-
+ cpuhp_ap_report_dead();
/*
* With physical CPU hotplug, we should halt the cpu
*/
local_irq_disable();
}
-/**
- * cond_wakeup_cpu0 - Wake up CPU0 if needed.
- *
- * If NMI wants to wake up CPU0, start CPU0.
- */
-void cond_wakeup_cpu0(void)
-{
- if (smp_processor_id() == 0 && enable_start_cpu0)
- start_cpu0();
-}
-EXPORT_SYMBOL_GPL(cond_wakeup_cpu0);
-
/*
* We need to flush the caches before going to sleep, lest we have
* dirty data in our caches when we come back up.
*/
static inline void mwait_play_dead(void)
{
+ struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
unsigned int eax, ebx, ecx, edx;
unsigned int highest_cstate = 0;
unsigned int highest_subcstate = 0;
- void *mwait_ptr;
int i;
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
@@ -1796,12 +1664,9 @@ static inline void mwait_play_dead(void)
(highest_subcstate - 1);
}
- /*
- * This should be a memory location in a cache line which is
- * unlikely to be touched by other processors. The actual
- * content is immaterial as it is not actually modified in any way.
- */
- mwait_ptr = &current_thread_info()->flags;
+ /* Set up state for the kexec() hack below */
+ md->status = CPUDEAD_MWAIT_WAIT;
+ md->control = CPUDEAD_MWAIT_WAIT;
wbinvd();
@@ -1814,13 +1679,58 @@ static inline void mwait_play_dead(void)
* case where we return around the loop.
*/
mb();
- clflush(mwait_ptr);
+ clflush(md);
mb();
- __monitor(mwait_ptr, 0, 0);
+ __monitor(md, 0, 0);
mb();
__mwait(eax, 0);
- cond_wakeup_cpu0();
+ if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+ /*
+ * Kexec is about to happen. Don't go back into mwait() as
+ * the kexec kernel might overwrite text and data including
+ * page tables and stack. So mwait() would resume when the
+ * monitor cache line is written to and then the CPU goes
+ * south due to overwritten text, page tables and stack.
+ *
+ * Note: This does _NOT_ protect against a stray MCE, NMI,
+ * SMI. They will resume execution at the instruction
+ * following the HLT instruction and run into the problem
+ * which this is trying to prevent.
+ */
+ WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+ while(1)
+ native_halt();
+ }
+ }
+}
+
+/*
+ * Kick all "offline" CPUs out of mwait on kexec(). See comment in
+ * mwait_play_dead().
+ */
+void smp_kick_mwait_play_dead(void)
+{
+ u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
+ struct mwait_cpu_dead *md;
+ unsigned int cpu, i;
+
+ for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
+ md = per_cpu_ptr(&mwait_cpu_dead, cpu);
+
+ /* Does it sit in mwait_play_dead() ? */
+ if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
+ continue;
+
+ /* Wait up to 5ms */
+ for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
+ /* Bring it out of mwait */
+ WRITE_ONCE(md->control, newstate);
+ udelay(5);
+ }
+
+ if (READ_ONCE(md->status) != newstate)
+ pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
}
}
@@ -1829,11 +1739,8 @@ void __noreturn hlt_play_dead(void)
if (__this_cpu_read(cpu_info.x86) >= 4)
wbinvd();
- while (1) {
+ while (1)
native_halt();
-
- cond_wakeup_cpu0();
- }
}
void native_play_dead(void)
@@ -1852,12 +1759,6 @@ int native_cpu_disable(void)
return -ENOSYS;
}
-void native_cpu_die(unsigned int cpu)
-{
- /* We said "no" in __cpu_disable */
- BUG();
-}
-
void native_play_dead(void)
{
BUG();
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 1b83377274b8..ca004e2e4469 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -38,102 +38,12 @@
static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
#ifdef CONFIG_HOTPLUG_CPU
-
-#ifdef CONFIG_BOOTPARAM_HOTPLUG_CPU0
-static int cpu0_hotpluggable = 1;
-#else
-static int cpu0_hotpluggable;
-static int __init enable_cpu0_hotplug(char *str)
-{
- cpu0_hotpluggable = 1;
- return 1;
-}
-
-__setup("cpu0_hotplug", enable_cpu0_hotplug);
-#endif
-
-#ifdef CONFIG_DEBUG_HOTPLUG_CPU0
-/*
- * This function offlines a CPU as early as possible and allows userspace to
- * boot up without the CPU. The CPU can be onlined back by user after boot.
- *
- * This is only called for debugging CPU offline/online feature.
- */
-int _debug_hotplug_cpu(int cpu, int action)
+int arch_register_cpu(int cpu)
{
- int ret;
-
- if (!cpu_is_hotpluggable(cpu))
- return -EINVAL;
+ struct x86_cpu *xc = per_cpu_ptr(&cpu_devices, cpu);
- switch (action) {
- case 0:
- ret = remove_cpu(cpu);
- if (!ret)
- pr_info("DEBUG_HOTPLUG_CPU0: CPU %u is now offline\n", cpu);
- else
- pr_debug("Can't offline CPU%d.\n", cpu);
- break;
- case 1:
- ret = add_cpu(cpu);
- if (ret)
- pr_debug("Can't online CPU%d.\n", cpu);
-
- break;
- default:
- ret = -EINVAL;
- }
-
- return ret;
-}
-
-static int __init debug_hotplug_cpu(void)
-{
- _debug_hotplug_cpu(0, 0);
- return 0;
-}
-
-late_initcall_sync(debug_hotplug_cpu);
-#endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */
-
-int arch_register_cpu(int num)
-{
- struct cpuinfo_x86 *c = &cpu_data(num);
-
- /*
- * Currently CPU0 is only hotpluggable on Intel platforms. Other
- * vendors can add hotplug support later.
- * Xen PV guests don't support CPU0 hotplug at all.
- */
- if (c->x86_vendor != X86_VENDOR_INTEL ||
- cpu_feature_enabled(X86_FEATURE_XENPV))
- cpu0_hotpluggable = 0;
-
- /*
- * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate
- * depends on BSP. PIC interrupts depend on BSP.
- *
- * If the BSP dependencies are under control, one can tell kernel to
- * enable BSP hotplug. This basically adds a control file and
- * one can attempt to offline BSP.
- */
- if (num == 0 && cpu0_hotpluggable) {
- unsigned int irq;
- /*
- * We won't take down the boot processor on i386 if some
- * interrupts only are able to be serviced by the BSP in PIC.
- */
- for_each_active_irq(irq) {
- if (!IO_APIC_IRQ(irq) && irq_has_action(irq)) {
- cpu0_hotpluggable = 0;
- break;
- }
- }
- }
- if (num || cpu0_hotpluggable)
- per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
-
- return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
+ xc->cpu.hotpluggable = cpu > 0;
+ return register_cpu(&xc->cpu, cpu);
}
EXPORT_SYMBOL(arch_register_cpu);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 344698852146..1412b771651e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1598,10 +1598,7 @@ void __init tsc_init(void)
#ifdef CONFIG_SMP
/*
- * If we have a constant TSC and are using the TSC for the delay loop,
- * we can skip clock calibration if another cpu in the same socket has already
- * been calibrated. This assumes that CONSTANT_TSC applies to all
- * cpus in the socket - this should be a safe assumption.
+ * Check whether existing calibration data can be reused.
*/
unsigned long calibrate_delay_is_known(void)
{
@@ -1609,6 +1606,21 @@ unsigned long calibrate_delay_is_known(void)
int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC);
const struct cpumask *mask = topology_core_cpumask(cpu);
+ /*
+ * If TSC has constant frequency and TSC is synchronized across
+ * sockets then reuse CPU0 calibration.
+ */
+ if (constant_tsc && !tsc_unstable)
+ return cpu_data(0).loops_per_jiffy;
+
+ /*
+ * If TSC has constant frequency and TSC is not synchronized across
+ * sockets and this is not the first CPU in the socket, then reuse
+ * the calibration value of an already online CPU on that socket.
+ *
+ * This assumes that CONSTANT_TSC is consistent for all CPUs in a
+ * socket.
+ */
if (!constant_tsc || !mask)
return 0;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9452dc9664b5..bbc440c93e08 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -245,7 +245,6 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu)
*/
static atomic_t start_count;
static atomic_t stop_count;
-static atomic_t skip_test;
static atomic_t test_runs;
/*
@@ -344,21 +343,14 @@ static inline unsigned int loop_timeout(int cpu)
}
/*
- * Source CPU calls into this - it waits for the freshly booted
- * target CPU to arrive and then starts the measurement:
+ * The freshly booted CPU initiates this via an async SMP function call.
*/
-void check_tsc_sync_source(int cpu)
+static void check_tsc_sync_source(void *__cpu)
{
+ unsigned int cpu = (unsigned long)__cpu;
int cpus = 2;
/*
- * No need to check if we already know that the TSC is not
- * synchronized or if we have no TSC.
- */
- if (unsynchronized_tsc())
- return;
-
- /*
* Set the maximum number of test runs to
* 1 if the CPU does not provide the TSC_ADJUST MSR
* 3 if the MSR is available, so the target can try to adjust
@@ -368,16 +360,9 @@ void check_tsc_sync_source(int cpu)
else
atomic_set(&test_runs, 3);
retry:
- /*
- * Wait for the target to start or to skip the test:
- */
- while (atomic_read(&start_count) != cpus - 1) {
- if (atomic_read(&skip_test) > 0) {
- atomic_set(&skip_test, 0);
- return;
- }
+ /* Wait for the target to start. */
+ while (atomic_read(&start_count) != cpus - 1)
cpu_relax();
- }
/*
* Trigger the target to continue into the measurement too:
@@ -397,14 +382,14 @@ retry:
if (!nr_warps) {
atomic_set(&test_runs, 0);
- pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
+ pr_debug("TSC synchronization [CPU#%d -> CPU#%u]: passed\n",
smp_processor_id(), cpu);
} else if (atomic_dec_and_test(&test_runs) || random_warps) {
/* Force it to 0 if random warps brought us here */
atomic_set(&test_runs, 0);
- pr_warn("TSC synchronization [CPU#%d -> CPU#%d]:\n",
+ pr_warn("TSC synchronization [CPU#%d -> CPU#%u]:\n",
smp_processor_id(), cpu);
pr_warn("Measured %Ld cycles TSC warp between CPUs, "
"turning off TSC clock.\n", max_warp);
@@ -457,11 +442,12 @@ void check_tsc_sync_target(void)
* SoCs the TSC is frequency synchronized, but still the TSC ADJUST
* register might have been wreckaged by the BIOS..
*/
- if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) {
- atomic_inc(&skip_test);
+ if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable)
return;
- }
+ /* Kick the control CPU into the TSC synchronization function */
+ smp_call_function_single(cpumask_first(cpu_online_mask), check_tsc_sync_source,
+ (unsigned long *)(unsigned long)cpu, 0);
retry:
/*
* Register this CPU's participation and wait for the
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 3ac50b7298d1..4d8e518365f4 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -7,6 +7,9 @@
#include <asm/unwind.h>
#include <asm/orc_types.h>
#include <asm/orc_lookup.h>
+#include <asm/orc_header.h>
+
+ORC_HEADER;
#define orc_warn(fmt, ...) \
printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 25f155205770..03c885d3640f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -508,4 +508,8 @@ INIT_PER_CPU(irq_stack_backing_store);
"fixed_percpu_data is not at start of per-cpu area");
#endif
+#ifdef CONFIG_RETHUNK
+. = ASSERT((__x86_return_thunk & 0x3f) == 0, "__x86_return_thunk not cacheline-aligned");
+#endif
+
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 64664311ac2b..a37ebd3b4773 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -126,6 +126,7 @@ struct x86_init_ops x86_init __initdata = {
struct x86_cpuinit_ops x86_cpuinit = {
.early_percpu_clock_init = x86_init_noop,
.setup_percpu_clockev = setup_secondary_APIC_clock,
+ .parallel_bringup = true,
};
static void default_nmi_init(void) { };