diff options
Diffstat (limited to 'arch/x86')
87 files changed, 1188 insertions, 1270 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 469238067aa1..d3a43efac2d5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2072,7 +2072,7 @@ config ARCH_SUPPORTS_KEXEC def_bool y config ARCH_SUPPORTS_KEXEC_FILE - def_bool X86_64 && CRYPTO && CRYPTO_SHA256 + def_bool X86_64 config ARCH_SELECTS_KEXEC_FILE def_bool y @@ -2080,7 +2080,7 @@ config ARCH_SELECTS_KEXEC_FILE select HAVE_IMA_KEXEC if IMA config ARCH_SUPPORTS_KEXEC_PURGATORY - def_bool KEXEC_FILE + def_bool y config ARCH_SUPPORTS_KEXEC_SIG def_bool y diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 55c98fdd67d2..18d15d1ce87d 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -178,7 +178,7 @@ static unsigned long get_cmdline_acpi_rsdp(void) { unsigned long addr = 0; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE char val[MAX_ADDR_LEN] = { }; int ret; diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index 473ba59b82a8..d040080d7edb 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -386,3 +386,8 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code) */ kernel_add_identity_map(address, end); } + +void do_boot_nmi_trap(struct pt_regs *regs, unsigned long error_code) +{ + /* Empty handler to ignore NMI during early boot */ +} diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c index 3cdf94b41456..d100284bbef4 100644 --- a/arch/x86/boot/compressed/idt_64.c +++ b/arch/x86/boot/compressed/idt_64.c @@ -61,6 +61,7 @@ void load_stage2_idt(void) boot_idt_desc.address = (unsigned long)boot_idt; set_idt_entry(X86_TRAP_PF, boot_page_fault); + set_idt_entry(X86_TRAP_NMI, boot_nmi_trap); #ifdef CONFIG_AMD_MEM_ENCRYPT /* diff --git a/arch/x86/boot/compressed/idt_handlers_64.S b/arch/x86/boot/compressed/idt_handlers_64.S index 22890e199f5b..4d03c8562f63 100644 --- a/arch/x86/boot/compressed/idt_handlers_64.S +++ b/arch/x86/boot/compressed/idt_handlers_64.S @@ -70,6 +70,7 @@ SYM_FUNC_END(\name) .code64 EXCEPTION_HANDLER boot_page_fault do_boot_page_fault error_code=1 +EXCEPTION_HANDLER boot_nmi_trap do_boot_nmi_trap error_code=0 #ifdef CONFIG_AMD_MEM_ENCRYPT EXCEPTION_HANDLER boot_stage1_vc do_vc_no_ghcb error_code=1 diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index c0d502bd8716..bc2f0f17fb90 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -196,6 +196,7 @@ static inline void cleanup_exception_handling(void) { } /* IDT Entry Points */ void boot_page_fault(void); +void boot_nmi_trap(void); void boot_stage1_vc(void); void boot_stage2_vc(void); diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c index 40031a614712..5941f930f6c5 100644 --- a/arch/x86/boot/pm.c +++ b/arch/x86/boot/pm.c @@ -11,6 +11,7 @@ */ #include "boot.h" +#include <asm/desc_defs.h> #include <asm/segment.h> /* @@ -67,13 +68,13 @@ static void setup_gdt(void) being 8-byte unaligned. Intel recommends 16 byte alignment. */ static const u64 boot_gdt[] __attribute__((aligned(16))) = { /* CS: code, read/execute, 4 GB, base 0 */ - [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff), + [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(DESC_CODE32, 0, 0xfffff), /* DS: data, read/write, 4 GB, base 0 */ - [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff), + [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(DESC_DATA32, 0, 0xfffff), /* TSS: 32-bit tss, 104 bytes, base 4096 */ /* We only have a TSS here to keep Intel VT happy; we don't actually use it for anything. */ - [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103), + [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(DESC_TSS32, 4096, 103), }; /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead of the gdt_ptr contents. Thus, make it static so it will diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 1c8541ae3b3a..c23f3b9c84fe 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -49,7 +49,7 @@ int strcmp(const char *str1, const char *str2) { const unsigned char *s1 = (const unsigned char *)str1; const unsigned char *s2 = (const unsigned char *)str2; - int delta = 0; + int delta; while (*s1 || *s2) { delta = *s1 - *s2; diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 1b5d17a9f70d..cf1f13c82175 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -10,6 +10,7 @@ #include <asm/coco.h> #include <asm/tdx.h> #include <asm/vmx.h> +#include <asm/ia32.h> #include <asm/insn.h> #include <asm/insn-eval.h> #include <asm/pgtable.h> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d813160b14d8..6356060caaf3 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -26,6 +26,7 @@ #include <xen/events.h> #endif +#include <asm/apic.h> #include <asm/desc.h> #include <asm/traps.h> #include <asm/vdso.h> @@ -167,7 +168,96 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) } } -/* Handles int $0x80 */ +#ifdef CONFIG_IA32_EMULATION +static __always_inline bool int80_is_external(void) +{ + const unsigned int offs = (0x80 / 32) * 0x10; + const u32 bit = BIT(0x80 % 32); + + /* The local APIC on XENPV guests is fake */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* + * If vector 0x80 is set in the APIC ISR then this is an external + * interrupt. Either from broken hardware or injected by a VMM. + * + * Note: In guest mode this is only valid for secure guests where + * the secure module fully controls the vAPIC exposed to the guest. + */ + return apic_read(APIC_ISR + offs) & bit; +} + +/** + * int80_emulation - 32-bit legacy syscall entry + * + * This entry point can be used by 32-bit and 64-bit programs to perform + * 32-bit system calls. Instances of INT $0x80 can be found inline in + * various programs and libraries. It is also used by the vDSO's + * __kernel_vsyscall fallback for hardware that doesn't support a faster + * entry method. Restarted 32-bit system calls also fall back to INT + * $0x80 regardless of what instruction was originally used to do the + * system call. + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. + * + * The arguments for the INT $0x80 based syscall are on stack in the + * pt_regs structure: + * eax: system call number + * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6 + */ +DEFINE_IDTENTRY_RAW(int80_emulation) +{ + int nr; + + /* Kernel does not use INT $0x80! */ + if (unlikely(!user_mode(regs))) { + irqentry_enter(regs); + instrumentation_begin(); + panic("Unexpected external interrupt 0x80\n"); + } + + /* + * Establish kernel context for instrumentation, including for + * int80_is_external() below which calls into the APIC driver. + * Identical for soft and external interrupts. + */ + enter_from_user_mode(regs); + + instrumentation_begin(); + add_random_kstack_offset(); + + /* Validate that this is a soft interrupt to the extent possible */ + if (unlikely(int80_is_external())) + panic("Unexpected external interrupt 0x80\n"); + + /* + * The low level idtentry code pushed -1 into regs::orig_ax + * and regs::ax contains the syscall number. + * + * User tracing code (ptrace or signal handlers) might assume + * that the regs::orig_ax contains a 32-bit number on invoking + * a 32-bit syscall. + * + * Establish the syscall convention by saving the 32bit truncated + * syscall number in regs::orig_ax and by invalidating regs::ax. + */ + regs->orig_ax = regs->ax & GENMASK(31, 0); + regs->ax = -ENOSYS; + + nr = syscall_32_enter(regs); + + local_irq_enable(); + nr = syscall_enter_from_user_mode_work(regs, nr); + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} +#else /* CONFIG_IA32_EMULATION */ + +/* Handles int $0x80 on a 32bit kernel */ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { int nr = syscall_32_enter(regs); @@ -186,6 +276,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) instrumentation_end(); syscall_exit_to_user_mode(regs); } +#endif /* !CONFIG_IA32_EMULATION */ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) { diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 27c05d08558a..de94e2e84ecc 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -275,80 +275,3 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR int3 SYM_CODE_END(entry_SYSCALL_compat) - -/* - * 32-bit legacy system call entry. - * - * 32-bit x86 Linux system calls traditionally used the INT $0x80 - * instruction. INT $0x80 lands here. - * - * This entry point can be used by 32-bit and 64-bit programs to perform - * 32-bit system calls. Instances of INT $0x80 can be found inline in - * various programs and libraries. It is also used by the vDSO's - * __kernel_vsyscall fallback for hardware that doesn't support a faster - * entry method. Restarted 32-bit system calls also fall back to INT - * $0x80 regardless of what instruction was originally used to do the - * system call. - * - * This is considered a slow path. It is not used by most libc - * implementations on modern hardware except during process startup. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp arg6 - */ -SYM_CODE_START(entry_INT80_compat) - UNWIND_HINT_ENTRY - ENDBR - /* - * Interrupts are off on entry. - */ - ASM_CLAC /* Do this early to minimize exposure */ - ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV - - /* - * User tracing code (ptrace or signal handlers) might assume that - * the saved RAX contains a 32-bit number when we're invoking a 32-bit - * syscall. Just in case the high bits are nonzero, zero-extend - * the syscall number. (This could almost certainly be deleted - * with no ill effects.) - */ - movl %eax, %eax - - /* switch to thread stack expects orig_ax and rdi to be pushed */ - pushq %rax /* pt_regs->orig_ax */ - - /* Need to switch before accessing the thread stack. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rax - - /* In the Xen PV case we already run on the thread stack. */ - ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV - - movq %rsp, %rax - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp - - pushq 5*8(%rax) /* regs->ss */ - pushq 4*8(%rax) /* regs->rsp */ - pushq 3*8(%rax) /* regs->eflags */ - pushq 2*8(%rax) /* regs->cs */ - pushq 1*8(%rax) /* regs->ip */ - pushq 0*8(%rax) /* regs->orig_ax */ -.Lint80_keep_stack: - - PUSH_AND_CLEAR_REGS rax=$-ENOSYS - UNWIND_HINT_REGS - - cld - - IBRS_ENTER - UNTRAIN_RET - - movq %rsp, %rdi - call do_int80_syscall_32 - jmp swapgs_restore_regs_and_return_to_usermode -SYM_CODE_END(entry_INT80_compat) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index c8fac5205803..56e6c2f3ee9c 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -461,3 +461,5 @@ 454 i386 futex_wake sys_futex_wake 455 i386 futex_wait sys_futex_wait 456 i386 futex_requeue sys_futex_requeue +457 i386 statmount sys_statmount +458 i386 listmount sys_listmount diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 8cb8bf68721c..3a22eef585c2 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -378,6 +378,8 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ce1c777227b4..0f2786d4e405 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4051,12 +4051,17 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable; int global_ctrl, pebs_enable; + /* + * In addition to obeying exclude_guest/exclude_host, remove bits being + * used for PEBS when running a guest, because PEBS writes to virtual + * addresses (not physical addresses). + */ *nr = 0; global_ctrl = (*nr)++; arr[global_ctrl] = (struct perf_guest_switch_msr){ .msr = MSR_CORE_PERF_GLOBAL_CTRL, .host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask, - .guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask), + .guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask & ~pebs_mask, }; if (!x86_pmu.pebs) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 65f79092c9d9..fcd20c6dc7f9 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -10,6 +10,9 @@ #define ALT_FLAG_NOT (1 << 0) #define ALT_NOT(feature) ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature)) +#define ALT_FLAG_DIRECT_CALL (1 << 1) +#define ALT_DIRECT_CALL(feature) ((ALT_FLAG_DIRECT_CALL << ALT_FLAGS_SHIFT) | (feature)) +#define ALT_CALL_ALWAYS ALT_DIRECT_CALL(X86_FEATURE_ALWAYS) #ifndef __ASSEMBLY__ @@ -86,6 +89,8 @@ struct alt_instr { u8 replacementlen; /* length of new instruction */ } __packed; +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; + /* * Debug flag that can be tested to see whether alternative * instructions were patched in already: @@ -101,11 +106,10 @@ extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, s32 *start_cfi, s32 *end_cfi); struct module; -struct paravirt_patch_site; struct callthunk_sites { s32 *call_start, *call_end; - struct paravirt_patch_site *pv_start, *pv_end; + struct alt_instr *alt_start, *alt_end; }; #ifdef CONFIG_CALL_THUNKS @@ -150,6 +154,8 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ +#define ALT_CALL_INSTR "call BUG_func" + #define b_replacement(num) "664"#num #define e_replacement(num) "665"#num @@ -330,6 +336,22 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr +/* Macro for creating assembler functions avoiding any C magic. */ +#define DEFINE_ASM_FUNC(func, instr, sec) \ + asm (".pushsection " #sec ", \"ax\"\n" \ + ".global " #func "\n\t" \ + ".type " #func ", @function\n\t" \ + ASM_FUNC_ALIGN "\n" \ + #func ":\n\t" \ + ASM_ENDBR \ + instr "\n\t" \ + ASM_RET \ + ".size " #func ", . - " #func "\n\t" \ + ".popsection") + +void BUG_func(void); +void nop_func(void); + #else /* __ASSEMBLY__ */ #ifdef CONFIG_SMP @@ -370,6 +392,10 @@ static inline int alternatives_text_reserved(void *start, void *end) .byte \alt_len .endm +.macro ALT_CALL_INSTR + call BUG_func +.endm + /* * Define an alternative between two instructions. If @feature is * present, early code in apply_alternatives() replaces @oldinstr with diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index d21f48f1c242..9d159b771dc8 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -272,8 +272,6 @@ struct apic { void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); - enum apic_delivery_modes delivery_mode; - u32 disable_esr : 1, dest_mode_logical : 1, x2apic_set_max_apicid : 1, diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 4b125e5b3187..094106b6a538 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -20,6 +20,13 @@ */ #define IO_APIC_SLOT_SIZE 1024 +#define APIC_DELIVERY_MODE_FIXED 0 +#define APIC_DELIVERY_MODE_LOWESTPRIO 1 +#define APIC_DELIVERY_MODE_SMI 2 +#define APIC_DELIVERY_MODE_NMI 4 +#define APIC_DELIVERY_MODE_INIT 5 +#define APIC_DELIVERY_MODE_EXTINT 7 + #define APIC_ID 0x20 #define APIC_LVR 0x30 @@ -165,279 +172,10 @@ #define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK) #define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT) -#ifndef __ASSEMBLY__ -/* - * the local APIC register structure, memory mapped. Not terribly well - * tested, but we might eventually use this one in the future - the - * problem why we cannot use it right now is the P5 APIC, it has an - * errata which cannot take 8-bit reads and writes, only 32-bit ones ... - */ -#define u32 unsigned int - -struct local_apic { - -/*000*/ struct { u32 __reserved[4]; } __reserved_01; - -/*010*/ struct { u32 __reserved[4]; } __reserved_02; - -/*020*/ struct { /* APIC ID Register */ - u32 __reserved_1 : 24, - phys_apic_id : 4, - __reserved_2 : 4; - u32 __reserved[3]; - } id; - -/*030*/ const - struct { /* APIC Version Register */ - u32 version : 8, - __reserved_1 : 8, - max_lvt : 8, - __reserved_2 : 8; - u32 __reserved[3]; - } version; - -/*040*/ struct { u32 __reserved[4]; } __reserved_03; - -/*050*/ struct { u32 __reserved[4]; } __reserved_04; - -/*060*/ struct { u32 __reserved[4]; } __reserved_05; - -/*070*/ struct { u32 __reserved[4]; } __reserved_06; - -/*080*/ struct { /* Task Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } tpr; - -/*090*/ const - struct { /* Arbitration Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } apr; - -/*0A0*/ const - struct { /* Processor Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } ppr; - -/*0B0*/ struct { /* End Of Interrupt Register */ - u32 eoi; - u32 __reserved[3]; - } eoi; - -/*0C0*/ struct { u32 __reserved[4]; } __reserved_07; - -/*0D0*/ struct { /* Logical Destination Register */ - u32 __reserved_1 : 24, - logical_dest : 8; - u32 __reserved_2[3]; - } ldr; - -/*0E0*/ struct { /* Destination Format Register */ - u32 __reserved_1 : 28, - model : 4; - u32 __reserved_2[3]; - } dfr; - -/*0F0*/ struct { /* Spurious Interrupt Vector Register */ - u32 spurious_vector : 8, - apic_enabled : 1, - focus_cpu : 1, - __reserved_2 : 22; - u32 __reserved_3[3]; - } svr; - -/*100*/ struct { /* In Service Register */ -/*170*/ u32 bitfield; - u32 __reserved[3]; - } isr [8]; - -/*180*/ struct { /* Trigger Mode Register */ -/*1F0*/ u32 bitfield; - u32 __reserved[3]; - } tmr [8]; - -/*200*/ struct { /* Interrupt Request Register */ -/*270*/ u32 bitfield; - u32 __reserved[3]; - } irr [8]; - -/*280*/ union { /* Error Status Register */ - struct { - u32 send_cs_error : 1, - receive_cs_error : 1, - send_accept_error : 1, - receive_accept_error : 1, - __reserved_1 : 1, - send_illegal_vector : 1, - receive_illegal_vector : 1, - illegal_register_address : 1, - __reserved_2 : 24; - u32 __reserved_3[3]; - } error_bits; - struct { - u32 errors; - u32 __reserved_3[3]; - } all_errors; - } esr; - -/*290*/ struct { u32 __reserved[4]; } __reserved_08; - -/*2A0*/ struct { u32 __reserved[4]; } __reserved_09; - -/*2B0*/ struct { u32 __reserved[4]; } __reserved_10; - -/*2C0*/ struct { u32 __reserved[4]; } __reserved_11; - -/*2D0*/ struct { u32 __reserved[4]; } __reserved_12; - -/*2E0*/ struct { u32 __reserved[4]; } __reserved_13; - -/*2F0*/ struct { u32 __reserved[4]; } __reserved_14; - -/*300*/ struct { /* Interrupt Command Register 1 */ - u32 vector : 8, - delivery_mode : 3, - destination_mode : 1, - delivery_status : 1, - __reserved_1 : 1, - level : 1, - trigger : 1, - __reserved_2 : 2, - shorthand : 2, - __reserved_3 : 12; - u32 __reserved_4[3]; - } icr1; - -/*310*/ struct { /* Interrupt Command Register 2 */ - union { - u32 __reserved_1 : 24, - phys_dest : 4, - __reserved_2 : 4; - u32 __reserved_3 : 24, - logical_dest : 8; - } dest; - u32 __reserved_4[3]; - } icr2; - -/*320*/ struct { /* LVT - Timer */ - u32 vector : 8, - __reserved_1 : 4, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - timer_mode : 1, - __reserved_3 : 14; - u32 __reserved_4[3]; - } lvt_timer; - -/*330*/ struct { /* LVT - Thermal Sensor */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_thermal; - -/*340*/ struct { /* LVT - Performance Counter */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_pc; - -/*350*/ struct { /* LVT - LINT0 */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - polarity : 1, - remote_irr : 1, - trigger : 1, - mask : 1, - __reserved_2 : 15; - u32 __reserved_3[3]; - } lvt_lint0; - -/*360*/ struct { /* LVT - LINT1 */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - polarity : 1, - remote_irr : 1, - trigger : 1, - mask : 1, - __reserved_2 : 15; - u32 __reserved_3[3]; - } lvt_lint1; - -/*370*/ struct { /* LVT - Error */ - u32 vector : 8, - __reserved_1 : 4, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_error; - -/*380*/ struct { /* Timer Initial Count Register */ - u32 initial_count; - u32 __reserved_2[3]; - } timer_icr; - -/*390*/ const - struct { /* Timer Current Count Register */ - u32 curr_count; - u32 __reserved_2[3]; - } timer_ccr; - -/*3A0*/ struct { u32 __reserved[4]; } __reserved_16; - -/*3B0*/ struct { u32 __reserved[4]; } __reserved_17; - -/*3C0*/ struct { u32 __reserved[4]; } __reserved_18; - -/*3D0*/ struct { u32 __reserved[4]; } __reserved_19; - -/*3E0*/ struct { /* Timer Divide Configuration Register */ - u32 divisor : 4, - __reserved_1 : 28; - u32 __reserved_2[3]; - } timer_dcr; - -/*3F0*/ struct { u32 __reserved[4]; } __reserved_20; - -} __attribute__ ((packed)); - -#undef u32 - #ifdef CONFIG_X86_32 #define BAD_APICID 0xFFu #else #define BAD_APICID 0xFFFFu #endif -enum apic_delivery_modes { - APIC_DELIVERY_MODE_FIXED = 0, - APIC_DELIVERY_MODE_LOWESTPRIO = 1, - APIC_DELIVERY_MODE_SMI = 2, - APIC_DELIVERY_MODE_NMI = 4, - APIC_DELIVERY_MODE_INIT = 5, - APIC_DELIVERY_MODE_EXTINT = 7, -}; - -#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 35389b2af88e..0216f63a366b 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -81,22 +81,4 @@ do { \ #include <asm-generic/barrier.h> -/* - * Make previous memory operations globally visible before - * a WRMSR. - * - * MFENCE makes writes visible, but only affects load/store - * instructions. WRMSR is unfortunately not a load/store - * instruction and is unaffected by MFENCE. The LFENCE ensures - * that the WRMSR is not reordered. - * - * Most WRMSRs are full serializing instructions themselves and - * do not require this barrier. This is only required for the - * IA32_TSC_DEADLINE and X2APIC MSRs. - */ -static inline void weak_wrmsr_fence(void) -{ - asm volatile("mfence; lfence" : : : "memory"); -} - #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4af140cf5719..632c26cdeeda 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -218,7 +218,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" Generic flag for all Zen and newer */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ @@ -308,10 +308,14 @@ #define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */ #define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */ #define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */ - #define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */ #define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */ #define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */ +#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */ +#define X86_FEATURE_ZEN2 (11*32+28) /* "" CPU based on Zen2 microarchitecture */ +#define X86_FEATURE_ZEN3 (11*32+29) /* "" CPU based on Zen3 microarchitecture */ +#define X86_FEATURE_ZEN4 (11*32+30) /* "" CPU based on Zen4 microarchitecture */ +#define X86_FEATURE_ZEN1 (11*32+31) /* "" CPU based on Zen1 microarchitecture */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index f7e7099af595..d440a65af8f3 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -8,6 +8,56 @@ * archs. */ +/* + * Low-level interface mapping flags/field names to bits + */ + +/* Flags for _DESC_S (non-system) descriptors */ +#define _DESC_ACCESSED 0x0001 +#define _DESC_DATA_WRITABLE 0x0002 +#define _DESC_CODE_READABLE 0x0002 +#define _DESC_DATA_EXPAND_DOWN 0x0004 +#define _DESC_CODE_CONFORMING 0x0004 +#define _DESC_CODE_EXECUTABLE 0x0008 + +/* Common flags */ +#define _DESC_S 0x0010 +#define _DESC_DPL(dpl) ((dpl) << 5) +#define _DESC_PRESENT 0x0080 + +#define _DESC_LONG_CODE 0x2000 +#define _DESC_DB 0x4000 +#define _DESC_GRANULARITY_4K 0x8000 + +/* System descriptors have a numeric "type" field instead of flags */ +#define _DESC_SYSTEM(code) (code) + +/* + * High-level interface mapping intended usage to low-level combinations + * of flags + */ + +#define _DESC_DATA (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \ + _DESC_DATA_WRITABLE) +#define _DESC_CODE (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \ + _DESC_CODE_READABLE | _DESC_CODE_EXECUTABLE) + +#define DESC_DATA16 (_DESC_DATA) +#define DESC_CODE16 (_DESC_CODE) + +#define DESC_DATA32 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_DATA32_BIOS (_DESC_DATA | _DESC_DB) + +#define DESC_CODE32 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_CODE32_BIOS (_DESC_CODE | _DESC_DB) + +#define DESC_TSS32 (_DESC_SYSTEM(9) | _DESC_PRESENT) + +#define DESC_DATA64 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_CODE64 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_LONG_CODE) + +#define DESC_USER (_DESC_DPL(3)) + #ifndef __ASSEMBLY__ #include <linux/types.h> @@ -22,19 +72,19 @@ struct desc_struct { #define GDT_ENTRY_INIT(flags, base, limit) \ { \ - .limit0 = (u16) (limit), \ - .limit1 = ((limit) >> 16) & 0x0F, \ - .base0 = (u16) (base), \ - .base1 = ((base) >> 16) & 0xFF, \ - .base2 = ((base) >> 24) & 0xFF, \ - .type = (flags & 0x0f), \ - .s = (flags >> 4) & 0x01, \ - .dpl = (flags >> 5) & 0x03, \ - .p = (flags >> 7) & 0x01, \ - .avl = (flags >> 12) & 0x01, \ - .l = (flags >> 13) & 0x01, \ - .d = (flags >> 14) & 0x01, \ - .g = (flags >> 15) & 0x01, \ + .limit0 = ((limit) >> 0) & 0xFFFF, \ + .limit1 = ((limit) >> 16) & 0x000F, \ + .base0 = ((base) >> 0) & 0xFFFF, \ + .base1 = ((base) >> 16) & 0x00FF, \ + .base2 = ((base) >> 24) & 0x00FF, \ + .type = ((flags) >> 0) & 0x000F, \ + .s = ((flags) >> 4) & 0x0001, \ + .dpl = ((flags) >> 5) & 0x0003, \ + .p = ((flags) >> 7) & 0x0001, \ + .avl = ((flags) >> 12) & 0x0001, \ + .l = ((flags) >> 13) & 0x0001, \ + .d = ((flags) >> 14) & 0x0001, \ + .g = ((flags) >> 15) & 0x0001, \ } enum { @@ -94,6 +144,7 @@ struct gate_struct { typedef struct gate_struct gate_desc; +#ifndef _SETUP static inline unsigned long gate_offset(const gate_desc *g) { #ifdef CONFIG_X86_64 @@ -108,6 +159,7 @@ static inline unsigned long gate_segment(const gate_desc *g) { return g->segment; } +#endif struct desc_ptr { unsigned short size; diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index a0234dfd1031..1e16bd5ac781 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -150,7 +150,7 @@ do { \ ((x)->e_machine == EM_X86_64) #define compat_elf_check_arch(x) \ - ((elf_check_arch_ia32(x) && ia32_enabled()) || \ + ((elf_check_arch_ia32(x) && ia32_enabled_verbose()) || \ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) static inline void elf_common_init(struct thread_struct *t, diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 5a2ae24b1204..c7ef6ea2fa99 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -2,7 +2,6 @@ #ifndef _ASM_X86_IA32_H #define _ASM_X86_IA32_H - #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> @@ -75,6 +74,11 @@ static inline bool ia32_enabled(void) return __ia32_enabled; } +static inline void ia32_disable(void) +{ + __ia32_enabled = false; +} + #else /* !CONFIG_IA32_EMULATION */ static inline bool ia32_enabled(void) @@ -82,6 +86,18 @@ static inline bool ia32_enabled(void) return IS_ENABLED(CONFIG_X86_32); } +static inline void ia32_disable(void) {} + #endif +static inline bool ia32_enabled_verbose(void) +{ + bool enabled = ia32_enabled(); + + if (IS_ENABLED(CONFIG_IA32_EMULATION) && !enabled) + pr_notice_once("32-bit emulation disabled. You can reenable with ia32_emulation=on\n"); + + return enabled; +} + #endif /* _ASM_X86_IA32_H */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 05fd175cec7d..13639e57e1f8 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -569,6 +569,10 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_UD, exc_invalid_op); DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault); +#if defined(CONFIG_IA32_EMULATION) +DECLARE_IDTENTRY_RAW(IA32_SYSCALL_VECTOR, int80_emulation); +#endif + #ifdef CONFIG_X86_MCE #ifdef CONFIG_X86_64 DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6de6e1d95952..de3118305838 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -311,6 +311,7 @@ enum smca_bank_types { SMCA_PIE, /* Power, Interrupts, etc. */ SMCA_UMC, /* Unified Memory Controller */ SMCA_UMC_V2, + SMCA_MA_LLC, /* Memory Attached Last Level Cache */ SMCA_PB, /* Parameter Block */ SMCA_PSP, /* Platform Security Processor */ SMCA_PSP_V2, @@ -326,6 +327,8 @@ enum smca_bank_types { SMCA_SHUB, /* System HUB Unit */ SMCA_SATA, /* SATA Unit */ SMCA_USB, /* USB Unit */ + SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */ + SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */ SMCA_GMI_PCS, /* GMI PCS Unit */ SMCA_XGMI_PHY, /* xGMI PHY Unit */ SMCA_WAFL_PHY, /* WAFL PHY Unit */ @@ -333,7 +336,6 @@ enum smca_bank_types { N_SMCA_BANK_TYPES }; -extern const char *smca_get_long_name(enum smca_bank_types t); extern bool amd_mce_is_memory_error(struct mce *m); extern int mce_threshold_create_device(unsigned int cpu); diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6c8ff12140ae..8bcf7584e7dd 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -142,8 +142,7 @@ static inline void write_cr0(unsigned long x) static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, - "mov %%cr2, %%rax;", - ALT_NOT(X86_FEATURE_XENPV)); + "mov %%cr2, %%rax;", ALT_NOT_XEN); } static __always_inline void write_cr2(unsigned long x) @@ -154,13 +153,12 @@ static __always_inline void write_cr2(unsigned long x) static inline unsigned long __read_cr3(void) { return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3, - "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%cr3, %%rax;", ALT_NOT_XEN); } static inline void write_cr3(unsigned long x) { - PVOP_ALT_VCALL1(mmu.write_cr3, x, - "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN); } static inline void __write_cr4(unsigned long x) @@ -182,7 +180,7 @@ extern noinstr void pv_native_wbinvd(void); static __always_inline void wbinvd(void) { - PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN); } static inline u64 paravirt_read_msr(unsigned msr) @@ -390,27 +388,25 @@ static inline void paravirt_release_p4d(unsigned long pfn) static inline pte_t __pte(pteval_t val) { return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pteval_t pte_val(pte_t pte) { return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline pgd_t __pgd(pgdval_t val) { return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pgdval_t pgd_val(pgd_t pgd) { return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -444,14 +440,13 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) static inline pmd_t __pmd(pmdval_t val) { return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pmdval_t pmd_val(pmd_t pmd) { return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void set_pud(pud_t *pudp, pud_t pud) @@ -464,7 +459,7 @@ static inline pud_t __pud(pudval_t val) pudval_t ret; ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); return (pud_t) { ret }; } @@ -472,7 +467,7 @@ static inline pud_t __pud(pudval_t val) static inline pudval_t pud_val(pud_t pud) { return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void pud_clear(pud_t *pudp) @@ -492,8 +487,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) static inline p4d_t __p4d(p4dval_t val) { p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); return (p4d_t) { ret }; } @@ -501,7 +495,7 @@ static inline p4d_t __p4d(p4dval_t val) static inline p4dval_t p4d_val(p4d_t p4d) { return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) @@ -687,17 +681,17 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", - ALT_NOT(X86_FEATURE_XENPV)); + ALT_NOT_XEN); } static __always_inline void arch_local_irq_disable(void) { - PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_enable(void) { - PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN); } static __always_inline unsigned long arch_local_irq_save(void) @@ -726,52 +720,25 @@ static __always_inline unsigned long arch_local_irq_save(void) #undef PVOP_VCALL4 #undef PVOP_CALL4 -#define DEFINE_PARAVIRT_ASM(func, instr, sec) \ - asm (".pushsection " #sec ", \"ax\"\n" \ - ".global " #func "\n\t" \ - ".type " #func ", @function\n\t" \ - ASM_FUNC_ALIGN "\n" \ - #func ":\n\t" \ - ASM_ENDBR \ - instr "\n\t" \ - ASM_RET \ - ".size " #func ", . - " #func "\n\t" \ - ".popsection") - extern void default_banner(void); void native_pv_lock_init(void) __init; #else /* __ASSEMBLY__ */ -#define _PVSITE(ptype, ops, word, algn) \ -771:; \ - ops; \ -772:; \ - .pushsection .parainstructions,"a"; \ - .align algn; \ - word 771b; \ - .byte ptype; \ - .byte 772b-771b; \ - _ASM_ALIGN; \ - .popsection - - #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL +#ifdef CONFIG_DEBUG_ENTRY -#define PARA_PATCH(off) ((off) / 8) -#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8) #define PARA_INDIRECT(addr) *addr(%rip) -#ifdef CONFIG_DEBUG_ENTRY .macro PARA_IRQ_save_fl - PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), - ANNOTATE_RETPOLINE_SAFE; - call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);) + ANNOTATE_RETPOLINE_SAFE; + call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); .endm -#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \ - ALT_NOT(X86_FEATURE_XENPV) +#define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \ + "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \ + "pushf; pop %rax;", ALT_NOT_XEN #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 772d03487520..d8e85d2cf8d5 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -2,15 +2,6 @@ #ifndef _ASM_X86_PARAVIRT_TYPES_H #define _ASM_X86_PARAVIRT_TYPES_H -#ifndef __ASSEMBLY__ -/* These all sit in the .parainstructions section to tell us what to patch. */ -struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 type; /* type of this instruction */ - u8 len; /* length of original instruction */ -}; -#endif - #ifdef CONFIG_PARAVIRT #ifndef __ASSEMBLY__ @@ -250,43 +241,11 @@ struct paravirt_patch_template { extern struct pv_info pv_info; extern struct paravirt_patch_template pv_ops; -#define PARAVIRT_PATCH(x) \ - (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) - -#define paravirt_type(op) \ - [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ - [paravirt_opptr] "m" (pv_ops.op) -/* - * Generate some code, and mark it as patchable by the - * apply_paravirt() alternate instruction patcher. - */ -#define _paravirt_alt(insn_string, type) \ - "771:\n\t" insn_string "\n" "772:\n" \ - ".pushsection .parainstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR " 771b\n" \ - " .byte " type "\n" \ - " .byte 772b-771b\n" \ - _ASM_ALIGN "\n" \ - ".popsection\n" - -/* Generate patchable code, with the default asm parameters. */ -#define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]") - -/* Simple instruction patching code. */ -#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" - -unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len); +#define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op) int paravirt_disable_iospace(void); -/* - * This generates an indirect call based on the operation type number. - * The type number, computed in PARAVIRT_PATCH, is derived from the - * offset into the paravirt_patch_template structure, and can therefore be - * freely converted back into a structure offset. - */ +/* This generates an indirect call based on the operation type number. */ #define PARAVIRT_CALL \ ANNOTATE_RETPOLINE_SAFE \ "call *%[paravirt_opptr];" @@ -319,12 +278,6 @@ int paravirt_disable_iospace(void); * However, x86_64 also has to clobber all caller saved registers, which * unfortunately, are quite a bit (r8 - r11) * - * The call instruction itself is marked by placing its start address - * and size into the .parainstructions section, so that - * apply_paravirt() in arch/i386/kernel/alternative.c can do the - * appropriate patching under the control of the backend pv_init_ops - * implementation. - * * Unfortunately there's no way to get gcc to generate the args setup * for the call, and then allow the call itself to be generated by an * inline asm. Because of this, we must do the complete arg setup and @@ -423,14 +376,27 @@ int paravirt_disable_iospace(void); __mask & __eax; \ }) - +/* + * Use alternative patching for paravirt calls: + * - For replacing an indirect call with a direct one, use the "normal" + * ALTERNATIVE() macro with the indirect call as the initial code sequence, + * which will be replaced with the related direct call by using the + * ALT_FLAG_DIRECT_CALL special case and the "always on" feature. + * - In case the replacement is either a direct call or a short code sequence + * depending on a feature bit, the ALTERNATIVE_2() macro is being used. + * The indirect call is the initial code sequence again, while the special + * code sequence is selected with the specified feature bit. In case the + * feature is not active, the direct call is used as above via the + * ALT_FLAG_DIRECT_CALL special case and the "always on" feature. + */ #define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) \ ({ \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ + asm volatile(ALTERNATIVE(PARAVIRT_CALL, ALT_CALL_INSTR, \ + ALT_CALL_ALWAYS) \ : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ + : paravirt_ptr(op), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ @@ -441,10 +407,11 @@ int paravirt_disable_iospace(void); ({ \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \ - alt, cond) \ + asm volatile(ALTERNATIVE_2(PARAVIRT_CALL, \ + ALT_CALL_INSTR, ALT_CALL_ALWAYS, \ + alt, cond) \ : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ + : paravirt_ptr(op), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ @@ -542,8 +509,6 @@ int paravirt_disable_iospace(void); __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -void _paravirt_nop(void); -void paravirt_BUG(void); unsigned long paravirt_ret0(void); #ifdef CONFIG_PARAVIRT_XXL u64 _paravirt_ident_64(u64); @@ -553,11 +518,11 @@ void pv_native_irq_enable(void); unsigned long pv_native_read_cr2(void); #endif -#define paravirt_nop ((void *)_paravirt_nop) - -extern struct paravirt_patch_site __parainstructions[], - __parainstructions_end[]; +#define paravirt_nop ((void *)nop_func) #endif /* __ASSEMBLY__ */ + +#define ALT_NOT_XEN ALT_NOT(X86_FEATURE_XENPV) + #endif /* CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_TYPES_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ae81a7191c1c..26620d7642a9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -749,4 +749,22 @@ enum mds_mitigations { extern bool gds_ucode_mitigated(void); +/* + * Make previous memory operations globally visible before + * a WRMSR. + * + * MFENCE makes writes visible, but only affects load/store + * instructions. WRMSR is unfortunately not a load/store + * instruction and is unaffected by MFENCE. The LFENCE ensures + * that the WRMSR is not reordered. + * + * Most WRMSRs are full serializing instructions themselves and + * do not require this barrier. This is only required for the + * IA32_TSC_DEADLINE and X2APIC MSRs. + */ +static inline void weak_wrmsr_fence(void) +{ + alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE)); +} + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 4d84122bd643..484f4f0131a5 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -32,10 +32,6 @@ void entry_SYSCALL_compat(void); void entry_SYSCALL_compat_safe_stack(void); void entry_SYSRETL_compat_unsafe_stack(void); void entry_SYSRETL_compat_end(void); -void entry_INT80_compat(void); -#ifdef CONFIG_XEN_PV -void xen_entry_INT80_compat(void); -#endif #else /* !CONFIG_IA32_EMULATION */ #define entry_SYSCALL_compat NULL #define entry_SYSENTER_compat NULL diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 85b6e3609cb9..ef9697f20129 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -56,8 +56,8 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); "pop %rdx\n\t" \ FRAME_END -DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock, - PV_UNLOCK_ASM, .spinlock.text); +DEFINE_ASM_FUNC(__raw_callee_save___pv_queued_spin_unlock, + PV_UNLOCK_ASM, .spinlock.text); #else /* CONFIG_64BIT */ diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index fd2669b1cb2d..21f9407be5d3 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -86,9 +86,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); return sys_ni_syscall(); \ } -#define __SYS_NI(abi, name) \ - SYSCALL_ALIAS(__##abi##_##name, sys_ni_posix_timers); - #ifdef CONFIG_X86_64 #define __X64_SYS_STUB0(name) \ __SYS_STUB0(x64, sys_##name) @@ -100,13 +97,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __X64_COND_SYSCALL(name) \ __COND_SYSCALL(x64, sys_##name) -#define __X64_SYS_NI(name) \ - __SYS_NI(x64, sys_##name) #else /* CONFIG_X86_64 */ #define __X64_SYS_STUB0(name) #define __X64_SYS_STUBx(x, name, ...) #define __X64_COND_SYSCALL(name) -#define __X64_SYS_NI(name) #endif /* CONFIG_X86_64 */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) @@ -120,13 +114,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __IA32_COND_SYSCALL(name) \ __COND_SYSCALL(ia32, sys_##name) -#define __IA32_SYS_NI(name) \ - __SYS_NI(ia32, sys_##name) #else /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ #define __IA32_SYS_STUB0(name) #define __IA32_SYS_STUBx(x, name, ...) #define __IA32_COND_SYSCALL(name) -#define __IA32_SYS_NI(name) #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ #ifdef CONFIG_IA32_EMULATION @@ -135,8 +126,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * additional wrappers (aptly named __ia32_sys_xyzzy) which decode the * ia32 regs in the proper order for shared or "common" syscalls. As some * syscalls may not be implemented, we need to expand COND_SYSCALL in - * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this - * case as well. + * kernel/sys_ni.c to cover this case as well. */ #define __IA32_COMPAT_SYS_STUB0(name) \ __SYS_STUB0(ia32, compat_sys_##name) @@ -148,14 +138,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __IA32_COMPAT_COND_SYSCALL(name) \ __COND_SYSCALL(ia32, compat_sys_##name) -#define __IA32_COMPAT_SYS_NI(name) \ - __SYS_NI(ia32, compat_sys_##name) - #else /* CONFIG_IA32_EMULATION */ #define __IA32_COMPAT_SYS_STUB0(name) #define __IA32_COMPAT_SYS_STUBx(x, name, ...) #define __IA32_COMPAT_COND_SYSCALL(name) -#define __IA32_COMPAT_SYS_NI(name) #endif /* CONFIG_IA32_EMULATION */ @@ -175,13 +161,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __X32_COMPAT_COND_SYSCALL(name) \ __COND_SYSCALL(x64, compat_sys_##name) -#define __X32_COMPAT_SYS_NI(name) \ - __SYS_NI(x64, compat_sys_##name) #else /* CONFIG_X86_X32_ABI */ #define __X32_COMPAT_SYS_STUB0(name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) #define __X32_COMPAT_COND_SYSCALL(name) -#define __X32_COMPAT_SYS_NI(name) #endif /* CONFIG_X86_X32_ABI */ @@ -212,17 +195,12 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); /* * As some compat syscalls may not be implemented, we need to expand - * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in - * kernel/time/posix-stubs.c to cover this case as well. + * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well. */ #define COND_SYSCALL_COMPAT(name) \ __IA32_COMPAT_COND_SYSCALL(name) \ __X32_COMPAT_COND_SYSCALL(name) -#define COMPAT_SYS_NI(name) \ - __IA32_COMPAT_SYS_NI(name) \ - __X32_COMPAT_SYS_NI(name) - #endif /* CONFIG_COMPAT */ #define __SYSCALL_DEFINEx(x, name, ...) \ @@ -243,8 +221,8 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * As the generic SYSCALL_DEFINE0() macro does not decode any parameters for * obvious reasons, and passing struct pt_regs *regs to it in %rdi does not * hurt, we only need to re-define it here to keep the naming congruent to - * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() and SYS_NI() - * macros to work correctly. + * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() macro + * to work correctly. */ #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ @@ -257,10 +235,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); __X64_COND_SYSCALL(name) \ __IA32_COND_SYSCALL(name) -#define SYS_NI(name) \ - __X64_SYS_NI(name) \ - __IA32_SYS_NI(name) - /* * For VSYSCALLS, we need to declare these three syscalls with the new diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 29832c338cdc..0b70653a98c1 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -6,18 +6,6 @@ #include <linux/stddef.h> #include <asm/ptrace.h> -struct paravirt_patch_site; -#ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end); -#else -static inline void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) -{} -#define __parainstructions NULL -#define __parainstructions_end NULL -#endif - /* * Currently, the max observed size in the kernel code is * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 1a0dd80d81ac..85a3ce2a3666 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -293,6 +293,7 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) processor->processor_id, /* ACPI ID */ processor->lapic_flags & ACPI_MADT_ENABLED); + has_lapic_cpus = true; return 0; } @@ -1134,7 +1135,6 @@ static int __init acpi_parse_madt_lapic_entries(void) if (!count) { count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_parse_lapic, MAX_LOCAL_APIC); - has_lapic_cpus = count > 0; x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, acpi_parse_x2apic, MAX_LOCAL_APIC); } diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 73be3931e4f0..95e21596e2f9 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -160,7 +160,6 @@ extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern s32 __return_sites[], __return_sites_end[]; extern s32 __cfi_sites[], __cfi_sites_end[]; extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; -extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); @@ -255,6 +254,16 @@ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) } } +static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len) +{ + unsigned long flags; + + local_irq_save(flags); + optimize_nops(instr, len); + sync_core(); + local_irq_restore(flags); +} + /* * In this context, "source" is where the instructions are placed in the * section .altinstr_replacement, for example during kernel build by the @@ -385,6 +394,63 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) } } +/* Low-level backend functions usable from alternative code replacements. */ +DEFINE_ASM_FUNC(nop_func, "", .entry.text); +EXPORT_SYMBOL_GPL(nop_func); + +noinstr void BUG_func(void) +{ + BUG(); +} +EXPORT_SYMBOL_GPL(BUG_func); + +#define CALL_RIP_REL_OPCODE 0xff +#define CALL_RIP_REL_MODRM 0x15 + +/* + * Rewrite the "call BUG_func" replacement to point to the target of the + * indirect pv_ops call "call *disp(%ip)". + */ +static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) +{ + void *target, *bug = &BUG_func; + s32 disp; + + if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) { + pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n"); + BUG(); + } + + if (a->instrlen != 6 || + instr[0] != CALL_RIP_REL_OPCODE || + instr[1] != CALL_RIP_REL_MODRM) { + pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); + BUG(); + } + + /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ + disp = *(s32 *)(instr + 2); +#ifdef CONFIG_X86_64 + /* ff 15 00 00 00 00 call *0x0(%rip) */ + /* target address is stored at "next instruction + disp". */ + target = *(void **)(instr + a->instrlen + disp); +#else + /* ff 15 00 00 00 00 call *0x0 */ + /* target address is stored at disp. */ + target = *(void **)disp; +#endif + if (!target) + target = bug; + + /* (BUG_func - .) + (target - BUG_func) := target - . */ + *(s32 *)(insn_buff + 1) += target - bug; + + if (target == &nop_func) + return 0; + + return 5; +} + /* * Replace instructions with better alternatives for this CPU type. This runs * before SMP is initialized to avoid SMP problems with self modifying code. @@ -438,20 +504,25 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * patch if feature is *NOT* present. */ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { - optimize_nops(instr, a->instrlen); + optimize_nops_inplace(instr, a->instrlen); continue; } - DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", - (a->flags & ALT_FLAG_NOT) ? "!" : "", + DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x", a->cpuid >> 5, a->cpuid & 0x1f, instr, instr, a->instrlen, - replacement, a->replacementlen); + replacement, a->replacementlen, a->flags); memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; + if (a->flags & ALT_FLAG_DIRECT_CALL) { + insn_buff_sz = alt_replace_call(instr, insn_buff, a); + if (insn_buff_sz < 0) + continue; + } + for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; @@ -1411,46 +1482,6 @@ int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ -#ifdef CONFIG_PARAVIRT - -/* Use this to add nops to a buffer, then text_poke the whole buffer. */ -static void __init_or_module add_nops(void *insns, unsigned int len) -{ - while (len > 0) { - unsigned int noplen = len; - if (noplen > ASM_NOP_MAX) - noplen = ASM_NOP_MAX; - memcpy(insns, x86_nops[noplen], noplen); - insns += noplen; - len -= noplen; - } -} - -void __init_or_module apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) -{ - struct paravirt_patch_site *p; - char insn_buff[MAX_PATCH_LEN]; - - for (p = start; p < end; p++) { - unsigned int used; - - BUG_ON(p->len > MAX_PATCH_LEN); - /* prep the buffer with the original instructions */ - memcpy(insn_buff, p->instr, p->len); - used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len); - - BUG_ON(used > p->len); - - /* Pad the rest with nops */ - add_nops(insn_buff + used, p->len - used); - text_poke_early(p->instr, insn_buff, p->len); - } -} -extern struct paravirt_patch_site __start_parainstructions[], - __stop_parainstructions[]; -#endif /* CONFIG_PARAVIRT */ - /* * Self-test for the INT3 based CALL emulation code. * @@ -1586,28 +1617,11 @@ void __init alternative_instructions(void) */ /* - * Paravirt patching and alternative patching can be combined to - * replace a function call with a short direct code sequence (e.g. - * by setting a constant return value instead of doing that in an - * external function). - * In order to make this work the following sequence is required: - * 1. set (artificial) features depending on used paravirt - * functions which can later influence alternative patching - * 2. apply paravirt patching (generally replacing an indirect - * function call with a direct one) - * 3. apply alternative patching (e.g. replacing a direct function - * call with a custom code sequence) - * Doing paravirt patching after alternative patching would clobber - * the optimization of the custom code with a function call again. + * Make sure to set (artificial) features depending on used paravirt + * functions which can later influence alternative patching. */ paravirt_set_cap(); - /* - * First patch paravirt functions, such that we overwrite the indirect - * call with the direct call. - */ - apply_paravirt(__parainstructions, __parainstructions_end); - __apply_fineibt(__retpoline_sites, __retpoline_sites_end, __cfi_sites, __cfi_sites_end, true); @@ -1618,10 +1632,6 @@ void __init alternative_instructions(void) apply_retpolines(__retpoline_sites, __retpoline_sites_end); apply_returns(__return_sites, __return_sites_end); - /* - * Then patch alternatives, such that those paravirt calls that are in - * alternatives can be overwritten by their immediate fragments. - */ apply_alternatives(__alt_instructions, __alt_instructions_end); /* @@ -1685,8 +1695,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } else { local_irq_save(flags); memcpy(addr, opcode, len); - local_irq_restore(flags); sync_core(); + local_irq_restore(flags); /* * Could also do a CLFLUSH here to speed up CPU recovery; but diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 7139867d69cd..b295a056a4fc 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -82,7 +82,6 @@ static struct apic apic_flat __ro_after_init = { .acpi_madt_oem_check = flat_acpi_madt_oem_check, .apic_id_registered = default_apic_id_registered, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, .disable_esr = 0, @@ -154,7 +153,6 @@ static struct apic apic_physflat __ro_after_init = { .acpi_madt_oem_check = physflat_acpi_madt_oem_check, .apic_id_registered = default_apic_id_registered, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index b00d52ae84fa..9f1d553eb48f 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -47,7 +47,6 @@ static void noop_apic_write(u32 reg, u32 val) struct apic apic_noop __ro_after_init = { .name = "noop", - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 456a14c44f67..7d0c51b9d3bc 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -222,7 +222,6 @@ static const struct apic apic_numachip1 __refconst = { .probe = numachip1_probe, .acpi_madt_oem_check = numachip1_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, @@ -259,7 +258,6 @@ static const struct apic apic_numachip2 __refconst = { .probe = numachip2_probe, .acpi_madt_oem_check = numachip2_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 7ee3c486cb33..5a0d60b38e6b 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -80,7 +80,6 @@ static struct apic apic_bigsmp __ro_after_init = { .name = "bigsmp", .probe = probe_bigsmp, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 1, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 00da6cf6b07d..40c7cf180c20 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -997,7 +997,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, /* * Legacy ISA IRQ has already been allocated, just add pin to * the pin list associated with this IRQ and program the IOAPIC - * entry. The IOAPIC entry + * entry. */ if (irq_data && irq_data->parent_data) { if (!mp_check_pin_attr(irq, info)) diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 5eb3fbe472da..c0f78059f06a 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -45,7 +45,6 @@ static struct apic apic_default __ro_after_init = { .probe = probe_default, .apic_id_registered = default_apic_id_registered, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index a8306089c91b..28a7d3f2312d 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -227,7 +227,6 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 558a4a8824f4..409815a40668 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -145,7 +145,6 @@ static struct apic apic_x2apic_phys __ro_after_init = { .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 1b0d7336a28f..f1766b18dcd0 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -805,7 +805,6 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5934ee5bc087..76a5ced278c2 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -420,7 +420,7 @@ static DEFINE_MUTEX(apm_mutex); * This is for buggy BIOS's that refer to (real mode) segment 0x40 * even though they are called in protected mode. */ -static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(DESC_DATA32_BIOS, (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); static const char driver_version[] = "1.16ac"; /* no spaces */ diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index e9ad518a5003..64ad2ddea121 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -233,14 +233,13 @@ patch_call_sites(s32 *start, s32 *end, const struct core_text *ct) } static __init_or_module void -patch_paravirt_call_sites(struct paravirt_patch_site *start, - struct paravirt_patch_site *end, - const struct core_text *ct) +patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end, + const struct core_text *ct) { - struct paravirt_patch_site *p; + struct alt_instr *a; - for (p = start; p < end; p++) - patch_call(p->instr, ct); + for (a = start; a < end; a++) + patch_call((void *)&a->instr_offset + a->instr_offset, ct); } static __init_or_module void @@ -248,7 +247,7 @@ callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct) { prdbg("Patching call sites %s\n", ct->name); patch_call_sites(cs->call_start, cs->call_end, ct); - patch_paravirt_call_sites(cs->pv_start, cs->pv_end, ct); + patch_alt_call_sites(cs->alt_start, cs->alt_end, ct); prdbg("Patching call sites done%s\n", ct->name); } @@ -257,8 +256,8 @@ void __init callthunks_patch_builtin_calls(void) struct callthunk_sites cs = { .call_start = __call_sites, .call_end = __call_sites_end, - .pv_start = __parainstructions, - .pv_end = __parainstructions_end + .alt_start = __alt_instructions, + .alt_end = __alt_instructions_end }; if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a7eab05e5f29..9f42d1c59e09 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -34,87 +34,6 @@ */ static u32 nodes_per_socket = 1; -/* - * AMD errata checking - * - * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or - * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that - * have an OSVW id assigned, which it takes as first argument. Both take a - * variable number of family-specific model-stepping ranges created by - * AMD_MODEL_RANGE(). - * - * Example: - * - * const int amd_erratum_319[] = - * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), - * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), - * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); - */ - -#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } -#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } -#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ - ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) -#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) -#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) -#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) - -static const int amd_erratum_400[] = - AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), - AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); - -static const int amd_erratum_383[] = - AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); - -/* #1054: Instructions Retired Performance Counter May Be Inaccurate */ -static const int amd_erratum_1054[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); - -static const int amd_zenbleed[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf), - AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf), - AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf), - AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf)); - -static const int amd_div0[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf), - AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf)); - -static const int amd_erratum_1485[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), - AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); - -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) -{ - int osvw_id = *erratum++; - u32 range; - u32 ms; - - if (osvw_id >= 0 && osvw_id < 65536 && - cpu_has(cpu, X86_FEATURE_OSVW)) { - u64 osvw_len; - - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); - if (osvw_id < osvw_len) { - u64 osvw_bits; - - rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), - osvw_bits); - return osvw_bits & (1ULL << (osvw_id & 0x3f)); - } - } - - /* OSVW unavailable or ID unknown, match family-model-stepping range */ - ms = (cpu->x86_model << 4) | cpu->x86_stepping; - while ((range = *erratum++)) - if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && - (ms >= AMD_MODEL_RANGE_START(range)) && - (ms <= AMD_MODEL_RANGE_END(range))) - return true; - - return false; -} - static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 gprs[8] = { 0 }; @@ -616,6 +535,49 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) } resctrl_cpu_detect(c); + + /* Figure out Zen generations: */ + switch (c->x86) { + case 0x17: { + switch (c->x86_model) { + case 0x00 ... 0x2f: + case 0x50 ... 0x5f: + setup_force_cpu_cap(X86_FEATURE_ZEN1); + break; + case 0x30 ... 0x4f: + case 0x60 ... 0x7f: + case 0x90 ... 0x91: + case 0xa0 ... 0xaf: + setup_force_cpu_cap(X86_FEATURE_ZEN2); + break; + default: + goto warn; + } + break; + } + case 0x19: { + switch (c->x86_model) { + case 0x00 ... 0x0f: + case 0x20 ... 0x5f: + setup_force_cpu_cap(X86_FEATURE_ZEN3); + break; + case 0x10 ... 0x1f: + case 0x60 ... 0xaf: + setup_force_cpu_cap(X86_FEATURE_ZEN4); + break; + default: + goto warn; + } + break; + } + default: + break; + } + + return; + +warn: + WARN_ONCE(1, "Family 0x%x, model: 0x%x??\n", c->x86, c->x86_model); } static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) @@ -739,15 +701,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86 == 0x16 && c->x86_model <= 0xf) msr_set_bit(MSR_AMD64_LS_CFG, 15); - /* - * Check whether the machine is affected by erratum 400. This is - * used to select the proper idle routine and to enable the check - * whether the machine is affected in arch_post_acpi_init(), which - * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. - */ - if (cpu_has_amd_erratum(c, amd_erratum_400)) - set_cpu_bug(c, X86_BUG_AMD_E400); - early_detect_mem_encrypt(c); /* Re-enable TopologyExtensions if switched off by BIOS */ @@ -814,6 +767,16 @@ static void init_amd_k8(struct cpuinfo_x86 *c) msr_set_bit(MSR_K7_HWCR, 6); #endif set_cpu_bug(c, X86_BUG_SWAPGS_FENCE); + + /* + * Check models and steppings affected by erratum 400. This is + * used to select the proper idle routine and to enable the + * check whether the machine is affected in arch_post_acpi_subsys_init() + * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. + */ + if (c->x86_model > 0x41 || + (c->x86_model == 0x41 && c->x86_stepping >= 0x2)) + setup_force_cpu_bug(X86_BUG_AMD_E400); } static void init_amd_gh(struct cpuinfo_x86 *c) @@ -847,8 +810,17 @@ static void init_amd_gh(struct cpuinfo_x86 *c) */ msr_clear_bit(MSR_AMD64_BU_CFG2, 24); - if (cpu_has_amd_erratum(c, amd_erratum_383)) - set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); + set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); + + /* + * Check models and steppings affected by erratum 400. This is + * used to select the proper idle routine and to enable the + * check whether the machine is affected in arch_post_acpi_subsys_init() + * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. + */ + if (c->x86_model > 0x2 || + (c->x86_model == 0x2 && c->x86_stepping >= 0x1)) + setup_force_cpu_bug(X86_BUG_AMD_E400); } static void init_amd_ln(struct cpuinfo_x86 *c) @@ -941,6 +913,19 @@ static void init_amd_bd(struct cpuinfo_x86 *c) clear_rdrand_cpuid_bit(c); } +static void fix_erratum_1386(struct cpuinfo_x86 *c) +{ + /* + * Work around Erratum 1386. The XSAVES instruction malfunctions in + * certain circumstances on Zen1/2 uarch, and not all parts have had + * updated microcode at the time of writing (March 2023). + * + * Affected parts all have no supervisor XSAVE states, meaning that + * the XSAVEC instruction (which works fine) is equivalent. + */ + clear_cpu_cap(c, X86_FEATURE_XSAVES); +} + void init_spectral_chicken(struct cpuinfo_x86 *c) { #ifdef CONFIG_CPU_UNRET_ENTRY @@ -951,34 +936,28 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) * * This suppresses speculation from the middle of a basic block, i.e. it * suppresses non-branch predictions. - * - * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H */ - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) { + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); } } #endif - /* - * Work around Erratum 1386. The XSAVES instruction malfunctions in - * certain circumstances on Zen1/2 uarch, and not all parts have had - * updated microcode at the time of writing (March 2023). - * - * Affected parts all have no supervisor XSAVE states, meaning that - * the XSAVEC instruction (which works fine) is equivalent. - */ - clear_cpu_cap(c, X86_FEATURE_XSAVES); } -static void init_amd_zn(struct cpuinfo_x86 *c) +static void init_amd_zen_common(void) { - set_cpu_cap(c, X86_FEATURE_ZEN); - + setup_force_cpu_cap(X86_FEATURE_ZEN); #ifdef CONFIG_NUMA node_reclaim_distance = 32; #endif +} + +static void init_amd_zen1(struct cpuinfo_x86 *c) +{ + init_amd_zen_common(); + fix_erratum_1386(c); /* Fix up CPUID bits, but only if not virtualised. */ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { @@ -986,15 +965,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c) /* Erratum 1076: CPB feature bit not being set in CPUID. */ if (!cpu_has(c, X86_FEATURE_CPB)) set_cpu_cap(c, X86_FEATURE_CPB); - - /* - * Zen3 (Fam19 model < 0x10) parts are not susceptible to - * Branch Type Confusion, but predate the allocation of the - * BTC_NO bit. - */ - if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO)) - set_cpu_cap(c, X86_FEATURE_BTC_NO); } + + pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); + setup_force_cpu_bug(X86_BUG_DIV0); } static bool cpu_has_zenbleed_microcode(void) @@ -1018,11 +992,8 @@ static bool cpu_has_zenbleed_microcode(void) return true; } -static void zenbleed_check(struct cpuinfo_x86 *c) +static void zen2_zenbleed_check(struct cpuinfo_x86 *c) { - if (!cpu_has_amd_erratum(c, amd_zenbleed)) - return; - if (cpu_has(c, X86_FEATURE_HYPERVISOR)) return; @@ -1037,6 +1008,37 @@ static void zenbleed_check(struct cpuinfo_x86 *c) } } +static void init_amd_zen2(struct cpuinfo_x86 *c) +{ + init_amd_zen_common(); + init_spectral_chicken(c); + fix_erratum_1386(c); + zen2_zenbleed_check(c); +} + +static void init_amd_zen3(struct cpuinfo_x86 *c) +{ + init_amd_zen_common(); + + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { + /* + * Zen3 (Fam19 model < 0x10) parts are not susceptible to + * Branch Type Confusion, but predate the allocation of the + * BTC_NO bit. + */ + if (!cpu_has(c, X86_FEATURE_BTC_NO)) + set_cpu_cap(c, X86_FEATURE_BTC_NO); + } +} + +static void init_amd_zen4(struct cpuinfo_x86 *c) +{ + init_amd_zen_common(); + + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); +} + static void init_amd(struct cpuinfo_x86 *c) { u64 vm_cr; @@ -1072,11 +1074,17 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; case 0x16: init_amd_jg(c); break; - case 0x17: init_spectral_chicken(c); - fallthrough; - case 0x19: init_amd_zn(c); break; } + if (boot_cpu_has(X86_FEATURE_ZEN1)) + init_amd_zen1(c); + else if (boot_cpu_has(X86_FEATURE_ZEN2)) + init_amd_zen2(c); + else if (boot_cpu_has(X86_FEATURE_ZEN3)) + init_amd_zen3(c); + else if (boot_cpu_has(X86_FEATURE_ZEN4)) + init_amd_zen4(c); + /* * Enable workaround for FXSAVE leak on CPUs * without a XSaveErPtr feature @@ -1136,7 +1144,7 @@ static void init_amd(struct cpuinfo_x86 *c) * Counter May Be Inaccurate". */ if (cpu_has(c, X86_FEATURE_IRPERF) && - !cpu_has_amd_erratum(c, amd_erratum_1054)) + (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f)) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); @@ -1152,16 +1160,8 @@ static void init_amd(struct cpuinfo_x86 *c) cpu_has(c, X86_FEATURE_AUTOIBRS)) WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); - zenbleed_check(c); - - if (cpu_has_amd_erratum(c, amd_div0)) { - pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); - setup_force_cpu_bug(X86_BUG_DIV0); - } - - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && - cpu_has_amd_erratum(c, amd_erratum_1485)) - msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); + /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ + clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); } #ifdef CONFIG_X86_32 @@ -1315,11 +1315,14 @@ static void zenbleed_check_cpu(void *unused) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); - zenbleed_check(c); + zen2_zenbleed_check(c); } void amd_check_microcode(void) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + on_each_cpu(zenbleed_check_cpu, NULL, 1); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b14fc8c1c953..94bff381ef20 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -188,45 +188,37 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { * TLS descriptors are currently at a different place compared to i386. * Hopefully nobody expects them at a fixed place (Wine?) */ - [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), - [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA64 | DESC_USER, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE64 | DESC_USER, 0, 0xfffff), #else - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), - [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA32 | DESC_USER, 0, 0xfffff), /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ - /* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), - /* 16-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), - /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), - /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), - /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff), + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff), + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0xffff), + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0), + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0), /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. */ - /* 32-bit code */ - [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), - /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), - /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), - - [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), - [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff), + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff), + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(DESC_DATA32_BIOS, 0, 0xffff), + + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff), + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff), #endif } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); @@ -1856,6 +1848,13 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); #endif + + /* + * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and + * Hygon will clear it in ->c_init() below. + */ + set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); + /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 6f247d66758d..f0cd95502faa 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -354,6 +354,9 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); check_null_seg_clears_base(c); + + /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ + clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); } static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index e4c3ba91321c..f18d35fe27a9 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -237,4 +237,4 @@ err_out_online: cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE); return ret; } -subsys_initcall(intel_epb_init); +late_initcall(intel_epb_init); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index f3517b8a8e91..2b46eb0fdf3a 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -87,42 +87,40 @@ struct smca_bank { static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks); static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts); -struct smca_bank_name { - const char *name; /* Short name for sysfs */ - const char *long_name; /* Long name for pretty-printing */ -}; - -static struct smca_bank_name smca_names[] = { - [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" }, - [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, - [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, - [SMCA_DE] = { "decode_unit", "Decode Unit" }, - [SMCA_RESERVED] = { "reserved", "Reserved" }, - [SMCA_EX] = { "execution_unit", "Execution Unit" }, - [SMCA_FP] = { "floating_point", "Floating Point Unit" }, - [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, - [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" }, - [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, +static const char * const smca_names[] = { + [SMCA_LS ... SMCA_LS_V2] = "load_store", + [SMCA_IF] = "insn_fetch", + [SMCA_L2_CACHE] = "l2_cache", + [SMCA_DE] = "decode_unit", + [SMCA_RESERVED] = "reserved", + [SMCA_EX] = "execution_unit", + [SMCA_FP] = "floating_point", + [SMCA_L3_CACHE] = "l3_cache", + [SMCA_CS ... SMCA_CS_V2] = "coherent_slave", + [SMCA_PIE] = "pie", /* UMC v2 is separate because both of them can exist in a single system. */ - [SMCA_UMC] = { "umc", "Unified Memory Controller" }, - [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" }, - [SMCA_PB] = { "param_block", "Parameter Block" }, - [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, - [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" }, - [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, - [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" }, - [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, - [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" }, - [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" }, - [SMCA_NBIF] = { "nbif", "NBIF Unit" }, - [SMCA_SHUB] = { "shub", "System Hub Unit" }, - [SMCA_SATA] = { "sata", "SATA Unit" }, - [SMCA_USB] = { "usb", "USB Unit" }, - [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" }, - [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" }, - [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" }, - [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" }, + [SMCA_UMC] = "umc", + [SMCA_UMC_V2] = "umc_v2", + [SMCA_MA_LLC] = "ma_llc", + [SMCA_PB] = "param_block", + [SMCA_PSP ... SMCA_PSP_V2] = "psp", + [SMCA_SMU ... SMCA_SMU_V2] = "smu", + [SMCA_MP5] = "mp5", + [SMCA_MPDMA] = "mpdma", + [SMCA_NBIO] = "nbio", + [SMCA_PCIE ... SMCA_PCIE_V2] = "pcie", + [SMCA_XGMI_PCS] = "xgmi_pcs", + [SMCA_NBIF] = "nbif", + [SMCA_SHUB] = "shub", + [SMCA_SATA] = "sata", + [SMCA_USB] = "usb", + [SMCA_USR_DP] = "usr_dp", + [SMCA_USR_CP] = "usr_cp", + [SMCA_GMI_PCS] = "gmi_pcs", + [SMCA_XGMI_PHY] = "xgmi_phy", + [SMCA_WAFL_PHY] = "wafl_phy", + [SMCA_GMI_PHY] = "gmi_phy", }; static const char *smca_get_name(enum smca_bank_types t) @@ -130,17 +128,8 @@ static const char *smca_get_name(enum smca_bank_types t) if (t >= N_SMCA_BANK_TYPES) return NULL; - return smca_names[t].name; -} - -const char *smca_get_long_name(enum smca_bank_types t) -{ - if (t >= N_SMCA_BANK_TYPES) - return NULL; - - return smca_names[t].long_name; + return smca_names[t]; } -EXPORT_SYMBOL_GPL(smca_get_long_name); enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank) { @@ -178,6 +167,7 @@ static const struct smca_hwid smca_hwid_mcatypes[] = { { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) }, { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) }, { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) }, + { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) }, /* Unified Memory Controller MCA type */ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) }, @@ -212,6 +202,8 @@ static const struct smca_hwid smca_hwid_mcatypes[] = { { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) }, { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) }, { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) }, + { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) }, + { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) }, { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) }, { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) }, { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) }, diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 7b397370b4d6..fd5ce12c4f9a 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -44,6 +44,7 @@ #include <linux/sync_core.h> #include <linux/task_work.h> #include <linux/hardirq.h> +#include <linux/kexec.h> #include <asm/intel-family.h> #include <asm/processor.h> @@ -233,6 +234,7 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) struct llist_node *pending; struct mce_evt_llist *l; int apei_err = 0; + struct page *p; /* * Allow instrumentation around external facilities usage. Not that it @@ -286,6 +288,20 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) if (!fake_panic) { if (panic_timeout == 0) panic_timeout = mca_cfg.panic_timeout; + + /* + * Kdump skips the poisoned page in order to avoid + * touching the error bits again. Poison the page even + * if the error is fatal and the machine is about to + * panic. + */ + if (kexec_crash_loaded()) { + if (final && (final->status & MCI_STATUS_ADDRV)) { + p = pfn_to_online_page(final->addr >> PAGE_SHIFT); + if (p) + SetPageHWPoison(p); + } + } panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); @@ -670,6 +686,16 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) barrier(); m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + /* + * Update storm tracking here, before checking for the + * MCI_STATUS_VAL bit. Valid corrected errors count + * towards declaring, or maintaining, storm status. No + * error in a bank counts towards avoiding, or ending, + * storm status. + */ + if (!mca_cfg.cmci_disabled) + mce_track_storm(&m); + /* If this entry is not valid, ignore it */ if (!(m.status & MCI_STATUS_VAL)) continue; @@ -1601,13 +1627,6 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL; static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static unsigned long mce_adjust_timer_default(unsigned long interval) -{ - return interval; -} - -static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; - static void __start_timer(struct timer_list *t, unsigned long interval) { unsigned long when = jiffies + interval; @@ -1637,15 +1656,9 @@ static void mce_timer_fn(struct timer_list *t) iv = __this_cpu_read(mce_next_interval); - if (mce_available(this_cpu_ptr(&cpu_info))) { + if (mce_available(this_cpu_ptr(&cpu_info))) mc_poll_banks(); - if (mce_intel_cmci_poll()) { - iv = mce_adjust_timer(iv); - goto done; - } - } - /* * Alert userspace if needed. If we logged an MCE, reduce the polling * interval, otherwise increase the polling interval. @@ -1655,23 +1668,29 @@ static void mce_timer_fn(struct timer_list *t) else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); -done: - __this_cpu_write(mce_next_interval, iv); - __start_timer(t, iv); + if (mce_get_storm_mode()) { + __start_timer(t, HZ); + } else { + __this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); + } } /* - * Ensure that the timer is firing in @interval from now. + * When a storm starts on any bank on this CPU, switch to polling + * once per second. When the storm ends, revert to the default + * polling interval. */ -void mce_timer_kick(unsigned long interval) +void mce_timer_kick(bool storm) { struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned long iv = __this_cpu_read(mce_next_interval); - __start_timer(t, interval); + mce_set_storm_mode(storm); - if (interval < iv) - __this_cpu_write(mce_next_interval, interval); + if (storm) + __start_timer(t, HZ); + else + __this_cpu_write(mce_next_interval, check_interval * HZ); } /* Must not be called in IRQ context where del_timer_sync() can deadlock */ @@ -1995,7 +2014,6 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) intel_init_cmci(); intel_init_lmce(); - mce_adjust_timer = cmci_intel_adjust_timer; } static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) @@ -2008,7 +2026,6 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); - mce_adjust_timer = cmci_intel_adjust_timer; break; case X86_VENDOR_AMD: { @@ -2568,9 +2585,6 @@ static int mce_device_create(unsigned int cpu) int err; int i, j; - if (!mce_available(&boot_cpu_data)) - return -EIO; - dev = per_cpu(mce_device, cpu); if (dev) return 0; @@ -2665,8 +2679,6 @@ static void mce_reenable_cpu(void) static int mce_cpu_dead(unsigned int cpu) { - mce_intel_hcpu_update(cpu); - /* intentionally ignoring frozen here */ if (!cpuhp_tasks_frozen) cmci_rediscover(); diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 4d8d4bcf915d..72f0695c3dc1 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -746,6 +746,7 @@ static void check_hw_inj_possible(void) wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status); rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status); + wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0); if (!status) { hw_injection_possible = false; diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 52bce533ddcc..399b62e223d2 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -42,15 +42,6 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); /* - * CMCI storm detection backoff counter - * - * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've - * encountered an error. If not, we decrement it by one. We signal the end of - * the CMCI storm when it reaches 0. - */ -static DEFINE_PER_CPU(int, cmci_backoff_cnt); - -/* * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. */ @@ -63,22 +54,26 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock); */ static DEFINE_SPINLOCK(cmci_poll_lock); +/* Linux non-storm CMCI threshold (may be overridden by BIOS) */ #define CMCI_THRESHOLD 1 -#define CMCI_POLL_INTERVAL (30 * HZ) -#define CMCI_STORM_INTERVAL (HZ) -#define CMCI_STORM_THRESHOLD 15 -static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); -static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); -static DEFINE_PER_CPU(unsigned int, cmci_storm_state); - -enum { - CMCI_STORM_NONE, - CMCI_STORM_ACTIVE, - CMCI_STORM_SUBSIDED, -}; +/* + * MCi_CTL2 threshold for each bank when there is no storm. + * Default value for each bank may have been set by BIOS. + */ +static u16 cmci_threshold[MAX_NR_BANKS]; -static atomic_t cmci_storm_on_cpus; +/* + * High threshold to limit CMCI rate during storms. Max supported is + * 0x7FFF. Use this slightly smaller value so it has a distinctive + * signature when some asks "Why am I not seeing all corrected errors?" + * A high threshold is used instead of just disabling CMCI for a + * bank because both corrected and uncorrected errors may be logged + * in the same bank and signalled with CMCI. The threshold only applies + * to corrected errors, so keeping CMCI enabled means that uncorrected + * errors will still be processed in a timely fashion. + */ +#define CMCI_STORM_THRESHOLD 32749 static int cmci_supported(int *banks) { @@ -134,204 +129,166 @@ static bool lmce_supported(void) return tmp & FEAT_CTL_LMCE_ENABLED; } -bool mce_intel_cmci_poll(void) +/* + * Set a new CMCI threshold value. Preserve the state of the + * MCI_CTL2_CMCI_EN bit in case this happens during a + * cmci_rediscover() operation. + */ +static void cmci_set_threshold(int bank, int thresh) { - if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) - return false; - - /* - * Reset the counter if we've logged an error in the last poll - * during the storm. - */ - if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned))) - this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); - else - this_cpu_dec(cmci_backoff_cnt); + unsigned long flags; + u64 val; - return true; + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } -void mce_intel_hcpu_update(unsigned long cpu) +void mce_intel_handle_storm(int bank, bool on) { - if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) - atomic_dec(&cmci_storm_on_cpus); + if (on) + cmci_set_threshold(bank, CMCI_STORM_THRESHOLD); + else + cmci_set_threshold(bank, cmci_threshold[bank]); +} - per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. + * This could in theory increase the threshold under high load, + * but doesn't for now. + */ +static void intel_threshold_interrupt(void) +{ + machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); } -static void cmci_toggle_interrupt_mode(bool on) +/* + * Check all the reasons why current CPU cannot claim + * ownership of a bank. + * 1: CPU already owns this bank + * 2: BIOS owns this bank + * 3: Some other CPU owns this bank + */ +static bool cmci_skip_bank(int bank, u64 *val) { - unsigned long flags, *owned; - int bank; - u64 val; + unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned); - raw_spin_lock_irqsave(&cmci_discover_lock, flags); - owned = this_cpu_ptr(mce_banks_owned); - for_each_set_bit(bank, owned, MAX_NR_BANKS) { - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + if (test_bit(bank, owned)) + return true; - if (on) - val |= MCI_CTL2_CMCI_EN; - else - val &= ~MCI_CTL2_CMCI_EN; + /* Skip banks in firmware first mode */ + if (test_bit(bank, mce_banks_ce_disabled)) + return true; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); - } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); -} + rdmsrl(MSR_IA32_MCx_CTL2(bank), *val); -unsigned long cmci_intel_adjust_timer(unsigned long interval) -{ - if ((this_cpu_read(cmci_backoff_cnt) > 0) && - (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) { - mce_notify_irq(); - return CMCI_STORM_INTERVAL; + /* Already owned by someone else? */ + if (*val & MCI_CTL2_CMCI_EN) { + clear_bit(bank, owned); + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + return true; } - switch (__this_cpu_read(cmci_storm_state)) { - case CMCI_STORM_ACTIVE: - - /* - * We switch back to interrupt mode once the poll timer has - * silenced itself. That means no events recorded and the timer - * interval is back to our poll interval. - */ - __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); - if (!atomic_sub_return(1, &cmci_storm_on_cpus)) - pr_notice("CMCI storm subsided: switching to interrupt mode\n"); + return false; +} - fallthrough; +/* + * Decide which CMCI interrupt threshold to use: + * 1: If this bank is in storm mode from whichever CPU was + * the previous owner, stay in storm mode. + * 2: If ignoring any threshold set by BIOS, set Linux default + * 3: Try to honor BIOS threshold (unless buggy BIOS set it at zero). + */ +static u64 cmci_pick_threshold(u64 val, int *bios_zero_thresh) +{ + if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) + return val; - case CMCI_STORM_SUBSIDED: + if (!mca_cfg.bios_cmci_threshold) { + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= CMCI_THRESHOLD; + } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { /* - * We wait for all CPUs to go back to SUBSIDED state. When that - * happens we switch back to interrupt mode. + * If bios_cmci_threshold boot option was specified + * but the threshold is zero, we'll try to initialize + * it to 1. */ - if (!atomic_read(&cmci_storm_on_cpus)) { - __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); - cmci_toggle_interrupt_mode(true); - cmci_recheck(); - } - return CMCI_POLL_INTERVAL; - default: - - /* We have shiny weather. Let the poll do whatever it thinks. */ - return interval; + *bios_zero_thresh = 1; + val |= CMCI_THRESHOLD; } + + return val; } -static bool cmci_storm_detect(void) +/* + * Try to claim ownership of a bank. + */ +static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_wrong_thresh) { - unsigned int cnt = __this_cpu_read(cmci_storm_cnt); - unsigned long ts = __this_cpu_read(cmci_time_stamp); - unsigned long now = jiffies; - int r; + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); - if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) - return true; + val |= MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); - if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { - cnt++; - } else { - cnt = 1; - __this_cpu_write(cmci_time_stamp, now); + /* If the enable bit did not stick, this bank should be polled. */ + if (!(val & MCI_CTL2_CMCI_EN)) { + WARN_ON(!test_bit(bank, this_cpu_ptr(mce_poll_banks))); + storm->banks[bank].poll_only = true; + return; } - __this_cpu_write(cmci_storm_cnt, cnt); - if (cnt <= CMCI_STORM_THRESHOLD) - return false; - - cmci_toggle_interrupt_mode(false); - __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); - r = atomic_add_return(1, &cmci_storm_on_cpus); - mce_timer_kick(CMCI_STORM_INTERVAL); - this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); + /* This CPU successfully set the enable bit. */ + set_bit(bank, (void *)this_cpu_ptr(&mce_banks_owned)); - if (r == 1) - pr_notice("CMCI storm detected: switching to poll mode\n"); - return true; -} + if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) { + pr_notice("CPU%d BANK%d CMCI inherited storm\n", smp_processor_id(), bank); + mce_inherit_storm(bank); + cmci_storm_begin(bank); + } else { + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + } -/* - * The interrupt handler. This is called on every event. - * Just call the poller directly to log any events. - * This could in theory increase the threshold under high load, - * but doesn't for now. - */ -static void intel_threshold_interrupt(void) -{ - if (cmci_storm_detect()) - return; + /* + * We are able to set thresholds for some banks that + * had a threshold of 0. This means the BIOS has not + * set the thresholds properly or does not work with + * this boot option. Note down now and report later. + */ + if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && + (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) + *bios_wrong_thresh = 1; - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); + /* Save default threshold for each bank */ + if (cmci_threshold[bank] == 0) + cmci_threshold[bank] = val & MCI_CTL2_CMCI_THRESHOLD_MASK; } /* * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks * on this CPU. Use the algorithm recommended in the SDM to discover shared - * banks. + * banks. Called during initial bootstrap, and also for hotplug CPU operations + * to rediscover/reassign machine check banks. */ static void cmci_discover(int banks) { - unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned); + int bios_wrong_thresh = 0; unsigned long flags; int i; - int bios_wrong_thresh = 0; raw_spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; int bios_zero_thresh = 0; - if (test_bit(i, owned)) + if (cmci_skip_bank(i, &val)) continue; - /* Skip banks in firmware first mode */ - if (test_bit(i, mce_banks_ce_disabled)) - continue; - - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - - /* Already owned by someone else? */ - if (val & MCI_CTL2_CMCI_EN) { - clear_bit(i, owned); - __clear_bit(i, this_cpu_ptr(mce_poll_banks)); - continue; - } - - if (!mca_cfg.bios_cmci_threshold) { - val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; - val |= CMCI_THRESHOLD; - } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { - /* - * If bios_cmci_threshold boot option was specified - * but the threshold is zero, we'll try to initialize - * it to 1. - */ - bios_zero_thresh = 1; - val |= CMCI_THRESHOLD; - } - - val |= MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(i), val); - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - - /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & MCI_CTL2_CMCI_EN) { - set_bit(i, owned); - __clear_bit(i, this_cpu_ptr(mce_poll_banks)); - /* - * We are able to set thresholds for some banks that - * had a threshold of 0. This means the BIOS has not - * set the thresholds properly or does not work with - * this boot option. Note down now and report later. - */ - if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && - (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) - bios_wrong_thresh = 1; - } else { - WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks))); - } + val = cmci_pick_threshold(val, &bios_zero_thresh); + cmci_claim_bank(i, val, bios_zero_thresh, &bios_wrong_thresh); } raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { @@ -370,6 +327,9 @@ static void __cmci_disable_bank(int bank) val &= ~MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(bank), val); __clear_bit(bank, this_cpu_ptr(mce_banks_owned)); + + if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) + cmci_storm_end(bank); } /* diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index e13a26c9c0ac..01f8f03969e6 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -41,9 +41,7 @@ struct dentry *mce_get_debugfs_dir(void); extern mce_banks_t mce_banks_ce_disabled; #ifdef CONFIG_X86_MCE_INTEL -unsigned long cmci_intel_adjust_timer(unsigned long interval); -bool mce_intel_cmci_poll(void); -void mce_intel_hcpu_update(unsigned long cpu); +void mce_intel_handle_storm(int bank, bool on); void cmci_disable_bank(int bank); void intel_init_cmci(void); void intel_init_lmce(void); @@ -51,9 +49,7 @@ void intel_clear_lmce(void); bool intel_filter_mce(struct mce *m); bool intel_mce_usable_address(struct mce *m); #else -# define cmci_intel_adjust_timer mce_adjust_timer_default -static inline bool mce_intel_cmci_poll(void) { return false; } -static inline void mce_intel_hcpu_update(unsigned long cpu) { } +static inline void mce_intel_handle_storm(int bank, bool on) { } static inline void cmci_disable_bank(int bank) { } static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } @@ -62,7 +58,63 @@ static inline bool intel_filter_mce(struct mce *m) { return false; } static inline bool intel_mce_usable_address(struct mce *m) { return false; } #endif -void mce_timer_kick(unsigned long interval); +void mce_timer_kick(bool storm); + +#ifdef CONFIG_X86_MCE_THRESHOLD +void cmci_storm_begin(unsigned int bank); +void cmci_storm_end(unsigned int bank); +void mce_track_storm(struct mce *mce); +void mce_inherit_storm(unsigned int bank); +bool mce_get_storm_mode(void); +void mce_set_storm_mode(bool storm); +#else +static inline void cmci_storm_begin(unsigned int bank) {} +static inline void cmci_storm_end(unsigned int bank) {} +static inline void mce_track_storm(struct mce *mce) {} +static inline void mce_inherit_storm(unsigned int bank) {} +static inline bool mce_get_storm_mode(void) { return false; } +static inline void mce_set_storm_mode(bool storm) {} +#endif + +/* + * history: Bitmask tracking errors occurrence. Each set bit + * represents an error seen. + * + * timestamp: Last time (in jiffies) that the bank was polled. + * in_storm_mode: Is this bank in storm mode? + * poll_only: Bank does not support CMCI, skip storm tracking. + */ +struct storm_bank { + u64 history; + u64 timestamp; + bool in_storm_mode; + bool poll_only; +}; + +#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE) + +/* How many errors within the history buffer mark the start of a storm. */ +#define STORM_BEGIN_THRESHOLD 5 + +/* + * How many polls of machine check bank without an error before declaring + * the storm is over. Since it is tracked by the bitmasks in the history + * field of struct storm_bank the mask is 30 bits [0 ... 29]. + */ +#define STORM_END_POLL_THRESHOLD 29 + +/* + * banks: per-cpu, per-bank details + * stormy_bank_count: count of MC banks in storm state + * poll_mode: CPU is in poll mode + */ +struct mca_storm_desc { + struct storm_bank banks[MAX_NR_BANKS]; + u8 stormy_bank_count; + bool poll_mode; +}; + +DECLARE_PER_CPU(struct mca_storm_desc, storm_desc); #ifdef CONFIG_ACPI_APEI int apei_write_mce(struct mce *m); diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index ef4e7bb5fd88..89e31e1e5c9c 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -29,3 +29,118 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); apic_eoi(); } + +DEFINE_PER_CPU(struct mca_storm_desc, storm_desc); + +void mce_inherit_storm(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + /* + * Previous CPU owning this bank had put it into storm mode, + * but the precise history of that storm is unknown. Assume + * the worst (all recent polls of the bank found a valid error + * logged). This will avoid the new owner prematurely declaring + * the storm has ended. + */ + storm->banks[bank].history = ~0ull; + storm->banks[bank].timestamp = jiffies; +} + +bool mce_get_storm_mode(void) +{ + return __this_cpu_read(storm_desc.poll_mode); +} + +void mce_set_storm_mode(bool storm) +{ + __this_cpu_write(storm_desc.poll_mode, storm); +} + +static void mce_handle_storm(unsigned int bank, bool on) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_handle_storm(bank, on); + break; + } +} + +void cmci_storm_begin(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + __set_bit(bank, this_cpu_ptr(mce_poll_banks)); + storm->banks[bank].in_storm_mode = true; + + /* + * If this is the first bank on this CPU to enter storm mode + * start polling. + */ + if (++storm->stormy_bank_count == 1) + mce_timer_kick(true); +} + +void cmci_storm_end(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + storm->banks[bank].history = 0; + storm->banks[bank].in_storm_mode = false; + + /* If no banks left in storm mode, stop polling. */ + if (!this_cpu_dec_return(storm_desc.stormy_bank_count)) + mce_timer_kick(false); +} + +void mce_track_storm(struct mce *mce) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + unsigned long now = jiffies, delta; + unsigned int shift = 1; + u64 history = 0; + + /* No tracking needed for banks that do not support CMCI */ + if (storm->banks[mce->bank].poll_only) + return; + + /* + * When a bank is in storm mode it is polled once per second and + * the history mask will record about the last minute of poll results. + * If it is not in storm mode, then the bank is only checked when + * there is a CMCI interrupt. Check how long it has been since + * this bank was last checked, and adjust the amount of "shift" + * to apply to history. + */ + if (!storm->banks[mce->bank].in_storm_mode) { + delta = now - storm->banks[mce->bank].timestamp; + shift = (delta + HZ) / HZ; + } + + /* If it has been a long time since the last poll, clear history. */ + if (shift < NUM_HISTORY_BITS) + history = storm->banks[mce->bank].history << shift; + + storm->banks[mce->bank].timestamp = now; + + /* History keeps track of corrected errors. VAL=1 && UC=0 */ + if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce)) + history |= 1; + + storm->banks[mce->bank].history = history; + + if (storm->banks[mce->bank].in_storm_mode) { + if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0)) + return; + printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank); + mce_handle_storm(mce->bank, false); + cmci_storm_end(mce->bank); + } else { + if (hweight64(history) < STORM_BEGIN_THRESHOLD) + return; + printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank); + mce_handle_storm(mce->bank, true); + cmci_storm_begin(mce->bank); + } +} diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 070426b9895f..857e608af641 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -370,14 +370,14 @@ static __init struct microcode_intel *get_microcode_blob(struct ucode_cpu_info * { struct cpio_data cp; + intel_collect_cpu_info(&uci->cpu_sig); + if (!load_builtin_intel_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path); if (!(cp.data && cp.size)) return NULL; - intel_collect_cpu_info(&uci->cpu_sig); - return scan_microcode(cp.data, cp.size, uci, save); } @@ -410,13 +410,13 @@ void __init load_ucode_intel_bsp(struct early_load_data *ed) { struct ucode_cpu_info uci; - ed->old_rev = intel_get_microcode_revision(); - uci.mc = get_microcode_blob(&uci, false); - if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) - ucode_patch_va = UCODE_BSP_LOADED; + ed->old_rev = uci.cpu_sig.rev; - ed->new_rev = uci.cpu_sig.rev; + if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) { + ucode_patch_va = UCODE_BSP_LOADED; + ed->new_rev = uci.cpu_sig.rev; + } } void load_ucode_intel_ap(void) @@ -457,12 +457,6 @@ static enum ucode_state apply_microcode_late(int cpu) if (ret != UCODE_UPDATED && ret != UCODE_OK) return ret; - if (!cpu && uci->cpu_sig.rev != cur_rev) { - pr_info("Updated to revision 0x%x, date = %04x-%02x-%02x\n", - uci->cpu_sig.rev, mc->hdr.date & 0xffff, mc->hdr.date >> 24, - (mc->hdr.date >> 16) & 0xff); - } - cpu_data(cpu).microcode = uci->cpu_sig.rev; if (!cpu) boot_cpu_data.microcode = uci->cpu_sig.rev; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 05a110c97111..dc0956067944 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -71,9 +71,9 @@ EXPORT_SYMBOL(vmemmap_base); * GDT used on the boot CPU before switching to virtual addresses. */ static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = { - [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), - [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff), }; /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 086a2c3aaaa0..f479e5e793a2 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -114,6 +114,28 @@ SYM_CODE_START_NOALIGN(startup_64) /* Form the CR3 value being sure to include the CR3 modifier */ addq $(early_top_pgt - __START_KERNEL_map), %rax + +#ifdef CONFIG_AMD_MEM_ENCRYPT + mov %rax, %rdi + mov %rax, %r14 + + addq phys_base(%rip), %rdi + + /* + * For SEV guests: Verify that the C-bit is correct. A malicious + * hypervisor could lie about the C-bit position to perform a ROP + * attack on the guest by writing to the unencrypted stack and wait for + * the next RET instruction. + */ + call sev_verify_cbit + + /* + * Restore CR3 value without the phys_base which will be added + * below, before writing %cr3. + */ + mov %r14, %rax +#endif + jmp 1f SYM_CODE_END(startup_64) @@ -193,15 +215,6 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) addq phys_base(%rip), %rax /* - * For SEV guests: Verify that the C-bit is correct. A malicious - * hypervisor could lie about the C-bit position to perform a ROP - * attack on the guest by writing to the unencrypted stack and wait for - * the next RET instruction. - */ - movq %rax, %rdi - call sev_verify_cbit - - /* * Switch to new page-table * * For the boot CPU this switches to early_top_pgt which still has the @@ -255,6 +268,22 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) testl $X2APIC_ENABLE, %eax jnz .Lread_apicid_msr +#ifdef CONFIG_X86_X2APIC + /* + * If system is in X2APIC mode then MMIO base might not be + * mapped causing the MMIO read below to fault. Faults can't + * be handled at that point. + */ + cmpl $0, x2apic_mode(%rip) + jz .Lread_apicid_mmio + + /* Force the AP into X2APIC mode. */ + orl $X2APIC_ENABLE, %eax + wrmsr + jmp .Lread_apicid_msr +#endif + +.Lread_apicid_mmio: /* Read the APIC ID from the fix-mapped MMIO space. */ movq apic_mmio_base(%rip), %rcx addq $APIC_ID, %rcx diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 8857abc706e4..660b601f1d6c 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -121,7 +121,7 @@ static const __initconst struct idt_data def_idts[] = { static const struct idt_data ia32_idt[] __initconst = { #if defined(CONFIG_IA32_EMULATION) - SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), + SYSG(IA32_SYSCALL_VECTOR, asm_int80_emulation), #elif defined(CONFIG_X86_32) SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32), #endif diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index e8babebad7b8..a0ce46c0a2d8 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -576,7 +576,8 @@ static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs) { unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg]; - int3_emulate_call(regs, regs_get_register(regs, offs)); + int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + p->ainsn.size); + int3_emulate_jmp(regs, regs_get_register(regs, offs)); } NOKPROBE_SYMBOL(kprobe_emulate_call_indirect); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 0ddb3bd0f1aa..c461c1a4b6af 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -803,8 +803,8 @@ extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \ "setne %al\n\t" -DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted, - PV_VCPU_PREEMPTED_ASM, .text); +DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted, + PV_VCPU_PREEMPTED_ASM, .text); #endif static void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 5f71a0cf4399..e18914c0e38a 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -276,7 +276,7 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *alt = NULL, *locks = NULL, - *para = NULL, *orc = NULL, *orc_ip = NULL, + *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, *calls = NULL, *cfi = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; @@ -286,8 +286,6 @@ int module_finalize(const Elf_Ehdr *hdr, alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) locks = s; - if (!strcmp(".parainstructions", secstrings + s->sh_name)) - para = s; if (!strcmp(".orc_unwind", secstrings + s->sh_name)) orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) @@ -304,14 +302,6 @@ int module_finalize(const Elf_Ehdr *hdr, ibt_endbr = s; } - /* - * See alternative_instructions() for the ordering rules between the - * various patching types. - */ - if (para) { - void *pseg = (void *)para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } if (retpolines || cfi) { void *rseg = NULL, *cseg = NULL; unsigned int rsize = 0, csize = 0; @@ -341,7 +331,7 @@ int module_finalize(const Elf_Ehdr *hdr, void *aseg = (void *)alt->sh_addr; apply_alternatives(aseg, aseg + alt->sh_size); } - if (calls || para) { + if (calls || alt) { struct callthunk_sites cs = {}; if (calls) { @@ -349,9 +339,9 @@ int module_finalize(const Elf_Ehdr *hdr, cs.call_end = (void *)calls->sh_addr + calls->sh_size; } - if (para) { - cs.pv_start = (void *)para->sh_addr; - cs.pv_end = (void *)para->sh_addr + para->sh_size; + if (alt) { + cs.alt_start = (void *)alt->sh_addr; + cs.alt_end = (void *)alt->sh_addr + alt->sh_size; } callthunks_patch_module_calls(&cs, me); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 97f1436c1a20..5358d43886ad 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -34,14 +34,8 @@ #include <asm/io_bitmap.h> #include <asm/gsseg.h> -/* - * nop stub, which must not clobber anything *including the stack* to - * avoid confusing the entry prologues. - */ -DEFINE_PARAVIRT_ASM(_paravirt_nop, "", .entry.text); - /* stub always returning 0. */ -DEFINE_PARAVIRT_ASM(paravirt_ret0, "xor %eax,%eax", .entry.text); +DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text); void __init default_banner(void) { @@ -49,26 +43,12 @@ void __init default_banner(void) pv_info.name); } -/* Undefined instruction for dealing with missing ops pointers. */ -noinstr void paravirt_BUG(void) -{ - BUG(); -} - -static unsigned paravirt_patch_call(void *insn_buff, const void *target, - unsigned long addr, unsigned len) -{ - __text_gen_insn(insn_buff, CALL_INSN_OPCODE, - (void *)addr, target, CALL_INSN_SIZE); - return CALL_INSN_SIZE; -} - #ifdef CONFIG_PARAVIRT_XXL -DEFINE_PARAVIRT_ASM(_paravirt_ident_64, "mov %rdi, %rax", .text); -DEFINE_PARAVIRT_ASM(pv_native_save_fl, "pushf; pop %rax", .noinstr.text); -DEFINE_PARAVIRT_ASM(pv_native_irq_disable, "cli", .noinstr.text); -DEFINE_PARAVIRT_ASM(pv_native_irq_enable, "sti", .noinstr.text); -DEFINE_PARAVIRT_ASM(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); +DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text); +DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text); +DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text); +DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text); +DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); #endif DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); @@ -85,28 +65,6 @@ static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) tlb_remove_page(tlb, table); } -unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, - unsigned int len) -{ - /* - * Neat trick to map patch type back to the call within the - * corresponding structure. - */ - void *opfunc = *((void **)&pv_ops + type); - unsigned ret; - - if (opfunc == NULL) - /* If there's no function, patch it with paravirt_BUG() */ - ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len); - else if (opfunc == _paravirt_nop) - ret = 0; - else - /* Otherwise call the function. */ - ret = paravirt_patch_call(insn_buff, opfunc, addr, len); - - return ret; -} - struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2c97bf7b56ae..b30d6e180df7 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -106,8 +106,8 @@ void __init pcpu_populate_pte(unsigned long addr) static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 - struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu), - 0xFFFFF); + struct desc_struct d = GDT_ENTRY_INIT(DESC_DATA32, + per_cpu_offset(cpu), 0xFFFFF); write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S); #endif diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 70472eebe719..c67285824e82 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1234,10 +1234,6 @@ void setup_ghcb(void) if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) return; - /* First make sure the hypervisor talks a supported protocol. */ - if (!sev_es_negotiate_protocol()) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); - /* * Check whether the runtime #VC exception handler is active. It uses * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). @@ -1255,6 +1251,13 @@ void setup_ghcb(void) } /* + * Make sure the hypervisor talks a supported protocol. + * This gets called only in the BSP boot phase. + */ + if (!sev_es_negotiate_protocol()) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + /* * Clear the boot_ghcb. The first exception comes in before the bss * section is cleared. */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 54a5596adaa6..a349dbfc6d5a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -267,19 +267,6 @@ SECTIONS } #endif - /* - * start address and size of operations which during runtime - * can be patched with virtualization friendly instructions or - * baremetal native ones. Think page table operations. - * Details in paravirt_types.h - */ - . = ALIGN(8); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - #ifdef CONFIG_RETPOLINE /* * List of instructions that call/jmp/jcc to retpoline thunks diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index ee8c4c3496ed..eea6ea7f14af 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -182,6 +182,7 @@ static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file) } static const struct file_operations mmu_rmaps_stat_fops = { + .owner = THIS_MODULE, .open = kvm_mmu_rmaps_stat_open, .read = seq_read, .llseek = seq_lseek, diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 238afd7335e4..4943f6b2bbee 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2388,7 +2388,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *h if (!eventfd) return HV_STATUS_INVALID_PORT_ID; - eventfd_signal(eventfd, 1); + eventfd_signal(eventfd); return HV_STATUS_SUCCESS; } diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 4900c078045a..6ee925d66648 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2972,6 +2972,25 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux); } + + /* + * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if + * the host/guest supports its use. + * + * guest_can_use() checks a number of requirements on the host/guest to + * ensure that MSR_IA32_XSS is available, but it might report true even + * if X86_FEATURE_XSAVES isn't configured in the guest to ensure host + * MSR_IA32_XSS is always properly restored. For SEV-ES, it is better + * to further check that the guest CPUID actually supports + * X86_FEATURE_XSAVES so that accesses to MSR_IA32_XSS by misbehaved + * guests will still get intercepted and caught in the normal + * kvm_emulate_rdmsr()/kvm_emulated_wrmsr() paths. + */ + if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1); + else + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0); } void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 712146312358..a8bd4e909a1e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -103,6 +103,7 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, { .index = MSR_IA32_LASTINTFROMIP, .always = false }, { .index = MSR_IA32_LASTINTTOIP, .always = false }, + { .index = MSR_IA32_XSS, .always = false }, { .index = MSR_EFER, .always = false }, { .index = MSR_IA32_CR_PAT, .always = false }, { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, @@ -1855,15 +1856,17 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) bool old_paging = is_paging(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { vcpu->arch.efer |= EFER_LMA; - svm->vmcb->save.efer |= EFER_LMA | EFER_LME; + if (!vcpu->arch.guest_state_protected) + svm->vmcb->save.efer |= EFER_LMA | EFER_LME; } if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { vcpu->arch.efer &= ~EFER_LMA; - svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); + if (!vcpu->arch.guest_state_protected) + svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); } } #endif diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index be67ab7fdd10..c409f934c377 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -30,7 +30,7 @@ #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 46 +#define MAX_DIRECT_ACCESS_MSRS 47 #define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2c924075f6f1..1a3aaa7dafae 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5518,8 +5518,8 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region, - sizeof(guest_xsave->region)); + kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region, + sizeof(guest_xsave->region)); } static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, @@ -13031,7 +13031,10 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return true; - return vcpu->arch.preempted_in_kernel; + if (vcpu != kvm_get_running_vcpu()) + return vcpu->arch.preempted_in_kernel; + + return static_call(kvm_x86_get_cpl)(vcpu) == 0; } unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index e53fad915a62..523bb6df5ac9 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -2088,7 +2088,7 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r) if (ret < 0 && ret != -ENOTCONN) return false; } else { - eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1); + eventfd_signal(evtchnfd->deliver.eventfd.ctx); } *r = 0; diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c index cea25ca8b8cf..c9dae65ac01b 100644 --- a/arch/x86/lib/csum-partial_64.c +++ b/arch/x86/lib/csum-partial_64.c @@ -11,26 +11,23 @@ #include <asm/checksum.h> #include <asm/word-at-a-time.h> -static inline unsigned short from32to16(unsigned a) +static inline __wsum csum_finalize_sum(u64 temp64) { - unsigned short b = a >> 16; - asm("addw %w2,%w0\n\t" - "adcw $0,%w0\n" - : "=r" (b) - : "0" (b), "r" (a)); - return b; + return (__force __wsum)((temp64 + ror64(temp64, 32)) >> 32); } -static inline __wsum csum_tail(u64 temp64, int odd) +static inline unsigned long update_csum_40b(unsigned long sum, const unsigned long m[5]) { - unsigned int result; - - result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff); - if (unlikely(odd)) { - result = from32to16(result); - result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); - } - return (__force __wsum)result; + asm("addq %1,%0\n\t" + "adcq %2,%0\n\t" + "adcq %3,%0\n\t" + "adcq %4,%0\n\t" + "adcq %5,%0\n\t" + "adcq $0,%0" + :"+r" (sum) + :"m" (m[0]), "m" (m[1]), "m" (m[2]), + "m" (m[3]), "m" (m[4])); + return sum; } /* @@ -47,64 +44,32 @@ static inline __wsum csum_tail(u64 temp64, int odd) __wsum csum_partial(const void *buff, int len, __wsum sum) { u64 temp64 = (__force u64)sum; - unsigned odd; - odd = 1 & (unsigned long) buff; - if (unlikely(odd)) { - if (unlikely(len == 0)) - return sum; - temp64 = ror32((__force u32)sum, 8); - temp64 += (*(unsigned char *)buff << 8); - len--; - buff++; + /* Do two 40-byte chunks in parallel to get better ILP */ + if (likely(len >= 80)) { + u64 temp64_2 = 0; + do { + temp64 = update_csum_40b(temp64, buff); + temp64_2 = update_csum_40b(temp64_2, buff + 40); + buff += 80; + len -= 80; + } while (len >= 80); + + asm("addq %1,%0\n\t" + "adcq $0,%0" + :"+r" (temp64): "r" (temp64_2)); } /* - * len == 40 is the hot case due to IPv6 headers, but annotating it likely() - * has noticeable negative affect on codegen for all other cases with - * minimal performance benefit here. + * len == 40 is the hot case due to IPv6 headers, so return + * early for that exact case without checking the tail bytes. */ - if (len == 40) { - asm("addq 0*8(%[src]),%[res]\n\t" - "adcq 1*8(%[src]),%[res]\n\t" - "adcq 2*8(%[src]),%[res]\n\t" - "adcq 3*8(%[src]),%[res]\n\t" - "adcq 4*8(%[src]),%[res]\n\t" - "adcq $0,%[res]" - : [res] "+r"(temp64) - : [src] "r"(buff), "m"(*(const char(*)[40])buff)); - return csum_tail(temp64, odd); - } - if (unlikely(len >= 64)) { - /* - * Extra accumulators for better ILP in the loop. - */ - u64 tmp_accum, tmp_carries; - - asm("xorl %k[tmp_accum],%k[tmp_accum]\n\t" - "xorl %k[tmp_carries],%k[tmp_carries]\n\t" - "subl $64, %[len]\n\t" - "1:\n\t" - "addq 0*8(%[src]),%[res]\n\t" - "adcq 1*8(%[src]),%[res]\n\t" - "adcq 2*8(%[src]),%[res]\n\t" - "adcq 3*8(%[src]),%[res]\n\t" - "adcl $0,%k[tmp_carries]\n\t" - "addq 4*8(%[src]),%[tmp_accum]\n\t" - "adcq 5*8(%[src]),%[tmp_accum]\n\t" - "adcq 6*8(%[src]),%[tmp_accum]\n\t" - "adcq 7*8(%[src]),%[tmp_accum]\n\t" - "adcl $0,%k[tmp_carries]\n\t" - "addq $64, %[src]\n\t" - "subl $64, %[len]\n\t" - "jge 1b\n\t" - "addq %[tmp_accum],%[res]\n\t" - "adcq %[tmp_carries],%[res]\n\t" - "adcq $0,%[res]" - : [tmp_accum] "=&r"(tmp_accum), - [tmp_carries] "=&r"(tmp_carries), [res] "+r"(temp64), - [len] "+r"(len), [src] "+r"(buff) - : "m"(*(const char *)buff)); + if (len >= 40) { + temp64 = update_csum_40b(temp64, buff); + len -= 40; + if (!len) + return csum_finalize_sum(temp64); + buff += 40; } if (len & 32) { @@ -143,7 +108,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum) : [res] "+r"(temp64) : [trail] "r"(trail)); } - return csum_tail(temp64, odd); + return csum_finalize_sum(temp64); } EXPORT_SYMBOL(csum_partial); diff --git a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c index 92cd8ecc3a2c..40b81c338ae5 100644 --- a/arch/x86/lib/misc.c +++ b/arch/x86/lib/misc.c @@ -8,7 +8,7 @@ */ int num_digits(int val) { - int m = 10; + long long m = 10; int d = 1; if (val < 0) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ab778eac1952..679b09cfe241 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1370,6 +1370,8 @@ void do_user_addr_fault(struct pt_regs *regs, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index a68f2dda0948..70b91de2e053 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -32,6 +32,7 @@ #include <asm/msr.h> #include <asm/cmdline.h> #include <asm/sev.h> +#include <asm/ia32.h> #include "mm_internal.h" @@ -481,6 +482,16 @@ void __init sme_early_init(void) */ if (sev_status & MSR_AMD64_SEV_ES_ENABLED) x86_cpuinit.parallel_bringup = false; + + /* + * The VMM is capable of injecting interrupt 0x80 and triggering the + * compatibility syscall path. + * + * By default, the 32-bit emulation is disabled in order to ensure + * the safety of the VM. + */ + if (sev_status & MSR_AMD64_SEV_ENABLED) + ia32_disable(); } void __init mem_encrypt_free_decrypted_mem(void) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8c10d9abc239..e89e415aa743 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3025,3 +3025,49 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp #endif WARN(1, "verification of programs using bpf_throw should have failed\n"); } + +void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old) +{ + u8 *old_addr, *new_addr, *old_bypass_addr; + int ret; + + old_bypass_addr = old ? NULL : poke->bypass_addr; + old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; + new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; + + /* + * On program loading or teardown, the program's kallsym entry + * might not be in place, so we use __bpf_arch_text_poke to skip + * the kallsyms check. + */ + if (new) { + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, new_addr); + BUG_ON(ret < 0); + if (!old) { + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + poke->bypass_addr, + NULL); + BUG_ON(ret < 0); + } + } else { + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + old_bypass_addr, + poke->bypass_addr); + BUG_ON(ret < 0); + /* let other CPUs finish the execution of program + * so that it will not possible to expose them + * to invalid nop, stack unwind, nop state + */ + if (!ret) + synchronize_rcu(); + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, NULL); + BUG_ON(ret < 0); + } +} diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index c4365a05ab83..7c6a1089ce1c 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -11,6 +11,7 @@ #include <linux/elfnote.h> #include <linux/init.h> #include <linux/linkage.h> +#include <asm/desc_defs.h> #include <asm/segment.h> #include <asm/asm.h> #include <asm/boot.h> @@ -148,11 +149,11 @@ SYM_DATA_END(gdt) SYM_DATA_START_LOCAL(gdt_start) .quad 0x0000000000000000 /* NULL descriptor */ #ifdef CONFIG_X86_64 - .quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* PVH_CS_SEL */ + .quad GDT_ENTRY(DESC_CODE64, 0, 0xfffff) /* PVH_CS_SEL */ #else - .quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* PVH_CS_SEL */ + .quad GDT_ENTRY(DESC_CODE32, 0, 0xfffff) /* PVH_CS_SEL */ #endif - .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* PVH_DS_SEL */ + .quad GDT_ENTRY(DESC_DATA32, 0, 0xfffff) /* PVH_DS_SEL */ SYM_DATA_END_LABEL(gdt_start, SYM_L_LOCAL, gdt_end) .balign 16 diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 4221259a5870..a379501b7a69 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -35,7 +35,7 @@ static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info) mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; entry->vector = cfg->vector; - entry->delivery_mode = apic->delivery_mode; + entry->delivery_mode = APIC_DELIVERY_MODE_FIXED; entry->dest_mode = apic->dest_mode_logical; entry->polarity = 0; entry->trigger = 0; diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S index f10515b10e0a..e714b4624e36 100644 --- a/arch/x86/realmode/rm/reboot.S +++ b/arch/x86/realmode/rm/reboot.S @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/linkage.h> +#include <asm/desc_defs.h> #include <asm/segment.h> #include <asm/page_types.h> #include <asm/processor-flags.h> @@ -153,5 +154,5 @@ SYM_DATA_START(machine_real_restart_gdt) * base value 0x100; since this is consistent with real mode * semantics we don't have to reload the segments once CR0.PE = 0. */ - .quad GDT_ENTRY(0x0093, 0x100, 0xffff) + .quad GDT_ENTRY(DESC_DATA16, 0x100, 0xffff) SYM_DATA_END(machine_real_restart_gdt) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index d30949e25ebd..a3bae2b24626 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -66,7 +66,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { [S_REL] = "^(__init_(begin|end)|" "__x86_cpu_dev_(start|end)|" - "(__parainstructions|__alt_instructions)(_end)?|" + "__alt_instructions(_end)?|" "(__iommu_table|__apicdrivers|__smp_locks)(_end)?|" "__(start|end)_pci_.*|" #if CONFIG_FW_LOADER diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 9b1ec5d8c99c..a65fc2ae15b4 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -9,6 +9,7 @@ config XEN select PARAVIRT_CLOCK select X86_HV_CALLBACK_VECTOR depends on X86_64 || (X86_32 && X86_PAE) + depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MCORE2 || MATOM || MK8) depends on X86_LOCAL_APIC && X86_TSC help This is the Linux Xen port. Enabling this will allow the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0337392a3121..3c61bb98c10e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -33,9 +33,12 @@ EXPORT_SYMBOL_GPL(hypercall_page); * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info * but during boot it is switched to point to xen_vcpu_info. * The pointer is used in xen_evtchn_do_upcall to acknowledge pending events. + * Make sure that xen_vcpu_info doesn't cross a page boundary by making it + * cache-line aligned (the struct is guaranteed to have a size of 64 bytes, + * which matches the cache line size of 64-bit x86 processors). */ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); -DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +DEFINE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info); /* Linux <-> Xen vCPU id mapping */ DEFINE_PER_CPU(uint32_t, xen_vcpu_id); @@ -160,6 +163,7 @@ void xen_vcpu_setup(int cpu) int err; struct vcpu_info *vcpup; + BUILD_BUG_ON(sizeof(*vcpup) > SMP_CACHE_BYTES); BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); /* diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index bbbfdd495ebd..aeb33e0a3f76 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -704,7 +704,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_int3, false ), TRAP_ENTRY(exc_overflow, false ), #ifdef CONFIG_IA32_EMULATION - { entry_INT80_compat, xen_entry_INT80_compat, false }, + TRAP_ENTRY(int80_emulation, false ), #endif TRAP_ENTRY(exc_page_fault, false ), TRAP_ENTRY(exc_divide_error, false ), diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 6092fea7d651..39982f955cfe 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -45,7 +45,7 @@ static const typeof(pv_ops) xen_irq_ops __initconst = { /* Initial interrupt flag handling only called while interrupts off. */ .save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0), .irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop), - .irq_enable = __PV_IS_CALLEE_SAVE(paravirt_BUG), + .irq_enable = __PV_IS_CALLEE_SAVE(BUG_func), .safe_halt = xen_safe_halt, .halt = xen_halt, diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 9e5e68008785..1a9cd18dfbd3 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -156,7 +156,7 @@ xen_pv_trap asm_xenpv_exc_machine_check #endif /* CONFIG_X86_MCE */ xen_pv_trap asm_exc_simd_coprocessor_error #ifdef CONFIG_IA32_EMULATION -xen_pv_trap entry_INT80_compat +xen_pv_trap asm_int80_emulation #endif xen_pv_trap asm_exc_xen_unknown_trap xen_pv_trap asm_exc_xen_hypervisor_callback diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 408a2aa66c69..a87ab36889e7 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -21,7 +21,7 @@ extern void *xen_initial_gdt; struct trap_info; void xen_copy_trap_info(struct trap_info *traps); -DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info); +DECLARE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info); DECLARE_PER_CPU(unsigned long, xen_cr3); DECLARE_PER_CPU(unsigned long, xen_current_cr3); |