Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 26
-rw-r--r--  arch/x86/boot/compressed/idt_64.c | 9
-rw-r--r--  arch/x86/boot/compressed/sev.c | 37
-rw-r--r--  arch/x86/configs/i386_defconfig | 2
-rw-r--r--  arch/x86/configs/x86_64_defconfig | 2
-rw-r--r--  arch/x86/entry/entry_32.S | 53
-rw-r--r--  arch/x86/entry/entry_64.S | 43
-rw-r--r--  arch/x86/entry/vdso/vma.c | 4
-rw-r--r--  arch/x86/events/intel/core.c | 7
-rw-r--r--  arch/x86/hyperv/hv_apic.c | 4
-rw-r--r--  arch/x86/hyperv/hv_init.c | 21
-rw-r--r--  arch/x86/hyperv/hv_vtl.c | 4
-rw-r--r--  arch/x86/hyperv/ivm.c | 7
-rw-r--r--  arch/x86/hyperv/mmu.c | 12
-rw-r--r--  arch/x86/hyperv/nested.c | 11
-rw-r--r--  arch/x86/include/asm/acpi.h | 2
-rw-r--r--  arch/x86/include/asm/alternative.h | 2
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 14
-rw-r--r--  arch/x86/include/asm/entry-common.h | 1
-rw-r--r--  arch/x86/include/asm/ibt.h | 2
-rw-r--r--  arch/x86/include/asm/kvm-x86-ops.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 3
-rw-r--r--  arch/x86/include/asm/linkage.h | 2
-rw-r--r--  arch/x86/include/asm/microcode.h | 1
-rw-r--r--  arch/x86/include/asm/microcode_amd.h | 2
-rw-r--r--  arch/x86/include/asm/mshyperv.h | 2
-rw-r--r--  arch/x86/include/asm/msr-index.h | 13
-rw-r--r--  arch/x86/include/asm/nospec-branch.h | 58
-rw-r--r--  arch/x86/include/asm/processor.h | 12
-rw-r--r--  arch/x86/include/asm/segment.h | 2
-rw-r--r--  arch/x86/include/asm/switch_to.h | 4
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 4
-rw-r--r--  arch/x86/kernel/alternative.c | 75
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 239
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 371
-rw-r--r--  arch/x86/kernel/cpu/common.c | 46
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 1
-rw-r--r--  arch/x86/kernel/cpu/mce/amd.c | 4
-rw-r--r--  arch/x86/kernel/ftrace.c | 1
-rw-r--r--  arch/x86/kernel/kprobes/opt.c | 40
-rw-r--r--  arch/x86/kernel/module.c | 2
-rw-r--r--  arch/x86/kernel/process.c | 22
-rw-r--r--  arch/x86/kernel/static_call.c | 13
-rw-r--r--  arch/x86/kernel/traps.c | 18
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 37
-rw-r--r--  arch/x86/kvm/cpuid.c | 3
-rw-r--r--  arch/x86/kvm/lapic.c | 25
-rw-r--r--  arch/x86/kvm/svm/sev.c | 124
-rw-r--r--  arch/x86/kvm/svm/svm.c | 22
-rw-r--r--  arch/x86/kvm/svm/svm.h | 26
-rw-r--r--  arch/x86/kvm/svm/vmenter.S | 4
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S | 8
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 62
-rw-r--r--  arch/x86/kvm/vmx/vmx_ops.h | 12
-rw-r--r--  arch/x86/kvm/x86.c | 55
-rw-r--r--  arch/x86/lib/retpoline.S | 160
-rw-r--r--  arch/x86/xen/xen-head.S | 37
57 files changed, 1356 insertions, 418 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7422db409770..e36261b4ea14 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2593,6 +2593,13 @@ config CPU_IBRS_ENTRY
This mitigates both spectre_v2 and retbleed at great cost to
performance.
+config CPU_SRSO
+ bool "Mitigate speculative RAS overflow on AMD"
+ depends on CPU_SUP_AMD && X86_64 && RETHUNK
+ default y
+ help
+ Enable the SRSO mitigation needed on AMD Zen1-4 machines.
+
config SLS
bool "Mitigate Straight-Line-Speculation"
depends on CC_HAS_SLS && X86_64
@@ -2603,6 +2610,25 @@ config SLS
against straight line speculation. The kernel image might be slightly
larger.
+config GDS_FORCE_MITIGATION
+ bool "Force GDS Mitigation"
+ depends on CPU_SUP_INTEL
+ default n
+ help
+ Gather Data Sampling (GDS) is a hardware vulnerability which allows
+ unprivileged speculative access to data which was previously stored in
+ vector registers.
+
+ This option is equivalent to setting gather_data_sampling=force on the
+ command line. The microcode mitigation is used if present, otherwise
+ AVX is disabled as a mitigation. On affected systems that are missing
+ the microcode any userspace code that unconditionally uses AVX will
+ break with this option set.
+
+ Setting this option on systems not vulnerable to GDS has no effect.
+
+ If in doubt, say N.
+
endif
config ARCH_HAS_ADD_PAGES
diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
index 6debb816e83d..3cdf94b41456 100644
--- a/arch/x86/boot/compressed/idt_64.c
+++ b/arch/x86/boot/compressed/idt_64.c
@@ -63,7 +63,14 @@ void load_stage2_idt(void)
set_idt_entry(X86_TRAP_PF, boot_page_fault);
#ifdef CONFIG_AMD_MEM_ENCRYPT
- set_idt_entry(X86_TRAP_VC, boot_stage2_vc);
+ /*
+ * Clear the second stage #VC handler in case guest types
+ * needing #VC have not been detected.
+ */
+ if (sev_status & BIT(1))
+ set_idt_entry(X86_TRAP_VC, boot_stage2_vc);
+ else
+ set_idt_entry(X86_TRAP_VC, NULL);
#endif
load_boot_idt(&boot_idt_desc);
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 09dc8c187b3c..c3e343bd4760 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -405,12 +405,45 @@ void sev_enable(struct boot_params *bp)
bp->cc_blob_address = 0;
/*
+ * Do an initial SEV capability check before snp_init() which
+ * loads the CPUID page and the same checks afterwards are done
+ * without the hypervisor and are trustworthy.
+ *
+ * If the HV fakes SEV support, the guest will crash'n'burn
+ * which is good enough.
+ */
+
+ /* Check for the SME/SEV support leaf */
+ eax = 0x80000000;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax < 0x8000001f)
+ return;
+
+ /*
+ * Check for the SME/SEV feature:
+ * CPUID Fn8000_001F[EAX]
+ * - Bit 0 - Secure Memory Encryption support
+ * - Bit 1 - Secure Encrypted Virtualization support
+ * CPUID Fn8000_001F[EBX]
+ * - Bits 5:0 - Pagetable bit position used to indicate encryption
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ /* Check whether SEV is supported */
+ if (!(eax & BIT(1)))
+ return;
+
+ /*
* Setup/preliminary detection of SNP. This will be sanity-checked
* against CPUID/MSR values later.
*/
snp = snp_init(bp);
- /* Check for the SME/SEV support leaf */
+ /* Now repeat the checks with the SNP CPUID table. */
+
+ /* Recheck the SME/SEV support leaf */
eax = 0x80000000;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
@@ -418,7 +451,7 @@ void sev_enable(struct boot_params *bp)
return;
/*
- * Check for the SME/SEV feature:
+ * Recheck for the SME/SEV feature:
* CPUID Fn8000_001F[EAX]
* - Bit 0 - Secure Memory Encryption support
* - Bit 1 - Secure Encrypted Virtualization support
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 3cf34912abfe..75a343f10e58 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -245,7 +245,7 @@ CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
# CONFIG_PRINT_QUOTA_WARNING is not set
CONFIG_QFMT_V2=y
-CONFIG_AUTOFS4_FS=y
+CONFIG_AUTOFS_FS=y
CONFIG_ISO9660_FS=y
CONFIG_JOLIET=y
CONFIG_ZISOFS=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 27759236fd60..0902518e9b93 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -242,7 +242,7 @@ CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
# CONFIG_PRINT_QUOTA_WARNING is not set
CONFIG_QFMT_V2=y
-CONFIG_AUTOFS4_FS=y
+CONFIG_AUTOFS_FS=y
CONFIG_ISO9660_FS=y
CONFIG_JOLIET=y
CONFIG_ZISOFS=y
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 91397f58ac30..6e6af42e044a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -720,26 +720,6 @@ SYM_CODE_END(__switch_to_asm)
.popsection
/*
- * The unwinder expects the last frame on the stack to always be at the same
- * offset from the end of the page, which allows it to validate the stack.
- * Calling schedule_tail() directly would break that convention because its an
- * asmlinkage function so its argument has to be pushed on the stack. This
- * wrapper creates a proper "end of stack" frame header before the call.
- */
-.pushsection .text, "ax"
-SYM_FUNC_START(schedule_tail_wrapper)
- FRAME_BEGIN
-
- pushl %eax
- call schedule_tail
- popl %eax
-
- FRAME_END
- RET
-SYM_FUNC_END(schedule_tail_wrapper)
-.popsection
-
-/*
* A newly forked process directly context switches into this address.
*
* eax: prev task we switched from
@@ -747,29 +727,22 @@ SYM_FUNC_END(schedule_tail_wrapper)
* edi: kernel thread arg
*/
.pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
- call schedule_tail_wrapper
+SYM_CODE_START(ret_from_fork_asm)
+ movl %esp, %edx /* regs */
- testl %ebx, %ebx
- jnz 1f /* kernel threads are uncommon */
+ /* return address for the stack unwinder */
+ pushl $.Lsyscall_32_done
-2:
- /* When we fork, we trace the syscall return in the child, too. */
- movl %esp, %eax
- call syscall_exit_to_user_mode
- jmp .Lsyscall_32_done
+ FRAME_BEGIN
+ /* prev already in EAX */
+ movl %ebx, %ecx /* fn */
+ pushl %edi /* fn_arg */
+ call ret_from_fork
+ addl $4, %esp
+ FRAME_END
- /* kernel thread */
-1: movl %edi, %eax
- CALL_NOSPEC ebx
- /*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
- */
- movl $0, PT_EAX(%esp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+ RET
+SYM_CODE_END(ret_from_fork_asm)
.popsection
SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f31e286c2977..43606de22511 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -284,36 +284,33 @@ SYM_FUNC_END(__switch_to_asm)
* r12: kernel thread arg
*/
.pushsection .text, "ax"
- __FUNC_ALIGN
-SYM_CODE_START_NOALIGN(ret_from_fork)
+SYM_CODE_START(ret_from_fork_asm)
+ /*
+ * This is the start of the kernel stack; even though there's a
+ * register set at the top, the regset isn't necessarily coherent
+ * (consider kthreads) and one cannot unwind further.
+ *
+ * This ensures stack unwinds of kernel threads terminate in a known
+ * good state.
+ */
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // copy_thread
CALL_DEPTH_ACCOUNT
- movq %rax, %rdi
- call schedule_tail /* rdi: 'prev' task parameter */
- testq %rbx, %rbx /* from kernel_thread? */
- jnz 1f /* kernel threads are uncommon */
+ movq %rax, %rdi /* prev */
+ movq %rsp, %rsi /* regs */
+ movq %rbx, %rdx /* fn */
+ movq %r12, %rcx /* fn_arg */
+ call ret_from_fork
-2:
- UNWIND_HINT_REGS
- movq %rsp, %rdi
- call syscall_exit_to_user_mode /* returns with IRQs disabled */
- jmp swapgs_restore_regs_and_return_to_usermode
-
-1:
- /* kernel thread */
- UNWIND_HINT_END_OF_STACK
- movq %r12, %rdi
- CALL_NOSPEC rbx
/*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
+ * Set the stack state to what is expected for the target function
+ * -- at this point the register set should be a valid user set
+ * and unwind should work normally.
*/
- movq $0, RAX(%rsp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+ UNWIND_HINT_REGS
+ jmp swapgs_restore_regs_and_return_to_usermode
+SYM_CODE_END(ret_from_fork_asm)
.popsection
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 11a5c68d1218..7645730dc228 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -299,8 +299,8 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
/* Round the lowest possible end address up to a PMD boundary. */
end = (start + len + PMD_SIZE - 1) & PMD_MASK;
- if (end >= TASK_SIZE_MAX)
- end = TASK_SIZE_MAX;
+ if (end >= DEFAULT_MAP_WINDOW)
+ end = DEFAULT_MAP_WINDOW;
end -= len;
if (end > start) {
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a149fafad813..2a284ba951b7 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3993,6 +3993,13 @@ static int intel_pmu_hw_config(struct perf_event *event)
struct perf_event *leader = event->group_leader;
struct perf_event *sibling = NULL;
+ /*
+ * When this memload event is also the first event (no group
+ * exists yet), then there is no aux event before it.
+ */
+ if (leader == event)
+ return -ENODATA;
+
if (!is_mem_loads_aux_event(leader)) {
for_each_sibling_event(sibling, leader) {
if (is_mem_loads_aux_event(sibling))
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 1fbda2f94184..b21335e6a210 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -107,7 +107,6 @@ static bool cpu_is_self(int cpu)
static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
bool exclude_self)
{
- struct hv_send_ipi_ex **arg;
struct hv_send_ipi_ex *ipi_arg;
unsigned long flags;
int nr_bank = 0;
@@ -117,9 +116,8 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
return false;
local_irq_save(flags);
- arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
+ ipi_arg = *this_cpu_ptr(hyperv_pcpu_input_arg);
- ipi_arg = *arg;
if (unlikely(!ipi_arg))
goto ipi_mask_ex_done;
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 6c04b52f139b..953e280c07c3 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -14,6 +14,7 @@
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/sev.h>
+#include <asm/ibt.h>
#include <asm/hypervisor.h>
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
@@ -472,6 +473,26 @@ void __init hyperv_init(void)
}
/*
+ * Some versions of Hyper-V that provide IBT in guest VMs have a bug
+ * in that there's no ENDBR64 instruction at the entry to the
+ * hypercall page. Because hypercalls are invoked via an indirect call
+ * to the hypercall page, all hypercall attempts fail when IBT is
+ * enabled, and Linux panics. For such buggy versions, disable IBT.
+ *
+ * Fixed versions of Hyper-V always provide ENDBR64 on the hypercall
+ * page, so if future Linux kernel versions enable IBT for 32-bit
+ * builds, additional hypercall page hackery will be required here
+ * to provide an ENDBR32.
+ */
+#ifdef CONFIG_X86_KERNEL_IBT
+ if (cpu_feature_enabled(X86_FEATURE_IBT) &&
+ *(u32 *)hv_hypercall_pg != gen_endbr()) {
+ setup_clear_cpu_cap(X86_FEATURE_IBT);
+ pr_warn("Hyper-V: Disabling IBT because of Hyper-V bug\n");
+ }
+#endif
+
+ /*
* hyperv_init() is called before LAPIC is initialized: see
* apic_intr_mode_init() -> x86_platform.apic_post_init() and
* apic_bsp_setup() -> setup_local_APIC(). The direct-mode STIMER
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 85d38b9f3586..db5d2ea39fc0 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -25,6 +25,10 @@ void __init hv_vtl_init_platform(void)
x86_init.irqs.pre_vector_init = x86_init_noop;
x86_init.timers.timer_init = x86_init_noop;
+ /* Avoid searching for BIOS MP tables */
+ x86_init.mpparse.find_smp_config = x86_init_noop;
+ x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+
x86_platform.get_wallclock = get_rtc_noop;
x86_platform.set_wallclock = set_rtc_noop;
x86_platform.get_nmi_reason = hv_get_nmi_reason;
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 14f46ad2ca64..28be6df88063 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -247,7 +247,7 @@ EXPORT_SYMBOL_GPL(hv_ghcb_msr_read);
static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
enum hv_mem_host_visibility visibility)
{
- struct hv_gpa_range_for_visibility **input_pcpu, *input;
+ struct hv_gpa_range_for_visibility *input;
u16 pages_processed;
u64 hv_status;
unsigned long flags;
@@ -263,9 +263,8 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
}
local_irq_save(flags);
- input_pcpu = (struct hv_gpa_range_for_visibility **)
- this_cpu_ptr(hyperv_pcpu_input_arg);
- input = *input_pcpu;
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
if (unlikely(!input)) {
local_irq_restore(flags);
return -EINVAL;
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 8460bd35e10c..1cc113200ff5 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -61,7 +61,6 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
int cpu, vcpu, gva_n, max_gvas;
- struct hv_tlb_flush **flush_pcpu;
struct hv_tlb_flush *flush;
u64 status;
unsigned long flags;
@@ -74,10 +73,7 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
local_irq_save(flags);
- flush_pcpu = (struct hv_tlb_flush **)
- this_cpu_ptr(hyperv_pcpu_input_arg);
-
- flush = *flush_pcpu;
+ flush = *this_cpu_ptr(hyperv_pcpu_input_arg);
if (unlikely(!flush)) {
local_irq_restore(flags);
@@ -178,17 +174,13 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
int nr_bank = 0, max_gvas, gva_n;
- struct hv_tlb_flush_ex **flush_pcpu;
struct hv_tlb_flush_ex *flush;
u64 status;
if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
return HV_STATUS_INVALID_PARAMETER;
- flush_pcpu = (struct hv_tlb_flush_ex **)
- this_cpu_ptr(hyperv_pcpu_input_arg);
-
- flush = *flush_pcpu;
+ flush = *this_cpu_ptr(hyperv_pcpu_input_arg);
if (info->mm) {
/*
diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c
index 5d70968c8538..9dc259fa322e 100644
--- a/arch/x86/hyperv/nested.c
+++ b/arch/x86/hyperv/nested.c
@@ -19,7 +19,6 @@
int hyperv_flush_guest_mapping(u64 as)
{
- struct hv_guest_mapping_flush **flush_pcpu;
struct hv_guest_mapping_flush *flush;
u64 status;
unsigned long flags;
@@ -30,10 +29,7 @@ int hyperv_flush_guest_mapping(u64 as)
local_irq_save(flags);
- flush_pcpu = (struct hv_guest_mapping_flush **)
- this_cpu_ptr(hyperv_pcpu_input_arg);
-
- flush = *flush_pcpu;
+ flush = *this_cpu_ptr(hyperv_pcpu_input_arg);
if (unlikely(!flush)) {
local_irq_restore(flags);
@@ -90,7 +86,6 @@ EXPORT_SYMBOL_GPL(hyperv_fill_flush_guest_mapping_list);
int hyperv_flush_guest_mapping_range(u64 as,
hyperv_fill_flush_list_func fill_flush_list_func, void *data)
{
- struct hv_guest_mapping_flush_list **flush_pcpu;
struct hv_guest_mapping_flush_list *flush;
u64 status;
unsigned long flags;
@@ -102,10 +97,8 @@ int hyperv_flush_guest_mapping_range(u64 as,
local_irq_save(flags);
- flush_pcpu = (struct hv_guest_mapping_flush_list **)
- this_cpu_ptr(hyperv_pcpu_input_arg);
+ flush = *this_cpu_ptr(hyperv_pcpu_input_arg);
- flush = *flush_pcpu;
if (unlikely(!flush)) {
local_irq_restore(flags);
goto fault;
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 8eb74cf386db..2888c0ee4df0 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -15,6 +15,7 @@
#include <asm/mpspec.h>
#include <asm/x86_init.h>
#include <asm/cpufeature.h>
+#include <asm/irq_vectors.h>
#ifdef CONFIG_ACPI_APEI
# include <asm/pgtable_types.h>
@@ -31,6 +32,7 @@ extern int acpi_skip_timer_override;
extern int acpi_use_timer_override;
extern int acpi_fix_pin2_polarity;
extern int acpi_disable_cmcff;
+extern bool acpi_int_src_ovr[NR_IRQS_LEGACY];
extern u8 acpi_sci_flags;
extern u32 acpi_sci_override_gsi;
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 6c15a622ad60..9c4da699e11a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -96,7 +96,7 @@ extern void alternative_instructions(void);
extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
extern void apply_retpolines(s32 *start, s32 *end);
extern void apply_returns(s32 *start, s32 *end);
-extern void apply_ibt_endbr(s32 *start, s32 *end);
+extern void apply_seal_endbr(s32 *start, s32 *end);
extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine,
s32 *start_cfi, s32 *end_cfi);
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index cb8ca46213be..b69b0d7756aa 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -14,7 +14,7 @@
* Defines x86 CPU feature bits
*/
#define NCAPINTS 21 /* N 32-bit words worth of info */
-#define NBUGINTS 1 /* N 32-bit bug flags */
+#define NBUGINTS 2 /* N 32-bit bug flags */
/*
* Note: If the comment begins with a quoted string, that string is used
@@ -309,6 +309,10 @@
#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
+#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */
+#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */
+#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */
+
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
@@ -442,6 +446,10 @@
#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */
#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */
+#define X86_FEATURE_SBPB (20*32+27) /* "" Selective Branch Prediction Barrier */
+#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */
+#define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */
+
/*
* BUG word(s)
*/
@@ -483,5 +491,9 @@
#define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */
#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
#define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */
+#define X86_BUG_GDS X86_BUG(30) /* CPU is affected by Gather Data Sampling */
+/* BUG word 2 */
+#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */
+#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 117903881fe4..ce8f50192ae3 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -92,6 +92,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
static __always_inline void arch_exit_to_user_mode(void)
{
mds_user_clear_cpu_buffers();
+ amd_clear_divider();
}
#define arch_exit_to_user_mode arch_exit_to_user_mode
diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h
index baae6b4fea23..1e59581d500c 100644
--- a/arch/x86/include/asm/ibt.h
+++ b/arch/x86/include/asm/ibt.h
@@ -34,7 +34,7 @@
/*
* Create a dummy function pointer reference to prevent objtool from marking
* the function as needing to be "sealed" (i.e. ENDBR converted to NOP by
- * apply_ibt_endbr()).
+ * apply_seal_endbr()).
*/
#define IBT_NOSEAL(fname) \
".pushsection .discard.ibt_endbr_noseal\n\t" \
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 13bc212cd4bc..e3054e3e46d5 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -37,6 +37,7 @@ KVM_X86_OP(get_segment)
KVM_X86_OP(get_cpl)
KVM_X86_OP(set_segment)
KVM_X86_OP(get_cs_db_l_bits)
+KVM_X86_OP(is_valid_cr0)
KVM_X86_OP(set_cr0)
KVM_X86_OP_OPTIONAL(post_set_cr3)
KVM_X86_OP(is_valid_cr4)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 28bd38303d70..3bc146dfd38d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1566,9 +1566,10 @@ struct kvm_x86_ops {
void (*set_segment)(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+ bool (*is_valid_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
- bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 0953aa32a324..97a3de7892d3 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -21,7 +21,7 @@
#define FUNCTION_PADDING
#endif
-#if (CONFIG_FUNCTION_ALIGNMENT > 8) && !defined(__DISABLE_EXPORTS) && !defined(BULID_VDSO)
+#if (CONFIG_FUNCTION_ALIGNMENT > 8) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
# define __FUNC_ALIGN __ALIGN; FUNCTION_PADDING
#else
# define __FUNC_ALIGN __ALIGN
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 320566a0443d..66dbba181bd9 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -5,6 +5,7 @@
#include <asm/cpu.h>
#include <linux/earlycpio.h>
#include <linux/initrd.h>
+#include <asm/microcode_amd.h>
struct ucode_patch {
struct list_head plist;
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index e6662adf3af4..9675c621c1ca 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -48,11 +48,13 @@ extern void __init load_ucode_amd_bsp(unsigned int family);
extern void load_ucode_amd_ap(unsigned int family);
extern int __init save_microcode_in_initrd_amd(unsigned int family);
void reload_ucode_amd(unsigned int cpu);
+extern void amd_check_microcode(void);
#else
static inline void __init load_ucode_amd_bsp(unsigned int family) {}
static inline void load_ucode_amd_ap(unsigned int family) {}
static inline int __init
save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; }
static inline void reload_ucode_amd(unsigned int cpu) {}
+static inline void amd_check_microcode(void) {}
#endif
#endif /* _ASM_X86_MICROCODE_AMD_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 88d9ef98e087..fa83d88e4c99 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -5,7 +5,7 @@
#include <linux/types.h>
#include <linux/nmi.h>
#include <linux/msi.h>
-#include <asm/io.h>
+#include <linux/io.h>
#include <asm/hyperv-tlfs.h>
#include <asm/nospec-branch.h>
#include <asm/paravirt.h>
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 3aedae61af4f..1d111350197f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -57,6 +57,7 @@
#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */
+#define PRED_CMD_SBPB BIT(7) /* Selective Branch Prediction Barrier */
#define MSR_PPIN_CTL 0x0000004e
#define MSR_PPIN 0x0000004f
@@ -155,6 +156,15 @@
* Not susceptible to Post-Barrier
* Return Stack Buffer Predictions.
*/
+#define ARCH_CAP_GDS_CTRL BIT(25) /*
+ * CPU is vulnerable to Gather
+ * Data Sampling (GDS) and
+ * has controls for mitigation.
+ */
+#define ARCH_CAP_GDS_NO BIT(26) /*
+ * CPU is not vulnerable to Gather
+ * Data Sampling (GDS).
+ */
#define ARCH_CAP_XAPIC_DISABLE BIT(21) /*
* IA32_XAPIC_DISABLE_STATUS MSR
@@ -178,6 +188,8 @@
#define RNGDS_MITG_DIS BIT(0) /* SRBDS support */
#define RTM_ALLOW BIT(1) /* TSX development mode */
#define FB_CLEAR_DIS BIT(3) /* CPU Fill buffer clear disable */
+#define GDS_MITG_DIS BIT(4) /* Disable GDS mitigation */
+#define GDS_MITG_LOCKED BIT(5) /* GDS mitigation locked */
#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
@@ -545,6 +557,7 @@
#define MSR_AMD64_DE_CFG 0xc0011029
#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1
#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT)
+#define MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT 9
#define MSR_AMD64_BU_CFG2 0xc001102a
#define MSR_AMD64_IBSFETCHCTL 0xc0011030
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 55388c9f7601..c55cc243592e 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -211,7 +211,8 @@
* eventually turn into it's own annotation.
*/
.macro VALIDATE_UNRET_END
-#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY)
+#if defined(CONFIG_NOINSTR_VALIDATION) && \
+ (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO))
ANNOTATE_RETPOLINE_SAFE
nop
#endif
@@ -234,6 +235,10 @@
* JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
* indirect jmp/call which may be susceptible to the Spectre variant 2
* attack.
+ *
+ * NOTE: these do not take kCFI into account and are thus not comparable to C
+ * indirect calls, take care when using. The target of these should be an ENDBR
+ * instruction irrespective of kCFI.
*/
.macro JMP_NOSPEC reg:req
#ifdef CONFIG_RETPOLINE
@@ -267,9 +272,9 @@
.endm
#ifdef CONFIG_CPU_UNRET_ENTRY
-#define CALL_ZEN_UNTRAIN_RET "call zen_untrain_ret"
+#define CALL_UNTRAIN_RET "call entry_untrain_ret"
#else
-#define CALL_ZEN_UNTRAIN_RET ""
+#define CALL_UNTRAIN_RET ""
#endif
/*
@@ -277,7 +282,7 @@
* return thunk isn't mapped into the userspace tables (then again, AMD
* typically has NO_MELTDOWN).
*
- * While zen_untrain_ret() doesn't clobber anything but requires stack,
+ * While retbleed_untrain_ret() doesn't clobber anything but requires stack,
* entry_ibpb() will clobber AX, CX, DX.
*
* As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point
@@ -285,21 +290,32 @@
*/
.macro UNTRAIN_RET
#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
- defined(CONFIG_CALL_DEPTH_TRACKING)
+ defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO)
VALIDATE_UNRET_END
ALTERNATIVE_3 "", \
- CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \
+ CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \
"call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \
__stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
#endif
.endm
+.macro UNTRAIN_RET_VM
+#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
+ defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO)
+ VALIDATE_UNRET_END
+ ALTERNATIVE_3 "", \
+ CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \
+ "call entry_ibpb", X86_FEATURE_IBPB_ON_VMEXIT, \
+ __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
+#endif
+.endm
+
.macro UNTRAIN_RET_FROM_CALL
#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
defined(CONFIG_CALL_DEPTH_TRACKING)
VALIDATE_UNRET_END
ALTERNATIVE_3 "", \
- CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \
+ CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \
"call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \
__stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH
#endif
@@ -326,15 +342,24 @@ extern retpoline_thunk_t __x86_indirect_thunk_array[];
extern retpoline_thunk_t __x86_indirect_call_thunk_array[];
extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];
+#ifdef CONFIG_RETHUNK
extern void __x86_return_thunk(void);
-extern void zen_untrain_ret(void);
+#else
+static inline void __x86_return_thunk(void) {}
+#endif
+
+extern void retbleed_return_thunk(void);
+extern void srso_return_thunk(void);
+extern void srso_alias_return_thunk(void);
+
+extern void retbleed_untrain_ret(void);
+extern void srso_untrain_ret(void);
+extern void srso_alias_untrain_ret(void);
+
+extern void entry_untrain_ret(void);
extern void entry_ibpb(void);
-#ifdef CONFIG_CALL_THUNKS
extern void (*x86_return_thunk)(void);
-#else
-#define x86_return_thunk (&__x86_return_thunk)
-#endif
#ifdef CONFIG_CALL_DEPTH_TRACKING
extern void __x86_return_skl(void);
@@ -461,9 +486,6 @@ enum ssb_mitigation {
SPEC_STORE_BYPASS_SECCOMP,
};
-extern char __indirect_thunk_start[];
-extern char __indirect_thunk_end[];
-
static __always_inline
void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
{
@@ -475,11 +497,11 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
: "memory");
}
+extern u64 x86_pred_cmd;
+
static inline void indirect_branch_prediction_barrier(void)
{
- u64 val = PRED_CMD_IBPB;
-
- alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB);
+ alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_USE_IBPB);
}
/* The Intel SPEC CTRL MSR base value cache */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index d46300e94f85..fd750247ca89 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -586,7 +586,6 @@ extern char ignore_fpu_irq;
#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
#define ARCH_HAS_PREFETCHW
-#define ARCH_HAS_SPINLOCK_PREFETCH
#ifdef CONFIG_X86_32
# define BASE_PREFETCH ""
@@ -620,11 +619,6 @@ static __always_inline void prefetchw(const void *x)
"m" (*(const char *)x));
}
-static inline void spin_lock_prefetch(const void *x)
-{
- prefetchw(x);
-}
-
#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
TOP_OF_KERNEL_STACK_PADDING)
@@ -682,9 +676,13 @@ extern u16 get_llc_id(unsigned int cpu);
#ifdef CONFIG_CPU_SUP_AMD
extern u32 amd_get_nodes_per_socket(void);
extern u32 amd_get_highest_perf(void);
+extern bool cpu_has_ibpb_brtype_microcode(void);
+extern void amd_clear_divider(void);
#else
static inline u32 amd_get_nodes_per_socket(void) { return 0; }
static inline u32 amd_get_highest_perf(void) { return 0; }
+static inline bool cpu_has_ibpb_brtype_microcode(void) { return false; }
+static inline void amd_clear_divider(void) { }
#endif
extern unsigned long arch_align_stack(unsigned long sp);
@@ -727,4 +725,6 @@ bool arch_is_platform_page(u64 paddr);
#define arch_is_platform_page arch_is_platform_page
#endif
+extern bool gds_ucode_mitigated(void);
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 794f69625780..9d6411c65920 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -56,7 +56,7 @@
#define GDT_ENTRY_INVALID_SEG 0
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_32) && !defined(BUILD_VDSO32_64)
/*
* The layout of the per-CPU GDT under Linux:
*
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 5c91305d09d2..f42dbf17f52b 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -12,7 +12,9 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
__visible struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);
-asmlinkage void ret_from_fork(void);
+asmlinkage void ret_from_fork_asm(void);
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg);
/*
* This is the structure pointed to by thread.sp for an inactive task. The
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 21b542a6866c..53369c57751e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -52,6 +52,7 @@ int acpi_lapic;
int acpi_ioapic;
int acpi_strict;
int acpi_disable_cmcff;
+bool acpi_int_src_ovr[NR_IRQS_LEGACY];
/* ACPI SCI override configuration */
u8 acpi_sci_flags __initdata;
@@ -588,6 +589,9 @@ acpi_parse_int_src_ovr(union acpi_subtable_headers * header,
acpi_table_print_madt_entry(&header->common);
+ if (intsrc->source_irq < NR_IRQS_LEGACY)
+ acpi_int_src_ovr[intsrc->source_irq] = true;
+
if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
acpi_sci_ioapic_setup(intsrc->source_irq,
intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 72646d75b6ff..099d58d02a26 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -687,10 +687,6 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
#ifdef CONFIG_RETHUNK
-#ifdef CONFIG_CALL_THUNKS
-void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk;
-#endif
-
/*
* Rewrite the compiler generated return thunk tail-calls.
*
@@ -778,6 +774,8 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
#ifdef CONFIG_X86_KERNEL_IBT
+static void poison_cfi(void *addr);
+
static void __init_or_module poison_endbr(void *addr, bool warn)
{
u32 endbr, poison = gen_endbr_poison();
@@ -802,8 +800,11 @@ static void __init_or_module poison_endbr(void *addr, bool warn)
/*
* Generated by: objtool --ibt
+ *
+ * Seal the functions for indirect calls by clobbering the ENDBR instructions
+ * and the kCFI hash value.
*/
-void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
+void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
{
s32 *s;
@@ -812,13 +813,13 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
poison_endbr(addr, true);
if (IS_ENABLED(CONFIG_FINEIBT))
- poison_endbr(addr - 16, false);
+ poison_cfi(addr - 16);
}
}
#else
-void __init_or_module apply_ibt_endbr(s32 *start, s32 *end) { }
+void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
#endif /* CONFIG_X86_KERNEL_IBT */
@@ -1063,6 +1064,17 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end)
return 0;
}
+static void cfi_rewrite_endbr(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+
+ poison_endbr(addr+16, false);
+ }
+}
+
/* .retpoline_sites */
static int cfi_rand_callers(s32 *start, s32 *end)
{
@@ -1157,14 +1169,19 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
return;
case CFI_FINEIBT:
+ /* place the FineIBT preamble at func()-16 */
ret = cfi_rewrite_preamble(start_cfi, end_cfi);
if (ret)
goto err;
+ /* rewrite the callers to target func()-16 */
ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
if (ret)
goto err;
+ /* now that nobody targets func()+0, remove ENDBR there */
+ cfi_rewrite_endbr(start_cfi, end_cfi);
+
if (builtin)
pr_info("Using FineIBT CFI\n");
return;
@@ -1177,6 +1194,41 @@ err:
pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
}
+static inline void poison_hash(void *addr)
+{
+ *(u32 *)addr = 0;
+}
+
+static void poison_cfi(void *addr)
+{
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ /*
+ * __cfi_\func:
+ * osp nopl (%rax)
+ * subl $0, %r10d
+ * jz 1f
+ * ud2
+ * 1: nop
+ */
+ poison_endbr(addr, false);
+ poison_hash(addr + fineibt_preamble_hash);
+ break;
+
+ case CFI_KCFI:
+ /*
+ * __cfi_\func:
+ * movl $0, %eax
+ * .skip 11, 0x90
+ */
+ poison_hash(addr + 1);
+ break;
+
+ default:
+ break;
+ }
+}
+
#else
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
@@ -1184,6 +1236,10 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
{
}
+#ifdef CONFIG_X86_KERNEL_IBT
+static void poison_cfi(void *addr) { }
+#endif
+
#endif
void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
@@ -1565,7 +1621,10 @@ void __init alternative_instructions(void)
*/
callthunks_patch_builtin_calls();
- apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
+ /*
+ * Seal all functions that do not have their address taken.
+ */
+ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
#ifdef CONFIG_SMP
/* Patch to UP if other cpus not imminent. */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 571abf808ea3..7eca6a8abbb1 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -27,11 +27,6 @@
#include "cpu.h"
-static const int amd_erratum_383[];
-static const int amd_erratum_400[];
-static const int amd_erratum_1054[];
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
-
/*
* nodes_per_socket: Stores the number of nodes per socket.
* Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX
@@ -39,6 +34,83 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
*/
static u32 nodes_per_socket = 1;
+/*
+ * AMD errata checking
+ *
+ * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
+ * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
+ * have an OSVW id assigned, which it takes as first argument. Both take a
+ * variable number of family-specific model-stepping ranges created by
+ * AMD_MODEL_RANGE().
+ *
+ * Example:
+ *
+ * const int amd_erratum_319[] =
+ * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
+ * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
+ * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
+ */
+
+#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
+#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
+#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
+ ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
+#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
+#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
+#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
+
+static const int amd_erratum_400[] =
+ AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
+ AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+
+static const int amd_erratum_383[] =
+ AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+
+/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
+static const int amd_erratum_1054[] =
+ AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
+
+static const int amd_zenbleed[] =
+ AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf),
+ AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf),
+ AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf),
+ AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf));
+
+static const int amd_div0[] =
+ AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf),
+ AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf));
+
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
+{
+ int osvw_id = *erratum++;
+ u32 range;
+ u32 ms;
+
+ if (osvw_id >= 0 && osvw_id < 65536 &&
+ cpu_has(cpu, X86_FEATURE_OSVW)) {
+ u64 osvw_len;
+
+ rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
+ if (osvw_id < osvw_len) {
+ u64 osvw_bits;
+
+ rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
+ osvw_bits);
+ return osvw_bits & (1ULL << (osvw_id & 0x3f));
+ }
+ }
+
+ /* OSVW unavailable or ID unknown, match family-model-stepping range */
+ ms = (cpu->x86_model << 4) | cpu->x86_stepping;
+ while ((range = *erratum++))
+ if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
+ (ms >= AMD_MODEL_RANGE_START(range)) &&
+ (ms <= AMD_MODEL_RANGE_END(range)))
+ return true;
+
+ return false;
+}
+
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
@@ -916,6 +988,47 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
}
}
+static bool cpu_has_zenbleed_microcode(void)
+{
+ u32 good_rev = 0;
+
+ switch (boot_cpu_data.x86_model) {
+ case 0x30 ... 0x3f: good_rev = 0x0830107a; break;
+ case 0x60 ... 0x67: good_rev = 0x0860010b; break;
+ case 0x68 ... 0x6f: good_rev = 0x08608105; break;
+ case 0x70 ... 0x7f: good_rev = 0x08701032; break;
+ case 0xa0 ... 0xaf: good_rev = 0x08a00008; break;
+
+ default:
+ return false;
+ break;
+ }
+
+ if (boot_cpu_data.microcode < good_rev)
+ return false;
+
+ return true;
+}
+
+static void zenbleed_check(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has_amd_erratum(c, amd_zenbleed))
+ return;
+
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (!cpu_has(c, X86_FEATURE_AVX))
+ return;
+
+ if (!cpu_has_zenbleed_microcode()) {
+ pr_notice_once("Zenbleed: please update your microcode for the most optimal fix\n");
+ msr_set_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT);
+ } else {
+ msr_clear_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT);
+ }
+}
+
static void init_amd(struct cpuinfo_x86 *c)
{
early_init_amd(c);
@@ -1020,6 +1133,13 @@ static void init_amd(struct cpuinfo_x86 *c)
if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
cpu_has(c, X86_FEATURE_AUTOIBRS))
WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS));
+
+ zenbleed_check(c);
+
+ if (cpu_has_amd_erratum(c, amd_div0)) {
+ pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
+ setup_force_cpu_bug(X86_BUG_DIV0);
+ }
}
#ifdef CONFIG_X86_32
@@ -1115,73 +1235,6 @@ static const struct cpu_dev amd_cpu_dev = {
cpu_dev_register(amd_cpu_dev);
-/*
- * AMD errata checking
- *
- * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
- * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
- * have an OSVW id assigned, which it takes as first argument. Both take a
- * variable number of family-specific model-stepping ranges created by
- * AMD_MODEL_RANGE().
- *
- * Example:
- *
- * const int amd_erratum_319[] =
- * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
- * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
- * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
- */
-
-#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
-#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
-#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
- ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
-#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
-#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
-#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
-
-static const int amd_erratum_400[] =
- AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
- AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
-
-static const int amd_erratum_383[] =
- AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
-
-/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
-static const int amd_erratum_1054[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
-
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
-{
- int osvw_id = *erratum++;
- u32 range;
- u32 ms;
-
- if (osvw_id >= 0 && osvw_id < 65536 &&
- cpu_has(cpu, X86_FEATURE_OSVW)) {
- u64 osvw_len;
-
- rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
- if (osvw_id < osvw_len) {
- u64 osvw_bits;
-
- rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
- osvw_bits);
- return osvw_bits & (1ULL << (osvw_id & 0x3f));
- }
- }
-
- /* OSVW unavailable or ID unknown, match family-model-stepping range */
- ms = (cpu->x86_model << 4) | cpu->x86_stepping;
- while ((range = *erratum++))
- if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
- (ms >= AMD_MODEL_RANGE_START(range)) &&
- (ms <= AMD_MODEL_RANGE_END(range)))
- return true;
-
- return false;
-}
-
static DEFINE_PER_CPU_READ_MOSTLY(unsigned long[4], amd_dr_addr_mask);
static unsigned int amd_msr_dr_addr_masks[] = {
@@ -1235,3 +1288,45 @@ u32 amd_get_highest_perf(void)
return 255;
}
EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+
+static void zenbleed_check_cpu(void *unused)
+{
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+
+ zenbleed_check(c);
+}
+
+void amd_check_microcode(void)
+{
+ on_each_cpu(zenbleed_check_cpu, NULL, 1);
+}
+
+bool cpu_has_ibpb_brtype_microcode(void)
+{
+ switch (boot_cpu_data.x86) {
+ /* Zen1/2 IBPB flushes branch type predictions too. */
+ case 0x17:
+ return boot_cpu_has(X86_FEATURE_AMD_IBPB);
+ case 0x19:
+ /* Poke the MSR bit on Zen3/4 to check its presence. */
+ if (!wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
+ setup_force_cpu_cap(X86_FEATURE_SBPB);
+ return true;
+ } else {
+ return false;
+ }
+ default:
+ return false;
+ }
+}
+
+/*
+ * Issue a DIV 0/1 insn to clear any division data from previous DIV
+ * operations.
+ */
+void noinstr amd_clear_divider(void)
+{
+ asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0)
+ :: "a" (0), "d" (0), "r" (1));
+}
+EXPORT_SYMBOL_GPL(amd_clear_divider);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 9e2a91830f72..f081d26616ac 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -47,6 +47,8 @@ static void __init taa_select_mitigation(void);
static void __init mmio_select_mitigation(void);
static void __init srbds_select_mitigation(void);
static void __init l1d_flush_select_mitigation(void);
+static void __init srso_select_mitigation(void);
+static void __init gds_select_mitigation(void);
/* The base value of the SPEC_CTRL MSR without task-specific bits set */
u64 x86_spec_ctrl_base;
@@ -56,8 +58,13 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
+u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
+EXPORT_SYMBOL_GPL(x86_pred_cmd);
+
static DEFINE_MUTEX(spec_ctrl_mutex);
+void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk;
+
/* Update SPEC_CTRL MSR and its cached copy unconditionally */
static void update_spec_ctrl(u64 val)
{
@@ -160,6 +167,13 @@ void __init cpu_select_mitigations(void)
md_clear_select_mitigation();
srbds_select_mitigation();
l1d_flush_select_mitigation();
+
+ /*
+ * srso_select_mitigation() depends and must run after
+ * retbleed_select_mitigation().
+ */
+ srso_select_mitigation();
+ gds_select_mitigation();
}
/*
@@ -646,6 +660,149 @@ static int __init l1d_flush_parse_cmdline(char *str)
early_param("l1d_flush", l1d_flush_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "GDS: " fmt
+
+enum gds_mitigations {
+ GDS_MITIGATION_OFF,
+ GDS_MITIGATION_UCODE_NEEDED,
+ GDS_MITIGATION_FORCE,
+ GDS_MITIGATION_FULL,
+ GDS_MITIGATION_FULL_LOCKED,
+ GDS_MITIGATION_HYPERVISOR,
+};
+
+#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION)
+static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE;
+#else
+static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL;
+#endif
+
+static const char * const gds_strings[] = {
+ [GDS_MITIGATION_OFF] = "Vulnerable",
+ [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode",
+ [GDS_MITIGATION_FULL] = "Mitigation: Microcode",
+ [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)",
+ [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
+};
+
+bool gds_ucode_mitigated(void)
+{
+ return (gds_mitigation == GDS_MITIGATION_FULL ||
+ gds_mitigation == GDS_MITIGATION_FULL_LOCKED);
+}
+EXPORT_SYMBOL_GPL(gds_ucode_mitigated);
+
+void update_gds_msr(void)
+{
+ u64 mcu_ctrl_after;
+ u64 mcu_ctrl;
+
+ switch (gds_mitigation) {
+ case GDS_MITIGATION_OFF:
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ mcu_ctrl |= GDS_MITG_DIS;
+ break;
+ case GDS_MITIGATION_FULL_LOCKED:
+ /*
+ * The LOCKED state comes from the boot CPU. APs might not have
+ * the same state. Make sure the mitigation is enabled on all
+ * CPUs.
+ */
+ case GDS_MITIGATION_FULL:
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ mcu_ctrl &= ~GDS_MITG_DIS;
+ break;
+ case GDS_MITIGATION_FORCE:
+ case GDS_MITIGATION_UCODE_NEEDED:
+ case GDS_MITIGATION_HYPERVISOR:
+ return;
+ };
+
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+ /*
+ * Check to make sure that the WRMSR value was not ignored. Writes to
+ * GDS_MITG_DIS will be ignored if this processor is locked but the boot
+ * processor was not.
+ */
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after);
+ WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after);
+}
+
+static void __init gds_select_mitigation(void)
+{
+ u64 mcu_ctrl;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ gds_mitigation = GDS_MITIGATION_HYPERVISOR;
+ goto out;
+ }
+
+ if (cpu_mitigations_off())
+ gds_mitigation = GDS_MITIGATION_OFF;
+ /* Will verify below that mitigation _can_ be disabled */
+
+ /* No microcode */
+ if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) {
+ if (gds_mitigation == GDS_MITIGATION_FORCE) {
+ /*
+ * This only needs to be done on the boot CPU so do it
+ * here rather than in update_gds_msr()
+ */
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ pr_warn("Microcode update needed! Disabling AVX as mitigation.\n");
+ } else {
+ gds_mitigation = GDS_MITIGATION_UCODE_NEEDED;
+ }
+ goto out;
+ }
+
+ /* Microcode has mitigation, use it */
+ if (gds_mitigation == GDS_MITIGATION_FORCE)
+ gds_mitigation = GDS_MITIGATION_FULL;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ if (mcu_ctrl & GDS_MITG_LOCKED) {
+ if (gds_mitigation == GDS_MITIGATION_OFF)
+ pr_warn("Mitigation locked. Disable failed.\n");
+
+ /*
+ * The mitigation is selected from the boot CPU. All other CPUs
+ * _should_ have the same state. If the boot CPU isn't locked
+ * but others are then update_gds_msr() will WARN() of the state
+ * mismatch. If the boot CPU is locked update_gds_msr() will
+ * ensure the other CPUs have the mitigation enabled.
+ */
+ gds_mitigation = GDS_MITIGATION_FULL_LOCKED;
+ }
+
+ update_gds_msr();
+out:
+ pr_info("%s\n", gds_strings[gds_mitigation]);
+}
+
+static int __init gds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ gds_mitigation = GDS_MITIGATION_OFF;
+ else if (!strcmp(str, "force"))
+ gds_mitigation = GDS_MITIGATION_FORCE;
+
+ return 0;
+}
+early_param("gather_data_sampling", gds_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "Spectre V1 : " fmt
enum spectre_v1_mitigation {
@@ -885,6 +1042,9 @@ do_cmd_auto:
setup_force_cpu_cap(X86_FEATURE_RETHUNK);
setup_force_cpu_cap(X86_FEATURE_UNRET);
+ if (IS_ENABLED(CONFIG_RETHUNK))
+ x86_return_thunk = retbleed_return_thunk;
+
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
pr_err(RETBLEED_UNTRAIN_MSG);
@@ -894,6 +1054,7 @@ do_cmd_auto:
case RETBLEED_MITIGATION_IBPB:
setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
mitigate_smt = true;
break;
@@ -1150,19 +1311,21 @@ spectre_v2_user_select_mitigation(void)
}
/*
- * If no STIBP, enhanced IBRS is enabled, or SMT impossible, STIBP
+ * If no STIBP, Intel enhanced IBRS is enabled, or SMT impossible, STIBP
* is not required.
*
- * Enhanced IBRS also protects against cross-thread branch target
+ * Intel's Enhanced IBRS also protects against cross-thread branch target
* injection in user-mode as the IBRS bit remains always set which
* implicitly enables cross-thread protections. However, in legacy IBRS
* mode, the IBRS bit is set only on kernel entry and cleared on return
- * to userspace. This disables the implicit cross-thread protection,
- * so allow for STIBP to be selected in that case.
+ * to userspace. AMD Automatic IBRS also does not protect userspace.
+ * These modes therefore disable the implicit cross-thread protection,
+ * so allow for STIBP to be selected in those cases.
*/
if (!boot_cpu_has(X86_FEATURE_STIBP) ||
!smt_possible ||
- spectre_v2_in_eibrs_mode(spectre_v2_enabled))
+ (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS)))
return;
/*
@@ -2186,6 +2349,170 @@ static int __init l1tf_cmdline(char *str)
early_param("l1tf", l1tf_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt
+
+enum srso_mitigation {
+ SRSO_MITIGATION_NONE,
+ SRSO_MITIGATION_MICROCODE,
+ SRSO_MITIGATION_SAFE_RET,
+ SRSO_MITIGATION_IBPB,
+ SRSO_MITIGATION_IBPB_ON_VMEXIT,
+};
+
+enum srso_mitigation_cmd {
+ SRSO_CMD_OFF,
+ SRSO_CMD_MICROCODE,
+ SRSO_CMD_SAFE_RET,
+ SRSO_CMD_IBPB,
+ SRSO_CMD_IBPB_ON_VMEXIT,
+};
+
+static const char * const srso_strings[] = {
+ [SRSO_MITIGATION_NONE] = "Vulnerable",
+ [SRSO_MITIGATION_MICROCODE] = "Mitigation: microcode",
+ [SRSO_MITIGATION_SAFE_RET] = "Mitigation: safe RET",
+ [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
+ [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only"
+};
+
+static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE;
+static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET;
+
+static int __init srso_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ srso_cmd = SRSO_CMD_OFF;
+ else if (!strcmp(str, "microcode"))
+ srso_cmd = SRSO_CMD_MICROCODE;
+ else if (!strcmp(str, "safe-ret"))
+ srso_cmd = SRSO_CMD_SAFE_RET;
+ else if (!strcmp(str, "ibpb"))
+ srso_cmd = SRSO_CMD_IBPB;
+ else if (!strcmp(str, "ibpb-vmexit"))
+ srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT;
+ else
+ pr_err("Ignoring unknown SRSO option (%s).", str);
+
+ return 0;
+}
+early_param("spec_rstack_overflow", srso_parse_cmdline);
+
+#define SRSO_NOTICE "WARNING: See https://kernel.org/doc/html/latest/admin-guide/hw-vuln/srso.html for mitigation options."
+
+static void __init srso_select_mitigation(void)
+{
+ bool has_microcode;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
+ goto pred_cmd;
+
+ /*
+ * The first check is for the kernel running as a guest in order
+ * for guests to verify whether IBPB is a viable mitigation.
+ */
+ has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) || cpu_has_ibpb_brtype_microcode();
+ if (!has_microcode) {
+ pr_warn("IBPB-extending microcode not applied!\n");
+ pr_warn(SRSO_NOTICE);
+ } else {
+ /*
+ * Enable the synthetic (even if in a real CPUID leaf)
+ * flags for guests.
+ */
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+
+ /*
+ * Zen1/2 with SMT off aren't vulnerable after the right
+ * IBPB microcode has been applied.
+ */
+ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
+ setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
+ return;
+ }
+ }
+
+ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
+ if (has_microcode) {
+ pr_err("Retbleed IBPB mitigation enabled, using same for SRSO\n");
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+ goto pred_cmd;
+ }
+ }
+
+ switch (srso_cmd) {
+ case SRSO_CMD_OFF:
+ return;
+
+ case SRSO_CMD_MICROCODE:
+ if (has_microcode) {
+ srso_mitigation = SRSO_MITIGATION_MICROCODE;
+ pr_warn(SRSO_NOTICE);
+ }
+ break;
+
+ case SRSO_CMD_SAFE_RET:
+ if (IS_ENABLED(CONFIG_CPU_SRSO)) {
+ /*
+ * Enable the return thunk for generated code
+ * like ftrace, static_call, etc.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_UNRET);
+
+ if (boot_cpu_data.x86 == 0x19) {
+ setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
+ x86_return_thunk = srso_alias_return_thunk;
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_SRSO);
+ x86_return_thunk = srso_return_thunk;
+ }
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ } else {
+ pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
+ goto pred_cmd;
+ }
+ break;
+
+ case SRSO_CMD_IBPB:
+ if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) {
+ if (has_microcode) {
+ setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+ }
+ } else {
+ pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
+ goto pred_cmd;
+ }
+ break;
+
+ case SRSO_CMD_IBPB_ON_VMEXIT:
+ if (IS_ENABLED(CONFIG_CPU_SRSO)) {
+ if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) {
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+ }
+ } else {
+ pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
+ goto pred_cmd;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ pr_info("%s%s\n", srso_strings[srso_mitigation], (has_microcode ? "" : ", no microcode"));
+
+pred_cmd:
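+ /*
+ * SBPB is the lighter, selective form of the predictor barrier; it is
+ * sufficient for the existing IBPB users once SRSO protection is not
+ * needed (SRSO_NO) or the mitigation is explicitly off.
+ */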
+ if ((boot_cpu_has(X86_FEATURE_SRSO_NO) || srso_cmd == SRSO_CMD_OFF) &&
+ boot_cpu_has(X86_FEATURE_SBPB))
+ x86_pred_cmd = PRED_CMD_SBPB;
+}
+
+#undef pr_fmt
#define pr_fmt(fmt) fmt
#ifdef CONFIG_SYSFS
@@ -2294,7 +2621,8 @@ static ssize_t mmio_stale_data_show_state(char *buf)
static char *stibp_state(void)
{
- if (spectre_v2_in_eibrs_mode(spectre_v2_enabled))
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS))
return "";
switch (spectre_v2_user_stibp) {
@@ -2382,6 +2710,21 @@ static ssize_t retbleed_show_state(char *buf)
return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
}
+static ssize_t srso_show_state(char *buf)
+{
+ if (boot_cpu_has(X86_FEATURE_SRSO_NO))
+ return sysfs_emit(buf, "Mitigation: SMT disabled\n");
+
+ return sysfs_emit(buf, "%s%s\n",
+ srso_strings[srso_mitigation],
+ (cpu_has_ibpb_brtype_microcode() ? "" : ", no microcode"));
+}
+
+static ssize_t gds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]);
+}
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -2431,6 +2774,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_RETBLEED:
return retbleed_show_state(buf);
+ case X86_BUG_SRSO:
+ return srso_show_state(buf);
+
+ case X86_BUG_GDS:
+ return gds_show_state(buf);
+
default:
break;
}
@@ -2495,4 +2844,14 @@ ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, cha
{
return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED);
}
+
+ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SRSO);
+}
+
+ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_GDS);
+}
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 52683fddafaf..e3a65e9fc750 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1250,6 +1250,10 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define RETBLEED BIT(3)
/* CPU is affected by SMT (cross-thread) return predictions */
#define SMT_RSB BIT(4)
+/* CPU is affected by SRSO */
+#define SRSO BIT(5)
+/* CPU is affected by GDS */
+#define GDS BIT(6)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
@@ -1262,27 +1266,30 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
- VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS),
+ VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS),
VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
- VULNBL_AMD(0x17, RETBLEED | SMT_RSB),
+ VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
VULNBL_HYGON(0x18, RETBLEED | SMT_RSB),
+ VULNBL_AMD(0x19, SRSO),
{}
};
@@ -1406,6 +1413,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
if (cpu_matches(cpu_vuln_blacklist, SMT_RSB))
setup_force_cpu_bug(X86_BUG_SMT_RSB);
+ if (!cpu_has(c, X86_FEATURE_SRSO_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, SRSO))
+ setup_force_cpu_bug(X86_BUG_SRSO);
+ }
+
+ /*
+ * Check if CPU is vulnerable to GDS. If running in a virtual machine on
+ * an affected processor, the VMM may have disabled the use of GATHER by
+ * disabling AVX2. The only way to do this in HW is to clear XCR0[2],
+ * which means that AVX will be disabled.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) &&
+ boot_cpu_has(X86_FEATURE_AVX))
+ setup_force_cpu_bug(X86_BUG_GDS);
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
@@ -1962,6 +1984,8 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
update_srbds_msr();
+ if (boot_cpu_has_bug(X86_BUG_GDS))
+ update_gds_msr();
tsx_ap_init();
}
@@ -2287,6 +2311,8 @@ void microcode_check(struct cpuinfo_x86 *prev_info)
perf_check_microcode();
+ amd_check_microcode();
+
store_cpu_caps(&curr_info);
if (!memcmp(&prev_info->x86_capability, &curr_info.x86_capability,
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 1c44630d4789..1dcd7d4e38ef 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -83,6 +83,7 @@ void cpu_select_mitigations(void);
extern void x86_spec_ctrl_setup_ap(void);
extern void update_srbds_msr(void);
+extern void update_gds_msr(void);
extern enum spectre_v2_mitigation spectre_v2_enabled;
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 5e74610b39e7..c4ec4ca47e11 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -1261,10 +1261,10 @@ static void __threshold_remove_blocks(struct threshold_bank *b)
struct threshold_block *pos = NULL;
struct threshold_block *tmp = NULL;
- kobject_del(b->kobj);
+ kobject_put(b->kobj);
list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
- kobject_del(&pos->kobj);
+ kobject_put(&pos->kobj);
}
static void threshold_remove_bank(struct threshold_bank *bank)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 01e8f34daf22..12df54ff0e81 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -282,7 +282,6 @@ static inline void tramp_free(void *tramp) { }
/* Defined as markers to the end of the ftrace default trampolines */
extern void ftrace_regs_caller_end(void);
-extern void ftrace_regs_caller_ret(void);
extern void ftrace_caller_end(void);
extern void ftrace_caller_op_ptr(void);
extern void ftrace_regs_caller_op_ptr(void);
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 57b0037d0a99..517821b48391 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -226,7 +226,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
}
/* Check whether insn is indirect jump */
-static int __insn_is_indirect_jump(struct insn *insn)
+static int insn_is_indirect_jump(struct insn *insn)
{
return ((insn->opcode.bytes[0] == 0xff &&
(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
@@ -260,26 +260,6 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
return (start <= target && target <= start + len);
}
-static int insn_is_indirect_jump(struct insn *insn)
-{
- int ret = __insn_is_indirect_jump(insn);
-
-#ifdef CONFIG_RETPOLINE
- /*
- * Jump to x86_indirect_thunk_* is treated as an indirect jump.
- * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with
- * older gcc may use indirect jump. So we add this check instead of
- * replace indirect-jump check.
- */
- if (!ret)
- ret = insn_jump_into_range(insn,
- (unsigned long)__indirect_thunk_start,
- (unsigned long)__indirect_thunk_end -
- (unsigned long)__indirect_thunk_start);
-#endif
- return ret;
-}
-
/* Decode whole function to ensure any instructions don't jump into target */
static int can_optimize(unsigned long paddr)
{
@@ -334,9 +314,21 @@ static int can_optimize(unsigned long paddr)
/* Recover address */
insn.kaddr = (void *)addr;
insn.next_byte = (void *)(addr + insn.length);
- /* Check any instructions don't jump into target */
- if (insn_is_indirect_jump(&insn) ||
- insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE,
+ /*
+ * Check any instructions don't jump into target, indirectly or
+ * directly.
+ *
+ * The indirect case is present to handle code with jump
+ * tables. When the kernel uses retpolines, the check should in
+ * theory additionally look for jumps to indirect thunks.
+ * However, the kernel built with retpolines or IBT has jump
+ * tables disabled so the check can be skipped altogether.
+ */
+ if (!IS_ENABLED(CONFIG_RETPOLINE) &&
+ !IS_ENABLED(CONFIG_X86_KERNEL_IBT) &&
+ insn_is_indirect_jump(&insn))
+ return 0;
+ if (insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE,
DISP32_SIZE))
return 0;
addr += insn.length;
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index b05f62ee2344..5f71a0cf4399 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -358,7 +358,7 @@ int module_finalize(const Elf_Ehdr *hdr,
}
if (ibt_endbr) {
void *iseg = (void *)ibt_endbr->sh_addr;
- apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size);
+ apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size);
}
if (locks) {
void *lseg = (void *)locks->sh_addr;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ff9b80a0e3e3..72015dba72ab 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
#include <linux/static_call.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
+#include <linux/entry-common.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <linux/uaccess.h>
@@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg)
+{
+ schedule_tail(prev);
+
+ /* Is this a kernel thread? */
+ if (unlikely(fn)) {
+ fn(fn_arg);
+ /*
+ * A kernel thread is allowed to return here after successfully
+ * calling kernel_execve(). Exit to userspace to complete the
+ * execve() syscall.
+ */
+ regs->ax = 0;
+ }
+
+ syscall_exit_to_user_mode(regs);
+}
+
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
unsigned long clone_flags = args->flags;
@@ -149,7 +169,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
frame = &fork_frame->frame;
frame->bp = encode_frame_pointer(childregs);
- frame->ret_addr = (unsigned long) ret_from_fork;
+ frame->ret_addr = (unsigned long) ret_from_fork_asm;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
p->thread.iopl_warn = 0;
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index b70670a98597..77a9316da435 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -186,6 +186,19 @@ EXPORT_SYMBOL_GPL(arch_static_call_transform);
*/
bool __static_call_fixup(void *tramp, u8 op, void *dest)
{
+ unsigned long addr = (unsigned long)tramp;
+ /*
+ * Not all .return_sites are a static_call trampoline (most are not).
+ * Check if the 3 bytes after the return are still kernel text, if not,
+ * then this definitely is not a trampoline and we need not worry
+ * further.
+ *
+ * This avoids the memcmp() below tripping over pagefaults etc..
+ */
+ if (((addr >> PAGE_SHIFT) != ((addr + 7) >> PAGE_SHIFT)) &&
+ !kernel_text_address(addr + 7))
+ return false;
+
if (memcmp(tramp+5, tramp_ud, 3)) {
/* Not a trampoline site, not our problem. */
return false;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 58b1f208eff5..4a817d20ce3b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -697,9 +697,10 @@ static bool try_fixup_enqcmd_gp(void)
}
static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr,
- unsigned long error_code, const char *str)
+ unsigned long error_code, const char *str,
+ unsigned long address)
{
- if (fixup_exception(regs, trapnr, error_code, 0))
+ if (fixup_exception(regs, trapnr, error_code, address))
return true;
current->thread.error_code = error_code;
@@ -759,7 +760,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
goto exit;
}
- if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc))
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc, 0))
goto exit;
if (error_code)
@@ -1357,17 +1358,20 @@ DEFINE_IDTENTRY(exc_device_not_available)
#define VE_FAULT_STR "VE fault"
-static void ve_raise_fault(struct pt_regs *regs, long error_code)
+static void ve_raise_fault(struct pt_regs *regs, long error_code,
+ unsigned long address)
{
if (user_mode(regs)) {
gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR);
return;
}
- if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR))
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code,
+ VE_FAULT_STR, address)) {
return;
+ }
- die_addr(VE_FAULT_STR, regs, error_code, 0);
+ die_addr(VE_FAULT_STR, regs, error_code, address);
}
/*
@@ -1431,7 +1435,7 @@ DEFINE_IDTENTRY(exc_virtualization_exception)
* it successfully, treat it as #GP(0) and handle it.
*/
if (!tdx_handle_virt_exception(regs, &ve))
- ve_raise_fault(regs, 0);
+ ve_raise_fault(regs, 0, ve.gla);
cond_local_irq_disable(regs);
}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 03c885d3640f..83d41c2601d7 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -133,14 +133,26 @@ SECTIONS
KPROBES_TEXT
SOFTIRQENTRY_TEXT
#ifdef CONFIG_RETPOLINE
- __indirect_thunk_start = .;
- *(.text.__x86.*)
- __indirect_thunk_end = .;
+ *(.text..__x86.indirect_thunk)
+ *(.text..__x86.return_thunk)
#endif
STATIC_CALL_TEXT
ALIGN_ENTRY_TEXT_BEGIN
+#ifdef CONFIG_CPU_SRSO
+ *(.text..__x86.rethunk_untrain)
+#endif
+
ENTRY_TEXT
+
+#ifdef CONFIG_CPU_SRSO
+ /*
+ * See the comment above srso_alias_untrain_ret()'s
+ * definition.
+ */
+ . = srso_alias_untrain_ret | (1 << 2) | (1 << 8) | (1 << 14) | (1 << 20);
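+ /*
+ * srso_alias_untrain_ret is 2M-aligned (see retpoline.S), so its low
+ * 21 bits are clear and the OR above simply adds bits 2, 8, 14 and 20,
+ * producing the BTB-aliasing address at which the rethunk_safe section
+ * (and thus srso_alias_safe_ret) is placed below.
+ */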
+ *(.text..__x86.rethunk_safe)
+#endif
ALIGN_ENTRY_TEXT_END
*(.gnu.warning)
@@ -509,7 +521,24 @@ INIT_PER_CPU(irq_stack_backing_store);
#endif
#ifdef CONFIG_RETHUNK
-. = ASSERT((__x86_return_thunk & 0x3f) == 0, "__x86_return_thunk not cacheline-aligned");
+. = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
+. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
+#endif
+
+#ifdef CONFIG_CPU_SRSO
+/*
+ * GNU ld cannot do XOR until 2.41.
+ * https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=f6f78318fca803c4907fb8d7f6ded8295f1947b1
+ *
+ * LLVM lld cannot do XOR until lld-17.
+ * https://github.com/llvm/llvm-project/commit/fae96104d4378166cbe5c875ef8ed808a356f3fb
+ *
+ * Instead do: (A | B) - (A & B) in order to compute the XOR
+ * of the two function addresses:
+ */
+. = ASSERT(((ABSOLUTE(srso_alias_untrain_ret) | srso_alias_safe_ret) -
+ (ABSOLUTE(srso_alias_untrain_ret) & srso_alias_safe_ret)) == ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20)),
+ "SRSO function pair won't alias");
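+/*
+ * (A | B) - (A & B) == A ^ B for any A and B; with only bits 2, 8, 14
+ * and 20 differing between the two symbols, that XOR is 0x104104, the
+ * mask checked above.
+ */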
#endif
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 7f4d13383cf2..d3432687c9e6 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -729,6 +729,9 @@ void kvm_set_cpu_caps(void)
F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
);
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_NO))
+ kvm_cpu_cap_set(X86_FEATURE_SRSO_NO);
+
kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
F(PERFMON_V2)
);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 113ca9661ab2..a983a16163b1 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -637,16 +637,22 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
*max_irr = -1;
for (i = vec = 0; i <= 7; i++, vec += 32) {
+ u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
+
+ irr_val = *p_irr;
pir_val = READ_ONCE(pir[i]);
- irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10));
+
if (pir_val) {
+ pir_val = xchg(&pir[i], 0);
+
prev_irr_val = irr_val;
- irr_val |= xchg(&pir[i], 0);
- *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val;
- if (prev_irr_val != irr_val) {
- max_updated_irr =
- __fls(irr_val ^ prev_irr_val) + vec;
- }
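+ /*
+ * Merge the posted bits into IRR with a cmpxchg loop so that a
+ * concurrent update of this IRR word is not clobbered.
+ */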
+ do {
+ irr_val = prev_irr_val | pir_val;
+ } while (prev_irr_val != irr_val &&
+ !try_cmpxchg(p_irr, &prev_irr_val, irr_val));
+
+ if (prev_irr_val != irr_val)
+ max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec;
}
if (irr_val)
*max_irr = __fls(irr_val) + vec;
@@ -660,8 +666,11 @@ EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);
- return __kvm_apic_update_irr(pir, apic->regs, max_irr);
+ if (unlikely(!apic->apicv_active && irr_updated))
+ apic->irr_pending = true;
+ return irr_updated;
}
EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 07756b7348ae..d3aec1f2cad2 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2417,15 +2417,18 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
*/
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
- vcpu->arch.regs[VCPU_REGS_RAX] = ghcb_get_rax_if_valid(ghcb);
- vcpu->arch.regs[VCPU_REGS_RBX] = ghcb_get_rbx_if_valid(ghcb);
- vcpu->arch.regs[VCPU_REGS_RCX] = ghcb_get_rcx_if_valid(ghcb);
- vcpu->arch.regs[VCPU_REGS_RDX] = ghcb_get_rdx_if_valid(ghcb);
- vcpu->arch.regs[VCPU_REGS_RSI] = ghcb_get_rsi_if_valid(ghcb);
+ BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap));
+ memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap));
- svm->vmcb->save.cpl = ghcb_get_cpl_if_valid(ghcb);
+ vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb);
- if (ghcb_xcr0_is_valid(ghcb)) {
+ svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb);
+
+ if (kvm_ghcb_xcr0_is_valid(svm)) {
vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb);
kvm_update_cpuid_runtime(vcpu);
}
@@ -2436,84 +2439,88 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
control->exit_code_hi = upper_32_bits(exit_code);
control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb);
control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb);
+ svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb);
/* Clear the valid entries fields */
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
+static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
+{
+ return (((u64)control->exit_code_hi) << 32) | control->exit_code;
+}
+
static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
{
- struct kvm_vcpu *vcpu;
- struct ghcb *ghcb;
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
u64 exit_code;
u64 reason;
- ghcb = svm->sev_es.ghcb;
-
/*
* Retrieve the exit code now even though it may not be marked valid
* as it could help with debugging.
*/
- exit_code = ghcb_get_sw_exit_code(ghcb);
+ exit_code = kvm_ghcb_get_sw_exit_code(control);
/* Only GHCB Usage code 0 is supported */
- if (ghcb->ghcb_usage) {
+ if (svm->sev_es.ghcb->ghcb_usage) {
reason = GHCB_ERR_INVALID_USAGE;
goto vmgexit_err;
}
reason = GHCB_ERR_MISSING_INPUT;
- if (!ghcb_sw_exit_code_is_valid(ghcb) ||
- !ghcb_sw_exit_info_1_is_valid(ghcb) ||
- !ghcb_sw_exit_info_2_is_valid(ghcb))
+ if (!kvm_ghcb_sw_exit_code_is_valid(svm) ||
+ !kvm_ghcb_sw_exit_info_1_is_valid(svm) ||
+ !kvm_ghcb_sw_exit_info_2_is_valid(svm))
goto vmgexit_err;
- switch (ghcb_get_sw_exit_code(ghcb)) {
+ switch (exit_code) {
case SVM_EXIT_READ_DR7:
break;
case SVM_EXIT_WRITE_DR7:
- if (!ghcb_rax_is_valid(ghcb))
+ if (!kvm_ghcb_rax_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_RDTSC:
break;
case SVM_EXIT_RDPMC:
- if (!ghcb_rcx_is_valid(ghcb))
+ if (!kvm_ghcb_rcx_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_CPUID:
- if (!ghcb_rax_is_valid(ghcb) ||
- !ghcb_rcx_is_valid(ghcb))
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm))
goto vmgexit_err;
- if (ghcb_get_rax(ghcb) == 0xd)
- if (!ghcb_xcr0_is_valid(ghcb))
+ if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd)
+ if (!kvm_ghcb_xcr0_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_INVD:
break;
case SVM_EXIT_IOIO:
- if (ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_STR_MASK) {
- if (!ghcb_sw_scratch_is_valid(ghcb))
+ if (control->exit_info_1 & SVM_IOIO_STR_MASK) {
+ if (!kvm_ghcb_sw_scratch_is_valid(svm))
goto vmgexit_err;
} else {
- if (!(ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_TYPE_MASK))
- if (!ghcb_rax_is_valid(ghcb))
+ if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK))
+ if (!kvm_ghcb_rax_is_valid(svm))
goto vmgexit_err;
}
break;
case SVM_EXIT_MSR:
- if (!ghcb_rcx_is_valid(ghcb))
+ if (!kvm_ghcb_rcx_is_valid(svm))
goto vmgexit_err;
- if (ghcb_get_sw_exit_info_1(ghcb)) {
- if (!ghcb_rax_is_valid(ghcb) ||
- !ghcb_rdx_is_valid(ghcb))
+ if (control->exit_info_1) {
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rdx_is_valid(svm))
goto vmgexit_err;
}
break;
case SVM_EXIT_VMMCALL:
- if (!ghcb_rax_is_valid(ghcb) ||
- !ghcb_cpl_is_valid(ghcb))
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_cpl_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_RDTSCP:
@@ -2521,19 +2528,19 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
case SVM_EXIT_WBINVD:
break;
case SVM_EXIT_MONITOR:
- if (!ghcb_rax_is_valid(ghcb) ||
- !ghcb_rcx_is_valid(ghcb) ||
- !ghcb_rdx_is_valid(ghcb))
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm) ||
+ !kvm_ghcb_rdx_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_MWAIT:
- if (!ghcb_rax_is_valid(ghcb) ||
- !ghcb_rcx_is_valid(ghcb))
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm))
goto vmgexit_err;
break;
case SVM_VMGEXIT_MMIO_READ:
case SVM_VMGEXIT_MMIO_WRITE:
- if (!ghcb_sw_scratch_is_valid(ghcb))
+ if (!kvm_ghcb_sw_scratch_is_valid(svm))
goto vmgexit_err;
break;
case SVM_VMGEXIT_NMI_COMPLETE:
@@ -2549,11 +2556,9 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
return 0;
vmgexit_err:
- vcpu = &svm->vcpu;
-
if (reason == GHCB_ERR_INVALID_USAGE) {
vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
- ghcb->ghcb_usage);
+ svm->sev_es.ghcb->ghcb_usage);
} else if (reason == GHCB_ERR_INVALID_EVENT) {
vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
exit_code);
@@ -2563,11 +2568,8 @@ vmgexit_err:
dump_ghcb(svm);
}
- /* Clear the valid entries fields */
- memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
-
- ghcb_set_sw_exit_info_1(ghcb, 2);
- ghcb_set_sw_exit_info_2(ghcb, reason);
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, reason);
/* Resume the guest to "return" the error code. */
return 1;
@@ -2586,7 +2588,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm)
*/
if (svm->sev_es.ghcb_sa_sync) {
kvm_write_guest(svm->vcpu.kvm,
- ghcb_get_sw_scratch(svm->sev_es.ghcb),
+ svm->sev_es.sw_scratch,
svm->sev_es.ghcb_sa,
svm->sev_es.ghcb_sa_len);
svm->sev_es.ghcb_sa_sync = false;
@@ -2632,12 +2634,11 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
{
struct vmcb_control_area *control = &svm->vmcb->control;
- struct ghcb *ghcb = svm->sev_es.ghcb;
u64 ghcb_scratch_beg, ghcb_scratch_end;
u64 scratch_gpa_beg, scratch_gpa_end;
void *scratch_va;
- scratch_gpa_beg = ghcb_get_sw_scratch(ghcb);
+ scratch_gpa_beg = svm->sev_es.sw_scratch;
if (!scratch_gpa_beg) {
pr_err("vmgexit: scratch gpa not provided\n");
goto e_scratch;
@@ -2708,8 +2709,8 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
return 0;
e_scratch:
- ghcb_set_sw_exit_info_1(ghcb, 2);
- ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
return 1;
}
@@ -2822,7 +2823,6 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
u64 ghcb_gpa, exit_code;
- struct ghcb *ghcb;
int ret;
/* Validate the GHCB */
@@ -2847,20 +2847,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
}
svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
- ghcb = svm->sev_es.ghcb_map.hva;
- trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb);
-
- exit_code = ghcb_get_sw_exit_code(ghcb);
+ trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);
+ sev_es_sync_from_ghcb(svm);
ret = sev_es_validate_vmgexit(svm);
if (ret)
return ret;
- sev_es_sync_from_ghcb(svm);
- ghcb_set_sw_exit_info_1(ghcb, 0);
- ghcb_set_sw_exit_info_2(ghcb, 0);
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 0);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 0);
+ exit_code = kvm_ghcb_get_sw_exit_code(control);
switch (exit_code) {
case SVM_VMGEXIT_MMIO_READ:
ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
@@ -2898,13 +2896,13 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
break;
case 1:
/* Get AP jump table address */
- ghcb_set_sw_exit_info_2(ghcb, sev->ap_jump_table);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, sev->ap_jump_table);
break;
default:
pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
control->exit_info_1);
- ghcb_set_sw_exit_info_1(ghcb, 2);
- ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT);
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
}
ret = 1;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d381ad424554..d4bfdc607fe7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1498,7 +1498,9 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
- indirect_branch_prediction_barrier();
+
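+ /*
+ * No barrier is needed here when the SRSO ibpb-vmexit mitigation
+ * is active, since an IBPB is already issued on every VM-exit.
+ */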
+ if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT))
+ indirect_branch_prediction_barrier();
}
if (kvm_vcpu_apicv_active(vcpu))
avic_vcpu_load(vcpu, cpu);
@@ -1786,6 +1788,11 @@ static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
}
}
+static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ return true;
+}
+
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3986,14 +3993,8 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
- struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
-
- /*
- * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
- * can't read guest memory (dereference memslots) to decode the WRMSR.
- */
- if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 &&
- nrips && control->next_rip)
+ if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+ to_svm(vcpu)->vmcb->control.exit_info_1)
return handle_fastpath_set_msr_irqoff(vcpu);
return EXIT_FASTPATH_NONE;
@@ -4005,6 +4006,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
guest_state_enter_irqoff();
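+ /* Clear any leaked quotient state (X86_BUG_DIV0) before switching to the guest. */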
+ amd_clear_divider();
+
if (sev_es_guest(vcpu->kvm))
__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
else
@@ -4815,6 +4818,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_segment = svm_set_segment,
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = svm_get_cs_db_l_bits,
+ .is_valid_cr0 = svm_is_valid_cr0,
.set_cr0 = svm_set_cr0,
.post_set_cr3 = sev_post_set_cr3,
.is_valid_cr4 = svm_is_valid_cr4,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 18af7e712a5a..8239c8de45ac 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -190,10 +190,12 @@ struct vcpu_sev_es_state {
/* SEV-ES support */
struct sev_es_save_area *vmsa;
struct ghcb *ghcb;
+ u8 valid_bitmap[16];
struct kvm_host_map ghcb_map;
bool received_first_sipi;
/* SEV-ES scratch area support */
+ u64 sw_scratch;
void *ghcb_sa;
u32 ghcb_sa_len;
bool ghcb_sa_sync;
@@ -744,4 +746,28 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm);
void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+#define DEFINE_KVM_GHCB_ACCESSORS(field) \
+ static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \
+ { \
+ return test_bit(GHCB_BITMAP_IDX(field), \
+ (unsigned long *)&svm->sev_es.valid_bitmap); \
+ } \
+ \
+ static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm, struct ghcb *ghcb) \
+ { \
+ return kvm_ghcb_##field##_is_valid(svm) ? ghcb->save.field : 0; \
+ } \
+
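+/*
+ * E.g. DEFINE_KVM_GHCB_ACCESSORS(rax) generates kvm_ghcb_rax_is_valid(),
+ * which tests the cached valid_bitmap snapshot instead of the
+ * guest-writable GHCB copy, and kvm_ghcb_get_rax_if_valid(), which reads
+ * ghcb->save.rax only when that bit is set.
+ */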
+DEFINE_KVM_GHCB_ACCESSORS(cpl)
+DEFINE_KVM_GHCB_ACCESSORS(rax)
+DEFINE_KVM_GHCB_ACCESSORS(rcx)
+DEFINE_KVM_GHCB_ACCESSORS(rdx)
+DEFINE_KVM_GHCB_ACCESSORS(rbx)
+DEFINE_KVM_GHCB_ACCESSORS(rsi)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_code)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_1)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2)
+DEFINE_KVM_GHCB_ACCESSORS(sw_scratch)
+DEFINE_KVM_GHCB_ACCESSORS(xcr0)
+
#endif
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 8e8295e774f0..ef2ebabb059c 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -222,7 +222,7 @@ SYM_FUNC_START(__svm_vcpu_run)
* because interrupt handlers won't sanitize 'ret' if the return is
* from the kernel.
*/
- UNTRAIN_RET
+ UNTRAIN_RET_VM
/*
* Clear all general purpose registers except RSP and RAX to prevent
@@ -359,7 +359,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
* because interrupt handlers won't sanitize RET if the return is
* from the kernel.
*/
- UNTRAIN_RET
+ UNTRAIN_RET_VM
/* "Pop" @spec_ctrl_intercepted. */
pop %_ASM_BX
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 07e927d4d099..be275a0410a8 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -303,10 +303,8 @@ SYM_FUNC_START(vmx_do_nmi_irqoff)
VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
SYM_FUNC_END(vmx_do_nmi_irqoff)
-
-.section .text, "ax"
-
#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
/**
* vmread_error_trampoline - Trampoline from inline asm to vmread_error()
* @field: VMCS field encoding that failed
@@ -335,7 +333,7 @@ SYM_FUNC_START(vmread_error_trampoline)
mov 3*WORD_SIZE(%_ASM_BP), %_ASM_ARG2
mov 2*WORD_SIZE(%_ASM_BP), %_ASM_ARG1
- call vmread_error
+ call vmread_error_trampoline2
/* Zero out @fault, which will be popped into the result register. */
_ASM_MOV $0, 3*WORD_SIZE(%_ASM_BP)
@@ -357,6 +355,8 @@ SYM_FUNC_START(vmread_error_trampoline)
SYM_FUNC_END(vmread_error_trampoline)
#endif
+.section .text, "ax"
+
SYM_FUNC_START(vmx_do_interrupt_irqoff)
VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
SYM_FUNC_END(vmx_do_interrupt_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0ecf4be2c6af..df461f387e20 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -441,13 +441,23 @@ do { \
pr_warn_ratelimited(fmt); \
} while (0)
-void vmread_error(unsigned long field, bool fault)
+noinline void vmread_error(unsigned long field)
{
- if (fault)
+ vmx_insn_failed("vmread failed: field=%lx\n", field);
+}
+
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
+{
+ if (fault) {
kvm_spurious_fault();
- else
- vmx_insn_failed("vmread failed: field=%lx\n", field);
+ } else {
+ instrumentation_begin();
+ vmread_error(field);
+ instrumentation_end();
+ }
}
+#endif
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
@@ -1503,6 +1513,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long old_rflags;
+ /*
+ * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
+ * is an unrestricted guest in order to mark L2 as needing emulation
+ * if L1 runs L2 as a restricted guest.
+ */
if (is_unrestricted_guest(vcpu)) {
kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
vmx->rflags = rflags;
@@ -3037,6 +3052,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+ /*
+ * KVM should never use VM86 to virtualize Real Mode when L2 is active,
+ * as using VM86 is unnecessary if unrestricted guest is enabled, and
+ * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
+ * should VM-Fail and KVM should reject userspace attempts to stuff
+ * CR0.PG=0 when L2 is active.
+ */
+ WARN_ON_ONCE(is_guest_mode(vcpu));
+
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
@@ -3226,6 +3250,17 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
CPU_BASED_CR3_STORE_EXITING)
+static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ if (is_guest_mode(vcpu))
+ return nested_guest_cr0_valid(vcpu, cr0);
+
+ if (to_vmx(vcpu)->nested.vmxon)
+ return nested_host_cr0_valid(vcpu, cr0);
+
+ return true;
+}
+
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3235,7 +3270,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
- if (is_unrestricted_guest(vcpu))
+ if (enable_unrestricted_guest)
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else {
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
@@ -3263,7 +3298,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
#endif
- if (enable_ept && !is_unrestricted_guest(vcpu)) {
+ if (enable_ept && !enable_unrestricted_guest) {
/*
* Ensure KVM has an up-to-date snapshot of the guest's CR3. If
* the below code _enables_ CR3 exiting, vmx_cache_reg() will
@@ -3394,7 +3429,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
* this bit, even if host CR4.MCE == 0.
*/
hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
- if (is_unrestricted_guest(vcpu))
+ if (enable_unrestricted_guest)
hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
else if (vmx->rmode.vm86_active)
hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
@@ -3414,7 +3449,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vcpu->arch.cr4 = cr4;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
- if (!is_unrestricted_guest(vcpu)) {
+ if (!enable_unrestricted_guest) {
if (enable_ept) {
if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
@@ -4651,7 +4686,8 @@ static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
if (kvm_vmx->pid_table)
return 0;
- pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm));
+ pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ vmx_get_pid_table_order(kvm));
if (!pages)
return -ENOMEM;
@@ -5364,18 +5400,11 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
val = (val & ~vmcs12->cr0_guest_host_mask) |
(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
- if (!nested_guest_cr0_valid(vcpu, val))
- return 1;
-
if (kvm_set_cr0(vcpu, val))
return 1;
vmcs_writel(CR0_READ_SHADOW, orig_val);
return 0;
} else {
- if (to_vmx(vcpu)->nested.vmxon &&
- !nested_host_cr0_valid(vcpu, val))
- return 1;
-
return kvm_set_cr0(vcpu, val);
}
}
@@ -8203,6 +8232,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.set_segment = vmx_set_segment,
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .is_valid_cr0 = vmx_is_valid_cr0,
.set_cr0 = vmx_set_cr0,
.is_valid_cr4 = vmx_is_valid_cr4,
.set_cr4 = vmx_set_cr4,
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index ce47dc265f89..33af7b4c6eb4 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -10,7 +10,7 @@
#include "vmcs.h"
#include "../x86.h"
-void vmread_error(unsigned long field, bool fault);
+void vmread_error(unsigned long field);
void vmwrite_error(unsigned long field, unsigned long value);
void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
@@ -31,6 +31,13 @@ void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
* void vmread_error_trampoline(unsigned long field, bool fault);
*/
extern unsigned long vmread_error_trampoline;
+
+/*
+ * The second VMREAD error trampoline, called from the assembly trampoline,
+ * exists primarily to enable instrumentation for the VM-Fail path.
+ */
+void vmread_error_trampoline2(unsigned long field, bool fault);
+
#endif
static __always_inline void vmcs_check16(unsigned long field)
@@ -101,8 +108,7 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
do_fail:
instrumentation_begin();
- WARN_ONCE(1, KBUILD_MODNAME ": vmread failed: field=%lx\n", field);
- pr_warn_ratelimited(KBUILD_MODNAME ": vmread failed: field=%lx\n", field);
+ vmread_error(field);
instrumentation_end();
return 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a6b9bea62fb8..c381770bcbf1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -906,6 +906,22 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
}
EXPORT_SYMBOL_GPL(load_pdptrs);
+static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+#ifdef CONFIG_X86_64
+ if (cr0 & 0xffffffff00000000UL)
+ return false;
+#endif
+
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+ return false;
+
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+ return false;
+
+ return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0);
+}
+
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
{
/*
@@ -952,20 +968,13 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
- cr0 |= X86_CR0_ET;
-
-#ifdef CONFIG_X86_64
- if (cr0 & 0xffffffff00000000UL)
+ if (!kvm_is_valid_cr0(vcpu, cr0))
return 1;
-#endif
-
- cr0 &= ~CR0_RESERVED_BITS;
- if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
- return 1;
+ cr0 |= X86_CR0_ET;
- if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
- return 1;
+ /* Writes to CR0 reserved bits are ignored, even on Intel. */
+ cr0 &= ~CR0_RESERVED_BITS;
#ifdef CONFIG_X86_64
if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
@@ -1607,7 +1616,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr)
ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
- ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
static u64 kvm_get_arch_capabilities(void)
{
@@ -1664,6 +1673,9 @@ static u64 kvm_get_arch_capabilities(void)
*/
}
+ if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
+ data |= ARCH_CAP_GDS_NO;
+
return data;
}
@@ -2172,6 +2184,8 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
u64 data;
fastpath_t ret = EXIT_FASTPATH_NONE;
+ kvm_vcpu_srcu_read_lock(vcpu);
+
switch (msr) {
case APIC_BASE_MSR + (APIC_ICR >> 4):
data = kvm_read_edx_eax(vcpu);
@@ -2194,6 +2208,8 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
if (ret != EXIT_FASTPATH_NONE)
trace_kvm_msr_write(msr, data);
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
return ret;
}
EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
@@ -10203,9 +10219,13 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
if (r < 0)
goto out;
if (r) {
- kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
- static_call(kvm_x86_inject_irq)(vcpu, false);
- WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
+ int irq = kvm_cpu_get_interrupt(vcpu);
+
+ if (!WARN_ON_ONCE(irq == -1)) {
+ kvm_queue_interrupt(vcpu, irq, false);
+ static_call(kvm_x86_inject_irq)(vcpu, false);
+ WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
+ }
}
if (kvm_cpu_has_injectable_intr(vcpu))
static_call(kvm_x86_enable_irq_window)(vcpu);
@@ -11460,7 +11480,8 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
return false;
}
- return kvm_is_valid_cr4(vcpu, sregs->cr4);
+ return kvm_is_valid_cr4(vcpu, sregs->cr4) &&
+ kvm_is_valid_cr0(vcpu, sregs->cr0);
}
static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
@@ -13185,7 +13206,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
bool kvm_arch_has_irq_bypass(void)
{
- return true;
+ return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
}
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 3fd066d42ec0..cd86aeb5fdd3 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -11,8 +11,9 @@
#include <asm/unwind_hints.h>
#include <asm/percpu.h>
#include <asm/frame.h>
+#include <asm/nops.h>
- .section .text.__x86.indirect_thunk
+ .section .text..__x86.indirect_thunk
.macro POLINE reg
@@ -131,36 +132,107 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array)
*/
#ifdef CONFIG_RETHUNK
- .section .text.__x86.return_thunk
+/*
+ * srso_alias_untrain_ret() and srso_alias_safe_ret() are placed at
+ * special addresses:
+ *
+ * - srso_alias_untrain_ret() is 2M aligned
+ * - srso_alias_safe_ret() is also in the same 2M page but bits 2, 8, 14
+ * and 20 in its virtual address are set (while those bits in the
+ * srso_alias_untrain_ret() function are cleared).
+ *
+ * This guarantees that those two addresses will alias in the branch
+ * target buffer of Zen3/4 generations, leading to any potential
+ * poisoned entries at that BTB slot to get evicted.
+ *
+ * As a result, srso_alias_safe_ret() becomes a safe return.
+ */
+#ifdef CONFIG_CPU_SRSO
+ .section .text..__x86.rethunk_untrain
+
+SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ ASM_NOP2
+ lfence
+ jmp srso_alias_return_thunk
+SYM_FUNC_END(srso_alias_untrain_ret)
+__EXPORT_THUNK(srso_alias_untrain_ret)
+
+ .section .text..__x86.rethunk_safe
+#else
+/* dummy definition for alternatives */
+SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+SYM_FUNC_END(srso_alias_untrain_ret)
+#endif
+
+SYM_START(srso_alias_safe_ret, SYM_L_GLOBAL, SYM_A_NONE)
+ lea 8(%_ASM_SP), %_ASM_SP
+ UNWIND_HINT_FUNC
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+SYM_FUNC_END(srso_alias_safe_ret)
+
+ .section .text..__x86.return_thunk
+
+SYM_CODE_START(srso_alias_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ call srso_alias_safe_ret
+ ud2
+SYM_CODE_END(srso_alias_return_thunk)
+
+/*
+ * Some generic notes on the untraining sequences:
+ *
+ * They are interchangeable when it comes to flushing potentially wrong
+ * RET predictions from the BTB.
+ *
+ * The SRSO Zen1/2 (MOVABS) untraining sequence is longer than the
+ * Retbleed sequence because the return sequence done there
+ * (srso_safe_ret()) is longer and the return sequence must fully nest
+ * (end before) the untraining sequence. Therefore, the untraining
+ * sequence must fully overlap the return sequence.
+ *
+ * Regarding alignment - the instructions which need to be untrained,
+ * must all start at a cacheline boundary for Zen1/2 generations. That
+ * is, instruction sequences starting at srso_safe_ret() and
+ * the respective instruction sequences at retbleed_return_thunk()
+ * must start at a cacheline boundary.
+ */
/*
* Safety details here pertain to the AMD Zen{1,2} microarchitecture:
- * 1) The RET at __x86_return_thunk must be on a 64 byte boundary, for
+ * 1) The RET at retbleed_return_thunk must be on a 64 byte boundary, for
* alignment within the BTB.
- * 2) The instruction at zen_untrain_ret must contain, and not
+ * 2) The instruction at retbleed_untrain_ret must contain, and not
* end with, the 0xc3 byte of the RET.
* 3) STIBP must be enabled, or SMT disabled, to prevent the sibling thread
* from re-poisoning the BTB prediction.
*/
.align 64
- .skip 64 - (__x86_return_thunk - zen_untrain_ret), 0xcc
-SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
+ .skip 64 - (retbleed_return_thunk - retbleed_untrain_ret), 0xcc
+SYM_START(retbleed_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
ANNOTATE_NOENDBR
/*
- * As executed from zen_untrain_ret, this is:
+ * As executed from retbleed_untrain_ret, this is:
*
* TEST $0xcc, %bl
* LFENCE
- * JMP __x86_return_thunk
+ * JMP retbleed_return_thunk
*
* Executing the TEST instruction has a side effect of evicting any BTB
* prediction (potentially attacker controlled) attached to the RET, as
- * __x86_return_thunk + 1 isn't an instruction boundary at the moment.
+ * retbleed_return_thunk + 1 isn't an instruction boundary at the moment.
*/
.byte 0xf6
/*
- * As executed from __x86_return_thunk, this is a plain RET.
+ * As executed from retbleed_return_thunk, this is a plain RET.
*
* As part of the TEST above, RET is the ModRM byte, and INT3 the imm8.
*
@@ -172,13 +244,13 @@ SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
* With SMT enabled and STIBP active, a sibling thread cannot poison
* RET's prediction to a type of its choice, but can evict the
* prediction due to competitive sharing. If the prediction is
- * evicted, __x86_return_thunk will suffer Straight Line Speculation
+ * evicted, retbleed_return_thunk will suffer Straight Line Speculation
* which will be contained safely by the INT3.
*/
-SYM_INNER_LABEL(__x86_return_thunk, SYM_L_GLOBAL)
+SYM_INNER_LABEL(retbleed_return_thunk, SYM_L_GLOBAL)
ret
int3
-SYM_CODE_END(__x86_return_thunk)
+SYM_CODE_END(retbleed_return_thunk)
/*
* Ensure the TEST decoding / BTB invalidation is complete.
@@ -189,11 +261,67 @@ SYM_CODE_END(__x86_return_thunk)
* Jump back and execute the RET in the middle of the TEST instruction.
* INT3 is for SLS protection.
*/
- jmp __x86_return_thunk
+ jmp retbleed_return_thunk
int3
-SYM_FUNC_END(zen_untrain_ret)
-__EXPORT_THUNK(zen_untrain_ret)
+SYM_FUNC_END(retbleed_untrain_ret)
+__EXPORT_THUNK(retbleed_untrain_ret)
+/*
+ * SRSO untraining sequence for Zen1/2, similar to retbleed_untrain_ret()
+ * above. On kernel entry, srso_untrain_ret() is executed which is a
+ *
+ * movabs $0xccccc30824648d48,%rax
+ *
+ * and when the return thunk executes the inner label srso_safe_ret()
+ * later, it is a stack manipulation and a RET which is mispredicted and
+ * thus a "safe" one to use.
+ */
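+/*
+ * The MOVABS immediate is the little-endian encoding of the eight bytes
+ * at srso_safe_ret: 48 8d 64 24 08 (lea 8(%rsp),%rsp), c3 (ret), cc cc
+ * (int3, int3) - the same bytes decode as one MOVABS when entered at
+ * srso_untrain_ret and as the safe return sequence when entered at
+ * srso_safe_ret.
+ */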
+ .align 64
+ .skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc
+SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
+ ANNOTATE_NOENDBR
+ .byte 0x48, 0xb8
+
+/*
+ * This forces the function return instruction to speculate into a trap
+ * (UD2 in srso_return_thunk() below). This RET will then mispredict
+ * and execution will continue at the return site read from the top of
+ * the stack.
+ */
+SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL)
+ lea 8(%_ASM_SP), %_ASM_SP
+ ret
+ int3
+ int3
+ /* end of movabs */
+ lfence
+ call srso_safe_ret
+ ud2
+SYM_CODE_END(srso_safe_ret)
+SYM_FUNC_END(srso_untrain_ret)
+__EXPORT_THUNK(srso_untrain_ret)
+
+SYM_CODE_START(srso_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ call srso_safe_ret
+ ud2
+SYM_CODE_END(srso_return_thunk)
+
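+/*
+ * Dispatch to the matching untraining sequence: srso_select_mitigation()
+ * sets X86_FEATURE_SRSO for Zen1/2 (srso_untrain_ret) or
+ * X86_FEATURE_SRSO_ALIAS for Zen3/4 (srso_alias_untrain_ret); otherwise
+ * the retbleed sequence is used.
+ */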
+SYM_FUNC_START(entry_untrain_ret)
+ ALTERNATIVE_2 "jmp retbleed_untrain_ret", \
+ "jmp srso_untrain_ret", X86_FEATURE_SRSO, \
+ "jmp srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS
+SYM_FUNC_END(entry_untrain_ret)
+__EXPORT_THUNK(entry_untrain_ret)
+
+SYM_CODE_START(__x86_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+SYM_CODE_END(__x86_return_thunk)
EXPORT_SYMBOL(__x86_return_thunk)
#endif /* CONFIG_RETHUNK */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 643d02900fbb..a0ea285878db 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -90,30 +90,35 @@ SYM_CODE_END(xen_cpu_bringup_again)
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
-#ifdef CONFIG_X86_32
- ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
-#else
+#ifdef CONFIG_XEN_PV
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
/* Map the p2m table to a 512GB-aligned user address. */
ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD))
-#endif
-#ifdef CONFIG_XEN_PV
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
-#endif
- ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
- ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,
- .ascii "!writable_page_tables|pae_pgdir_above_4gb")
- ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES,
- .long (1 << XENFEAT_writable_page_tables) | \
- (1 << XENFEAT_dom0) | \
- (1 << XENFEAT_linux_rsdp_unrestricted))
+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables")
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
- ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
- ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1)
- ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
+# define FEATURES_PV (1 << XENFEAT_writable_page_tables)
+#else
+# define FEATURES_PV 0
+#endif
+#ifdef CONFIG_XEN_PVH
+# define FEATURES_PVH (1 << XENFEAT_linux_rsdp_unrestricted)
+#else
+# define FEATURES_PVH 0
+#endif
+#ifdef CONFIG_XEN_DOM0
+# define FEATURES_DOM0 (1 << XENFEAT_dom0)
+#else
+# define FEATURES_DOM0 0
+#endif
+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
+ ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES,
+ .long FEATURES_PV | FEATURES_PVH | FEATURES_DOM0)
+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
+ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
#endif /*CONFIG_XEN */