Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/boot/compressed/Makefile        |   5
-rw-r--r--  arch/x86/boot/compressed/efi_mixed.S     | 107
-rw-r--r--  arch/x86/boot/compressed/head_32.S       |  32
-rw-r--r--  arch/x86/boot/compressed/head_64.S       | 280
-rw-r--r--  arch/x86/boot/compressed/misc.c          |  44
-rw-r--r--  arch/x86/boot/compressed/misc.h          |   2
-rw-r--r--  arch/x86/boot/compressed/pgtable.h       |  10
-rw-r--r--  arch/x86/boot/compressed/pgtable_64.c    |  87
-rw-r--r--  arch/x86/boot/compressed/sev.c           | 112
-rw-r--r--  arch/x86/entry/syscalls/syscall_32.tbl   |   1
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl   |   1
-rw-r--r--  arch/x86/include/asm/boot.h              |   8
-rw-r--r--  arch/x86/include/asm/efi.h               |   7
-rw-r--r--  arch/x86/include/asm/entry-common.h      |   1
-rw-r--r--  arch/x86/include/asm/nospec-branch.h     |  49
-rw-r--r--  arch/x86/include/asm/sev.h               |   6
-rw-r--r--  arch/x86/include/asm/topology.h          |   4
-rw-r--r--  arch/x86/kernel/alternative.c            |   4
-rw-r--r--  arch/x86/kernel/amd_nb.c                 |   8
-rw-r--r--  arch/x86/kernel/cpu/amd.c                |   1
-rw-r--r--  arch/x86/kernel/cpu/bugs.c               |  27
-rw-r--r--  arch/x86/kernel/cpu/common.c             |   2
-rw-r--r--  arch/x86/kernel/cpu/mce/core.c           |  35
-rw-r--r--  arch/x86/kernel/cpu/mce/intel.c          |  19
-rw-r--r--  arch/x86/kernel/cpu/mce/internal.h       |   6
-rw-r--r--  arch/x86/kernel/fpu/context.h            |   3
-rw-r--r--  arch/x86/kernel/fpu/core.c               |   2
-rw-r--r--  arch/x86/kernel/fpu/xstate.c             |   7
-rw-r--r--  arch/x86/kernel/head_64.S                |  32
-rw-r--r--  arch/x86/kernel/kprobes/opt.c            |  40
-rw-r--r--  arch/x86/kernel/paravirt.c               |   8
-rw-r--r--  arch/x86/kernel/smpboot.c                |   8
-rw-r--r--  arch/x86/kernel/static_call.c            |  13
-rw-r--r--  arch/x86/kernel/traps.c                  |   2
-rw-r--r--  arch/x86/kernel/tsc.c                    |   2
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S            |  20
-rw-r--r--  arch/x86/kvm/svm/svm.c                   |   2
-rw-r--r--  arch/x86/kvm/svm/vmenter.S               |   7
-rw-r--r--  arch/x86/lib/retpoline.S                 | 137
39 files changed, 580 insertions, 561 deletions
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 40d2ff503079..71fc531b95b4 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -74,6 +74,11 @@ LDFLAGS_vmlinux += -z noexecstack
ifeq ($(CONFIG_LD_IS_BFD),y)
LDFLAGS_vmlinux += $(call ld-option,--no-warn-rwx-segments)
endif
+ifeq ($(CONFIG_EFI_STUB),y)
+# ensure that the static EFI stub library will be pulled in, even if it is
+# never referenced explicitly from the startup code
+LDFLAGS_vmlinux += -u efi_pe_entry
+endif
LDFLAGS_vmlinux += -T
hostprogs := mkpiggy
diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S
index 4ca70bf93dc0..f4e22ef774ab 100644
--- a/arch/x86/boot/compressed/efi_mixed.S
+++ b/arch/x86/boot/compressed/efi_mixed.S
@@ -26,8 +26,8 @@
* When booting in 64-bit mode on 32-bit EFI firmware, startup_64_mixed_mode()
* is the first thing that runs after switching to long mode. Depending on
* whether the EFI handover protocol or the compat entry point was used to
- * enter the kernel, it will either branch to the 64-bit EFI handover
- * entrypoint at offset 0x390 in the image, or to the 64-bit EFI PE/COFF
+ * enter the kernel, it will either branch to the common 64-bit EFI stub
+ * entrypoint efi_stub_entry() directly, or via the 64-bit EFI PE/COFF
* entrypoint efi_pe_entry(). In the former case, the bootloader must provide a
* struct bootparams pointer as the third argument, so the presence of such a
* pointer is used to disambiguate.
@@ -37,21 +37,23 @@
* | efi32_pe_entry |---->| | | +-----------+--+
* +------------------+ | | +------+----------------+ |
* | startup_32 |---->| startup_64_mixed_mode | |
- * +------------------+ | | +------+----------------+ V
- * | efi32_stub_entry |---->| | | +------------------+
- * +------------------+ +------------+ +---->| efi64_stub_entry |
- * +-------------+----+
- * +------------+ +----------+ |
- * | startup_64 |<----| efi_main |<--------------+
- * +------------+ +----------+
+ * +------------------+ | | +------+----------------+ |
+ * | efi32_stub_entry |---->| | | |
+ * +------------------+ +------------+ | |
+ * V |
+ * +------------+ +----------------+ |
+ * | startup_64 |<----| efi_stub_entry |<--------+
+ * +------------+ +----------------+
*/
SYM_FUNC_START(startup_64_mixed_mode)
lea efi32_boot_args(%rip), %rdx
mov 0(%rdx), %edi
mov 4(%rdx), %esi
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
mov 8(%rdx), %edx // saved bootparams pointer
test %edx, %edx
- jnz efi64_stub_entry
+ jnz efi_stub_entry
+#endif
/*
* efi_pe_entry uses MS calling convention, which requires 32 bytes of
* shadow space on the stack even if all arguments are passed in
@@ -138,6 +140,28 @@ SYM_FUNC_START(__efi64_thunk)
SYM_FUNC_END(__efi64_thunk)
.code32
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
+SYM_FUNC_START(efi32_stub_entry)
+ call 1f
+1: popl %ecx
+
+ /* Clear BSS */
+ xorl %eax, %eax
+ leal (_bss - 1b)(%ecx), %edi
+ leal (_ebss - 1b)(%ecx), %ecx
+ subl %edi, %ecx
+ shrl $2, %ecx
+ cld
+ rep stosl
+
+ add $0x4, %esp /* Discard return address */
+ popl %ecx
+ popl %edx
+ popl %esi
+ jmp efi32_entry
+SYM_FUNC_END(efi32_stub_entry)
+#endif
+
/*
* EFI service pointer must be in %edi.
*
@@ -218,7 +242,7 @@ SYM_FUNC_END(efi_enter32)
* stub may still exit and return to the firmware using the Exit() EFI boot
* service.]
*/
-SYM_FUNC_START(efi32_entry)
+SYM_FUNC_START_LOCAL(efi32_entry)
call 1f
1: pop %ebx
@@ -245,10 +269,6 @@ SYM_FUNC_START(efi32_entry)
jmp startup_32
SYM_FUNC_END(efi32_entry)
-#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime)
-#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol)
-#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base)
-
/*
* efi_status_t efi32_pe_entry(efi_handle_t image_handle,
* efi_system_table_32_t *sys_table)
@@ -256,8 +276,6 @@ SYM_FUNC_END(efi32_entry)
SYM_FUNC_START(efi32_pe_entry)
pushl %ebp
movl %esp, %ebp
- pushl %eax // dummy push to allocate loaded_image
-
pushl %ebx // save callee-save registers
pushl %edi
@@ -266,48 +284,8 @@ SYM_FUNC_START(efi32_pe_entry)
movl $0x80000003, %eax // EFI_UNSUPPORTED
jnz 2f
- call 1f
-1: pop %ebx
-
- /* Get the loaded image protocol pointer from the image handle */
- leal -4(%ebp), %eax
- pushl %eax // &loaded_image
- leal (loaded_image_proto - 1b)(%ebx), %eax
- pushl %eax // pass the GUID address
- pushl 8(%ebp) // pass the image handle
-
- /*
- * Note the alignment of the stack frame.
- * sys_table
- * handle <-- 16-byte aligned on entry by ABI
- * return address
- * frame pointer
- * loaded_image <-- local variable
- * saved %ebx <-- 16-byte aligned here
- * saved %edi
- * &loaded_image
- * &loaded_image_proto
- * handle <-- 16-byte aligned for call to handle_protocol
- */
-
- movl 12(%ebp), %eax // sys_table
- movl ST32_boottime(%eax), %eax // sys_table->boottime
- call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol
- addl $12, %esp // restore argument space
- testl %eax, %eax
- jnz 2f
-
movl 8(%ebp), %ecx // image_handle
movl 12(%ebp), %edx // sys_table
- movl -4(%ebp), %esi // loaded_image
- movl LI32_image_base(%esi), %esi // loaded_image->image_base
- leal (startup_32 - 1b)(%ebx), %ebp // runtime address of startup_32
- /*
- * We need to set the image_offset variable here since startup_32() will
- * use it before we get to the 64-bit efi_pe_entry() in C code.
- */
- subl %esi, %ebp // calculate image_offset
- movl %ebp, (image_offset - 1b)(%ebx) // save image_offset
xorl %esi, %esi
jmp efi32_entry // pass %ecx, %edx, %esi
// no other registers remain live
@@ -318,14 +296,13 @@ SYM_FUNC_START(efi32_pe_entry)
RET
SYM_FUNC_END(efi32_pe_entry)
- .section ".rodata"
- /* EFI loaded image protocol GUID */
- .balign 4
-SYM_DATA_START_LOCAL(loaded_image_proto)
- .long 0x5b1b31a1
- .word 0x9562, 0x11d2
- .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b
-SYM_DATA_END(loaded_image_proto)
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
+ .org efi32_stub_entry + 0x200
+ .code64
+SYM_FUNC_START_NOALIGN(efi64_stub_entry)
+ jmp efi_handover_entry
+SYM_FUNC_END(efi64_stub_entry)
+#endif
.data
.balign 8
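
The comment block above hinges on one test: with the EFI handover protocol the bootloader supplies a struct boot_params pointer, with the PE/COFF path it does not, so a null check selects the entry point. A minimal standalone C sketch of that dispatch follows; startup_mixed_mode() and the stub bodies here are illustrative stand-ins, not the real assembly entry points.

#include <stdio.h>

struct boot_params;                       /* opaque in this sketch */

static void efi_stub_entry(void *handle, void *systab, struct boot_params *bp)
{
        (void)handle; (void)systab;
        printf("handover protocol: boot_params at %p\n", (void *)bp);
}

static void efi_pe_entry(void *handle, void *systab)
{
        (void)handle; (void)systab;
        printf("PE/COFF entry: no boot_params yet\n");
}

/*
 * Mirrors the disambiguation in startup_64_mixed_mode(): a non-NULL bp means
 * the handover protocol was used, otherwise take the PE/COFF entry point.
 */
static void startup_mixed_mode(void *handle, void *systab, struct boot_params *bp)
{
        if (bp)
                efi_stub_entry(handle, systab, bp);
        else
                efi_pe_entry(handle, systab);
}

int main(void)
{
        startup_mixed_mode(NULL, NULL, NULL);   /* takes the PE/COFF path */
        return 0;
}
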
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 987ae727cf9f..1cfe9802a42f 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -84,19 +84,6 @@ SYM_FUNC_START(startup_32)
#ifdef CONFIG_RELOCATABLE
leal startup_32@GOTOFF(%edx), %ebx
-
-#ifdef CONFIG_EFI_STUB
-/*
- * If we were loaded via the EFI LoadImage service, startup_32() will be at an
- * offset to the start of the space allocated for the image. efi_pe_entry() will
- * set up image_offset to tell us where the image actually starts, so that we
- * can use the full available buffer.
- * image_offset = startup_32 - image_base
- * Otherwise image_offset will be zero and has no effect on the calculations.
- */
- subl image_offset@GOTOFF(%edx), %ebx
-#endif
-
movl BP_kernel_alignment(%esi), %eax
decl %eax
addl %eax, %ebx
@@ -150,17 +137,6 @@ SYM_FUNC_START(startup_32)
jmp *%eax
SYM_FUNC_END(startup_32)
-#ifdef CONFIG_EFI_STUB
-SYM_FUNC_START(efi32_stub_entry)
- add $0x4, %esp
- movl 8(%esp), %esi /* save boot_params pointer */
- call efi_main
- /* efi_main returns the possibly relocated address of startup_32 */
- jmp *%eax
-SYM_FUNC_END(efi32_stub_entry)
-SYM_FUNC_ALIAS(efi_stub_entry, efi32_stub_entry)
-#endif
-
.text
SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
@@ -179,13 +155,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
*/
/* push arguments for extract_kernel: */
- pushl output_len@GOTOFF(%ebx) /* decompressed length, end of relocs */
pushl %ebp /* output address */
- pushl input_len@GOTOFF(%ebx) /* input_len */
- leal input_data@GOTOFF(%ebx), %eax
- pushl %eax /* input_data */
- leal boot_heap@GOTOFF(%ebx), %eax
- pushl %eax /* heap area */
pushl %esi /* real mode pointer */
call extract_kernel /* returns kernel entry point in %eax */
addl $24, %esp
@@ -213,8 +183,6 @@ SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)
*/
.bss
.balign 4
-boot_heap:
- .fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
.fill BOOT_STACK_SIZE, 1, 0
boot_stack_end:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 03c4328a88cb..bf4a10a5794f 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -146,19 +146,6 @@ SYM_FUNC_START(startup_32)
#ifdef CONFIG_RELOCATABLE
movl %ebp, %ebx
-
-#ifdef CONFIG_EFI_STUB
-/*
- * If we were loaded via the EFI LoadImage service, startup_32 will be at an
- * offset to the start of the space allocated for the image. efi_pe_entry will
- * set up image_offset to tell us where the image actually starts, so that we
- * can use the full available buffer.
- * image_offset = startup_32 - image_base
- * Otherwise image_offset will be zero and has no effect on the calculations.
- */
- subl rva(image_offset)(%ebp), %ebx
-#endif
-
movl BP_kernel_alignment(%esi), %eax
decl %eax
addl %eax, %ebx
@@ -294,17 +281,6 @@ SYM_FUNC_START(startup_32)
lret
SYM_FUNC_END(startup_32)
-#if IS_ENABLED(CONFIG_EFI_MIXED) && IS_ENABLED(CONFIG_EFI_HANDOVER_PROTOCOL)
- .org 0x190
-SYM_FUNC_START(efi32_stub_entry)
- add $0x4, %esp /* Discard return address */
- popl %ecx
- popl %edx
- popl %esi
- jmp efi32_entry
-SYM_FUNC_END(efi32_stub_entry)
-#endif
-
.code64
.org 0x200
SYM_CODE_START(startup_64)
@@ -346,20 +322,6 @@ SYM_CODE_START(startup_64)
/* Start with the delta to where the kernel will run at. */
#ifdef CONFIG_RELOCATABLE
leaq startup_32(%rip) /* - $startup_32 */, %rbp
-
-#ifdef CONFIG_EFI_STUB
-/*
- * If we were loaded via the EFI LoadImage service, startup_32 will be at an
- * offset to the start of the space allocated for the image. efi_pe_entry will
- * set up image_offset to tell us where the image actually starts, so that we
- * can use the full available buffer.
- * image_offset = startup_32 - image_base
- * Otherwise image_offset will be zero and has no effect on the calculations.
- */
- movl image_offset(%rip), %eax
- subq %rax, %rbp
-#endif
-
movl BP_kernel_alignment(%rsi), %eax
decl %eax
addq %rax, %rbp
@@ -398,10 +360,6 @@ SYM_CODE_START(startup_64)
* For the trampoline, we need the top page table to reside in lower
* memory as we don't have a way to load 64-bit values into CR3 in
* 32-bit mode.
- *
- * We go though the trampoline even if we don't have to: if we're
- * already in a desired paging mode. This way the trampoline code gets
- * tested on every boot.
*/
/* Make sure we have GDT with 32-bit code segment */
@@ -416,10 +374,14 @@ SYM_CODE_START(startup_64)
lretq
.Lon_kernel_cs:
+ /*
+ * RSI holds a pointer to a boot_params structure provided by the
+ * loader, and this needs to be preserved across C function calls. So
+ * move it into a callee saved register.
+ */
+ movq %rsi, %r15
- pushq %rsi
call load_stage1_idt
- popq %rsi
#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
@@ -430,63 +392,24 @@ SYM_CODE_START(startup_64)
* CPUID instructions being issued, so go ahead and do that now via
* sev_enable(), which will also handle the rest of the SEV-related
* detection/setup to ensure that has been done in advance of any dependent
- * code.
+ * code. Pass the boot_params pointer as the first argument.
*/
- pushq %rsi
- movq %rsi, %rdi /* real mode address */
+ movq %r15, %rdi
call sev_enable
- popq %rsi
#endif
/*
- * paging_prepare() sets up the trampoline and checks if we need to
- * enable 5-level paging.
+ * configure_5level_paging() updates the number of paging levels using
+ * a trampoline in 32-bit addressable memory if the current number does
+ * not match the desired number.
*
- * paging_prepare() returns a two-quadword structure which lands
- * into RDX:RAX:
- * - Address of the trampoline is returned in RAX.
- * - Non zero RDX means trampoline needs to enable 5-level
- * paging.
- *
- * RSI holds real mode data and needs to be preserved across
- * this function call.
- */
- pushq %rsi
- movq %rsi, %rdi /* real mode address */
- call paging_prepare
- popq %rsi
-
- /* Save the trampoline address in RCX */
- movq %rax, %rcx
-
- /*
- * Load the address of trampoline_return() into RDI.
- * It will be used by the trampoline to return to the main code.
+ * Pass the boot_params pointer as the first argument. The second
+ * argument is the relocated address of the page table to use instead
+ * of the page table in trampoline memory (if required).
*/
- leaq trampoline_return(%rip), %rdi
-
- /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
- pushq $__KERNEL32_CS
- leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax
- pushq %rax
- lretq
-trampoline_return:
- /* Restore the stack, the 32-bit trampoline uses its own stack */
- leaq rva(boot_stack_end)(%rbx), %rsp
-
- /*
- * cleanup_trampoline() would restore trampoline memory.
- *
- * RDI is address of the page table to use instead of page table
- * in trampoline memory (if required).
- *
- * RSI holds real mode data and needs to be preserved across
- * this function call.
- */
- pushq %rsi
- leaq rva(top_pgtable)(%rbx), %rdi
- call cleanup_trampoline
- popq %rsi
+ movq %r15, %rdi
+ leaq rva(top_pgtable)(%rbx), %rsi
+ call configure_5level_paging
/* Zero EFLAGS */
pushq $0
@@ -496,7 +419,6 @@ trampoline_return:
* Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
*/
- pushq %rsi
leaq (_bss-8)(%rip), %rsi
leaq rva(_bss-8)(%rbx), %rdi
movl $(_bss - startup_32), %ecx
@@ -504,7 +426,6 @@ trampoline_return:
std
rep movsq
cld
- popq %rsi
/*
* The GDT may get overwritten either during the copy we just did or
@@ -523,21 +444,6 @@ trampoline_return:
jmp *%rax
SYM_CODE_END(startup_64)
-#ifdef CONFIG_EFI_STUB
-#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
- .org 0x390
-#endif
-SYM_FUNC_START(efi64_stub_entry)
- and $~0xf, %rsp /* realign the stack */
- movq %rdx, %rbx /* save boot_params pointer */
- call efi_main
- movq %rbx,%rsi
- leaq rva(startup_64)(%rax), %rax
- jmp *%rax
-SYM_FUNC_END(efi64_stub_entry)
-SYM_FUNC_ALIAS(efi_stub_entry, efi64_stub_entry)
-#endif
-
.text
SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
@@ -551,128 +457,122 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
shrq $3, %rcx
rep stosq
- pushq %rsi
call load_stage2_idt
/* Pass boot_params to initialize_identity_maps() */
- movq (%rsp), %rdi
+ movq %r15, %rdi
call initialize_identity_maps
- popq %rsi
/*
* Do the extraction, and jump to the new kernel..
*/
- pushq %rsi /* Save the real mode argument */
- movq %rsi, %rdi /* real mode address */
- leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
- leaq input_data(%rip), %rdx /* input_data */
- movl input_len(%rip), %ecx /* input_len */
- movq %rbp, %r8 /* output target address */
- movl output_len(%rip), %r9d /* decompressed length, end of relocs */
+ /* pass struct boot_params pointer and output target address */
+ movq %r15, %rdi
+ movq %rbp, %rsi
call extract_kernel /* returns kernel entry point in %rax */
- popq %rsi
/*
* Jump to the decompressed kernel.
*/
+ movq %r15, %rsi
jmp *%rax
SYM_FUNC_END(.Lrelocated)
- .code32
/*
- * This is the 32-bit trampoline that will be copied over to low memory.
+ * This is the 32-bit trampoline that will be copied over to low memory. It
+ * will be called using the ordinary 64-bit calling convention from code
+ * running in 64-bit mode.
*
- * RDI contains the return address (might be above 4G).
- * ECX contains the base address of the trampoline memory.
- * Non zero RDX means trampoline needs to enable 5-level paging.
+ * Return address is at the top of the stack (might be above 4G).
+ * The first argument (EDI) contains the address of the temporary PGD level
+ * page table in 32-bit addressable memory which will be programmed into
+ * register CR3.
*/
+ .section ".rodata", "a", @progbits
SYM_CODE_START(trampoline_32bit_src)
- /* Set up data and stack segments */
- movl $__KERNEL_DS, %eax
- movl %eax, %ds
- movl %eax, %ss
+ /*
+ * Preserve callee save 64-bit registers on the stack: this is
+ * necessary because the architecture does not guarantee that GPRs will
+ * retain their full 64-bit values across a 32-bit mode switch.
+ */
+ pushq %r15
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbp
+ pushq %rbx
+
+ /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */
+ movq %rsp, %rbx
+ shrq $32, %rbx
+
+ /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
+ pushq $__KERNEL32_CS
+ leaq 0f(%rip), %rax
+ pushq %rax
+ lretq
- /* Set up new stack */
- leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp
+ /*
+ * The 32-bit code below will do a far jump back to long mode and end
+ * up here after reconfiguring the number of paging levels. First, the
+ * stack pointer needs to be restored to its full 64-bit value before
+ * the callee save register contents can be popped from the stack.
+ */
+.Lret:
+ shlq $32, %rbx
+ orq %rbx, %rsp
+
+ /* Restore the preserved 64-bit registers */
+ popq %rbx
+ popq %rbp
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %r15
+ retq
+ .code32
+0:
/* Disable paging */
movl %cr0, %eax
btrl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0
- /* Check what paging mode we want to be in after the trampoline */
- testl %edx, %edx
- jz 1f
-
- /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */
- movl %cr4, %eax
- testl $X86_CR4_LA57, %eax
- jnz 3f
- jmp 2f
-1:
- /* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */
- movl %cr4, %eax
- testl $X86_CR4_LA57, %eax
- jz 3f
-2:
/* Point CR3 to the trampoline's new top level page table */
- leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax
- movl %eax, %cr3
-3:
+ movl %edi, %cr3
+
/* Set EFER.LME=1 as a precaution in case hypervisor pulls the rug */
- pushl %ecx
- pushl %edx
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_LME, %eax
/* Avoid writing EFER if no change was made (for TDX guest) */
jc 1f
wrmsr
-1: popl %edx
- popl %ecx
-
-#ifdef CONFIG_X86_MCE
- /*
- * Preserve CR4.MCE if the kernel will enable #MC support.
- * Clearing MCE may fault in some environments (that also force #MC
- * support). Any machine check that occurs before #MC support is fully
- * configured will crash the system regardless of the CR4.MCE value set
- * here.
- */
- movl %cr4, %eax
- andl $X86_CR4_MCE, %eax
-#else
- movl $0, %eax
-#endif
-
- /* Enable PAE and LA57 (if required) paging modes */
- orl $X86_CR4_PAE, %eax
- testl %edx, %edx
- jz 1f
- orl $X86_CR4_LA57, %eax
1:
+ /* Toggle CR4.LA57 */
+ movl %cr4, %eax
+ btcl $X86_CR4_LA57_BIT, %eax
movl %eax, %cr4
- /* Calculate address of paging_enabled() once we are executing in the trampoline */
- leal .Lpaging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax
-
- /* Prepare the stack for far return to Long Mode */
- pushl $__KERNEL_CS
- pushl %eax
-
/* Enable paging again. */
movl %cr0, %eax
btsl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0
- lret
+ /*
+ * Return to the 64-bit calling code using LJMP rather than LRET, to
+ * avoid the need for a 32-bit addressable stack. The destination
+ * address will be adjusted after the template code is copied into a
+ * 32-bit addressable buffer.
+ */
+.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src)
SYM_CODE_END(trampoline_32bit_src)
- .code64
-SYM_FUNC_START_LOCAL_NOALIGN(.Lpaging_enabled)
- /* Return from the trampoline */
- jmp *%rdi
-SYM_FUNC_END(.Lpaging_enabled)
+/*
+ * This symbol is placed right after trampoline_32bit_src() so its address can
+ * be used to infer the size of the trampoline code.
+ */
+SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src)
/*
* The trampoline code has a size limit.
@@ -681,7 +581,7 @@ SYM_FUNC_END(.Lpaging_enabled)
*/
.org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
- .code32
+ .text
SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode)
/* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */
1:
@@ -726,8 +626,6 @@ SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)
*/
.bss
.balign 4
-SYM_DATA_LOCAL(boot_heap, .fill BOOT_HEAP_SIZE, 1, 0)
-
SYM_DATA_START_LOCAL(boot_stack)
.fill BOOT_STACK_SIZE, 1, 0
.balign 16
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 94b7abcf624b..f711f2a85862 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -330,6 +330,33 @@ static size_t parse_elf(void *output)
return ehdr.e_entry - LOAD_PHYSICAL_ADDR;
}
+const unsigned long kernel_total_size = VO__end - VO__text;
+
+static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4);
+
+extern unsigned char input_data[];
+extern unsigned int input_len, output_len;
+
+unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
+ void (*error)(char *x))
+{
+ unsigned long entry;
+
+ if (!free_mem_ptr) {
+ free_mem_ptr = (unsigned long)boot_heap;
+ free_mem_end_ptr = (unsigned long)boot_heap + sizeof(boot_heap);
+ }
+
+ if (__decompress(input_data, input_len, NULL, NULL, outbuf, output_len,
+ NULL, error) < 0)
+ return ULONG_MAX;
+
+ entry = parse_elf(outbuf);
+ handle_relocations(outbuf, output_len, virt_addr);
+
+ return entry;
+}
+
/*
* The compressed kernel image (ZO), has been moved so that its position
* is against the end of the buffer used to hold the uncompressed kernel
@@ -347,14 +374,10 @@ static size_t parse_elf(void *output)
* |-------uncompressed kernel image---------|
*
*/
-asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
- unsigned char *input_data,
- unsigned long input_len,
- unsigned char *output,
- unsigned long output_len)
+asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
{
- const unsigned long kernel_total_size = VO__end - VO__text;
unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
+ memptr heap = (memptr)boot_heap;
unsigned long needed_size;
size_t entry_offset;
@@ -412,7 +435,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
* entries. This ensures the full mapped area is usable RAM
* and doesn't include any reserved areas.
*/
- needed_size = max(output_len, kernel_total_size);
+ needed_size = max_t(unsigned long, output_len, kernel_total_size);
#ifdef CONFIG_X86_64
needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN);
#endif
@@ -443,7 +466,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
#ifdef CONFIG_X86_64
if (heap > 0x3fffffffffffUL)
error("Destination address too large");
- if (virt_addr + max(output_len, kernel_total_size) > KERNEL_IMAGE_SIZE)
+ if (virt_addr + needed_size > KERNEL_IMAGE_SIZE)
error("Destination virtual address is beyond the kernel mapping area");
#else
if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
@@ -461,10 +484,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
accept_memory(__pa(output), __pa(output) + needed_size);
}
- __decompress(input_data, input_len, NULL, NULL, output, output_len,
- NULL, error);
- entry_offset = parse_elf(output);
- handle_relocations(output, output_len, virt_addr);
+ entry_offset = decompress_kernel(output, virt_addr, error);
debug_putstr("done.\nBooting the kernel (entry_offset: 0x");
debug_puthex(entry_offset);
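
The new decompress_kernel() helper above falls back to the statically allocated boot_heap whenever the caller has not configured a malloc arena. A small standalone C sketch of that lazy-fallback pattern; the demo_* names are invented for illustration only.

#include <stdint.h>
#include <stdio.h>

#define DEMO_HEAP_SIZE 4096

static uint8_t demo_heap[DEMO_HEAP_SIZE];

static uintptr_t free_mem_ptr;            /* 0 until a heap is configured */
static uintptr_t free_mem_end_ptr;

static void demo_ensure_heap(void)
{
        /* Install the built-in heap only if no caller configured one. */
        if (!free_mem_ptr) {
                free_mem_ptr = (uintptr_t)demo_heap;
                free_mem_end_ptr = (uintptr_t)demo_heap + sizeof(demo_heap);
        }
}

int main(void)
{
        demo_ensure_heap();
        printf("heap: %#lx - %#lx\n",
               (unsigned long)free_mem_ptr, (unsigned long)free_mem_end_ptr);
        return 0;
}
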
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 964fe903a1cd..cc70d3fb9049 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -179,9 +179,7 @@ static inline int count_immovable_mem_regions(void) { return 0; }
#endif
/* ident_map_64.c */
-#ifdef CONFIG_X86_5LEVEL
extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d;
-#endif
extern void kernel_add_identity_map(unsigned long start, unsigned long end);
/* Used by PAGE_KERN* macros: */
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
index cc9b2529a086..6d595abe06b3 100644
--- a/arch/x86/boot/compressed/pgtable.h
+++ b/arch/x86/boot/compressed/pgtable.h
@@ -3,18 +3,16 @@
#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE)
-#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0
-
#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
-#define TRAMPOLINE_32BIT_CODE_SIZE 0x80
-
-#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE
+#define TRAMPOLINE_32BIT_CODE_SIZE 0xA0
#ifndef __ASSEMBLER__
extern unsigned long *trampoline_32bit;
-extern void trampoline_32bit_src(void *return_ptr);
+extern void trampoline_32bit_src(void *trampoline, bool enable_5lvl);
+
+extern const u16 trampoline_ljmp_imm_offset;
#endif /* __ASSEMBLER__ */
#endif /* BOOT_COMPRESSED_PAGETABLE_H */
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 2ac12ff4111b..7939eb6e6ce9 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -16,11 +16,6 @@ unsigned int __section(".data") pgdir_shift = 39;
unsigned int __section(".data") ptrs_per_p4d = 1;
#endif
-struct paging_config {
- unsigned long trampoline_start;
- unsigned long l5_required;
-};
-
/* Buffer to preserve trampoline memory */
static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
@@ -29,7 +24,7 @@ static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
* purposes.
*
* Avoid putting the pointer into .bss as it will be cleared between
- * paging_prepare() and extract_kernel().
+ * configure_5level_paging() and extract_kernel().
*/
unsigned long *trampoline_32bit __section(".data");
@@ -106,12 +101,13 @@ static unsigned long find_trampoline_placement(void)
return bios_start - TRAMPOLINE_32BIT_SIZE;
}
-struct paging_config paging_prepare(void *rmode)
+asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
{
- struct paging_config paging_config = {};
+ void (*toggle_la57)(void *cr3);
+ bool l5_required = false;
/* Initialize boot_params. Required for cmdline_find_option_bool(). */
- boot_params = rmode;
+ boot_params = bp;
/*
* Check if LA57 is desired and supported.
@@ -129,12 +125,22 @@ struct paging_config paging_prepare(void *rmode)
!cmdline_find_option_bool("no5lvl") &&
native_cpuid_eax(0) >= 7 &&
(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
- paging_config.l5_required = 1;
+ l5_required = true;
+
+ /* Initialize variables for 5-level paging */
+ __pgtable_l5_enabled = 1;
+ pgdir_shift = 48;
+ ptrs_per_p4d = 512;
}
- paging_config.trampoline_start = find_trampoline_placement();
+ /*
+ * The trampoline will not be used if the paging mode is already set to
+ * the desired one.
+ */
+ if (l5_required == !!(native_read_cr4() & X86_CR4_LA57))
+ return;
- trampoline_32bit = (unsigned long *)paging_config.trampoline_start;
+ trampoline_32bit = (unsigned long *)find_trampoline_placement();
/* Preserve trampoline memory */
memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE);
@@ -143,32 +149,32 @@ struct paging_config paging_prepare(void *rmode)
memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE);
/* Copy trampoline code in place */
- memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long),
+ toggle_la57 = memcpy(trampoline_32bit +
+ TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long),
&trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE);
/*
+ * Avoid the need for a stack in the 32-bit trampoline code, by using
+ * LJMP rather than LRET to return back to long mode. LJMP takes an
+ * immediate absolute address, which needs to be adjusted based on the
+ * placement of the trampoline.
+ */
+ *(u32 *)((u8 *)toggle_la57 + trampoline_ljmp_imm_offset) +=
+ (unsigned long)toggle_la57;
+
+ /*
* The code below prepares page table in trampoline memory.
*
* The new page table will be used by trampoline code for switching
* from 4- to 5-level paging or vice versa.
- *
- * If switching is not required, the page table is unused: trampoline
- * code wouldn't touch CR3.
- */
-
- /*
- * We are not going to use the page table in trampoline memory if we
- * are already in the desired paging mode.
*/
- if (paging_config.l5_required == !!(native_read_cr4() & X86_CR4_LA57))
- goto out;
- if (paging_config.l5_required) {
+ if (l5_required) {
/*
* For 4- to 5-level paging transition, set up current CR3 as
* the first and the only entry in a new top-level page table.
*/
- trampoline_32bit[TRAMPOLINE_32BIT_PGTABLE_OFFSET] = __native_read_cr3() | _PAGE_TABLE_NOENC;
+ *trampoline_32bit = __native_read_cr3() | _PAGE_TABLE_NOENC;
} else {
unsigned long src;
@@ -181,38 +187,17 @@ struct paging_config paging_prepare(void *rmode)
* may be above 4G.
*/
src = *(unsigned long *)__native_read_cr3() & PAGE_MASK;
- memcpy(trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long),
- (void *)src, PAGE_SIZE);
+ memcpy(trampoline_32bit, (void *)src, PAGE_SIZE);
}
-out:
- return paging_config;
-}
-
-void cleanup_trampoline(void *pgtable)
-{
- void *trampoline_pgtable;
-
- trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long);
+ toggle_la57(trampoline_32bit);
/*
- * Move the top level page table out of trampoline memory,
- * if it's there.
+ * Move the top level page table out of trampoline memory.
*/
- if ((void *)__native_read_cr3() == trampoline_pgtable) {
- memcpy(pgtable, trampoline_pgtable, PAGE_SIZE);
- native_write_cr3((unsigned long)pgtable);
- }
+ memcpy(pgtable, trampoline_32bit, PAGE_SIZE);
+ native_write_cr3((unsigned long)pgtable);
/* Restore trampoline memory */
memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE);
-
- /* Initialize variables for 5-level paging */
-#ifdef CONFIG_X86_5LEVEL
- if (__read_cr4() & X86_CR4_LA57) {
- __pgtable_l5_enabled = 1;
- pgdir_shift = 48;
- ptrs_per_p4d = 512;
- }
-#endif
}
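
The trampoline_ljmp_imm_offset fixup above is a by-hand relocation: the LJMP operand is assembled as an offset from trampoline_32bit_src, and after the template is copied the placement address is added to turn it into an absolute target. A standalone C sketch of the same idea, using a fake byte "template" rather than real trampoline code:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Fake code template: a 32-bit operand at offset 4 holds the target as an
 * offset from the start of the template (here 12, i.e. template + 12). */
static const uint8_t code_template[16] = { 0x90, 0x90, 0x90, 0x90, 12, 0, 0, 0 };
#define IMM_OFFSET 4    /* plays the role of trampoline_ljmp_imm_offset */

int main(void)
{
        uint8_t buf[sizeof(code_template)];
        uint32_t imm;

        /* "Copy trampoline code in place" */
        memcpy(buf, code_template, sizeof(code_template));

        /* Read the template-relative operand, add the placement address
         * (truncated to 32 bits, as the real trampoline lives below 4G),
         * and write it back: the operand is now an absolute address. */
        memcpy(&imm, buf + IMM_OFFSET, sizeof(imm));
        imm += (uint32_t)(uintptr_t)buf;
        memcpy(buf + IMM_OFFSET, &imm, sizeof(imm));

        printf("patched operand: %#x\n", imm);
        return 0;
}
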
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index e83e71089cf1..dc8c876fbd8f 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -367,20 +367,25 @@ static void enforce_vmpl0(void)
*/
#define SNP_FEATURES_PRESENT MSR_AMD64_SNP_DEBUG_SWAP
+u64 snp_get_unsupported_features(u64 status)
+{
+ if (!(status & MSR_AMD64_SEV_SNP_ENABLED))
+ return 0;
+
+ return status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT;
+}
+
void snp_check_features(void)
{
u64 unsupported;
- if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
- return;
-
/*
* Terminate the boot if hypervisor has enabled any feature lacking
* guest side implementation. Pass on the unsupported features mask through
* EXIT_INFO_2 of the GHCB protocol so that those features can be reported
* as part of the guest boot failure.
*/
- unsupported = sev_status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT;
+ unsupported = snp_get_unsupported_features(sev_status);
if (unsupported) {
if (ghcb_version < 2 || (!boot_ghcb && !early_setup_ghcb()))
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
@@ -390,35 +395,22 @@ void snp_check_features(void)
}
}
-void sev_enable(struct boot_params *bp)
+/*
+ * sev_check_cpu_support - Check for SEV support in the CPU capabilities
+ *
+ * Returns < 0 if SEV is not supported, otherwise the position of the
+ * encryption bit in the page table descriptors.
+ */
+static int sev_check_cpu_support(void)
{
unsigned int eax, ebx, ecx, edx;
- struct msr m;
- bool snp;
-
- /*
- * bp->cc_blob_address should only be set by boot/compressed kernel.
- * Initialize it to 0 to ensure that uninitialized values from
- * buggy bootloaders aren't propagated.
- */
- if (bp)
- bp->cc_blob_address = 0;
-
- /*
- * Do an initial SEV capability check before snp_init() which
- * loads the CPUID page and the same checks afterwards are done
- * without the hypervisor and are trustworthy.
- *
- * If the HV fakes SEV support, the guest will crash'n'burn
- * which is good enough.
- */
/* Check for the SME/SEV support leaf */
eax = 0x80000000;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
if (eax < 0x8000001f)
- return;
+ return -ENODEV;
/*
* Check for the SME/SEV feature:
@@ -433,6 +425,35 @@ void sev_enable(struct boot_params *bp)
native_cpuid(&eax, &ebx, &ecx, &edx);
/* Check whether SEV is supported */
if (!(eax & BIT(1)))
+ return -ENODEV;
+
+ return ebx & 0x3f;
+}
+
+void sev_enable(struct boot_params *bp)
+{
+ struct msr m;
+ int bitpos;
+ bool snp;
+
+ /*
+ * bp->cc_blob_address should only be set by boot/compressed kernel.
+ * Initialize it to 0 to ensure that uninitialized values from
+ * buggy bootloaders aren't propagated.
+ */
+ if (bp)
+ bp->cc_blob_address = 0;
+
+ /*
+ * Do an initial SEV capability check before snp_init() which
+ * loads the CPUID page and the same checks afterwards are done
+ * without the hypervisor and are trustworthy.
+ *
+ * If the HV fakes SEV support, the guest will crash'n'burn
+ * which is good enough.
+ */
+
+ if (sev_check_cpu_support() < 0)
return;
/*
@@ -443,26 +464,8 @@ void sev_enable(struct boot_params *bp)
/* Now repeat the checks with the SNP CPUID table. */
- /* Recheck the SME/SEV support leaf */
- eax = 0x80000000;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- if (eax < 0x8000001f)
- return;
-
- /*
- * Recheck for the SME/SEV feature:
- * CPUID Fn8000_001F[EAX]
- * - Bit 0 - Secure Memory Encryption support
- * - Bit 1 - Secure Encrypted Virtualization support
- * CPUID Fn8000_001F[EBX]
- * - Bits 5:0 - Pagetable bit position used to indicate encryption
- */
- eax = 0x8000001f;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- /* Check whether SEV is supported */
- if (!(eax & BIT(1))) {
+ bitpos = sev_check_cpu_support();
+ if (bitpos < 0) {
if (snp)
error("SEV-SNP support indicated by CC blob, but not CPUID.");
return;
@@ -494,7 +497,24 @@ void sev_enable(struct boot_params *bp)
if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
error("SEV-SNP supported indicated by CC blob, but not SEV status MSR.");
- sme_me_mask = BIT_ULL(ebx & 0x3f);
+ sme_me_mask = BIT_ULL(bitpos);
+}
+
+/*
+ * sev_get_status - Retrieve the SEV status mask
+ *
+ * Returns 0 if the CPU is not SEV capable, otherwise the value of the
+ * AMD64_SEV MSR.
+ */
+u64 sev_get_status(void)
+{
+ struct msr m;
+
+ if (sev_check_cpu_support() < 0)
+ return 0;
+
+ boot_rdmsr(MSR_AMD64_SEV, &m);
+ return m.q;
}
/* Search for Confidential Computing blob in the EFI config table. */
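
The new sev_check_cpu_support() helper reads the maximum extended CPUID leaf, the SEV feature bit in Fn8000_001F[EAX], and the C-bit position in Fn8000_001F[EBX] bits 5:0. A rough x86 userspace analogue using the compiler's <cpuid.h> (the demo_* name is illustrative; the boot code uses native_cpuid() instead):

#include <cpuid.h>
#include <stdio.h>

/* Returns the encryption-bit position from CPUID Fn8000_001F[EBX] bits 5:0,
 * or -1 if the SEV feature bit (Fn8000_001F[EAX] bit 1) is not reported. */
static int demo_sev_check_cpu_support(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* __get_cpuid() fails if 0x8000001f exceeds the maximum extended
         * leaf, mirroring the 0x80000000 check in the patch above. */
        if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx))
                return -1;

        if (!(eax & (1u << 1)))         /* bit 1: SEV supported */
                return -1;

        return ebx & 0x3f;              /* pagetable encryption bit position */
}

int main(void)
{
        int bitpos = demo_sev_check_cpu_support();

        if (bitpos < 0)
                printf("SEV not reported by CPUID\n");
        else
                printf("C-bit position: %d\n", bitpos);
        return 0;
}
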
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index bc0a3c941b35..2d0b1bd866ea 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -456,3 +456,4 @@
449 i386 futex_waitv sys_futex_waitv
450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
451 i386 cachestat sys_cachestat
+452 i386 fchmodat2 sys_fchmodat2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 227538b0ce80..814768249eae 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -373,6 +373,7 @@
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 9191280d9ea3..4ae14339cb8c 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -62,4 +62,12 @@
# define BOOT_STACK_SIZE 0x1000
#endif
+#ifndef __ASSEMBLY__
+extern unsigned int output_len;
+extern const unsigned long kernel_total_size;
+
+unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
+ void (*error)(char *x));
+#endif
+
#endif /* _ASM_X86_BOOT_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 8b4be7cecdb8..b0994ae3bc23 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,6 +90,8 @@ static inline void efi_fpu_end(void)
}
#ifdef CONFIG_X86_32
+#define EFI_X86_KERNEL_ALLOC_LIMIT (SZ_512M - 1)
+
#define arch_efi_call_virt_setup() \
({ \
efi_fpu_begin(); \
@@ -103,8 +105,7 @@ static inline void efi_fpu_end(void)
})
#else /* !CONFIG_X86_32 */
-
-#define EFI_LOADER_SIGNATURE "EL64"
+#define EFI_X86_KERNEL_ALLOC_LIMIT EFI_ALLOC_LIMIT
extern asmlinkage u64 __efi_call(void *fp, ...);
@@ -218,6 +219,8 @@ efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size,
#ifdef CONFIG_EFI_MIXED
+#define EFI_ALLOC_LIMIT (efi_is_64bit() ? ULONG_MAX : U32_MAX)
+
#define ARCH_HAS_EFISTUB_WRAPPERS
static inline bool efi_is_64bit(void)
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 117903881fe4..ce8f50192ae3 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -92,6 +92,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
static __always_inline void arch_exit_to_user_mode(void)
{
mds_user_clear_cpu_buffers();
+ amd_clear_divider();
}
#define arch_exit_to_user_mode arch_exit_to_user_mode
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 3faf044569a5..c55cc243592e 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -272,9 +272,9 @@
.endm
#ifdef CONFIG_CPU_UNRET_ENTRY
-#define CALL_ZEN_UNTRAIN_RET "call zen_untrain_ret"
+#define CALL_UNTRAIN_RET "call entry_untrain_ret"
#else
-#define CALL_ZEN_UNTRAIN_RET ""
+#define CALL_UNTRAIN_RET ""
#endif
/*
@@ -282,7 +282,7 @@
* return thunk isn't mapped into the userspace tables (then again, AMD
* typically has NO_MELTDOWN).
*
- * While zen_untrain_ret() doesn't clobber anything but requires stack,
+ * While retbleed_untrain_ret() doesn't clobber anything but requires stack,
* entry_ibpb() will clobber AX, CX, DX.
*
* As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point
@@ -293,14 +293,20 @@
defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO)
VALIDATE_UNRET_END
ALTERNATIVE_3 "", \
- CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \
+ CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \
"call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \
__stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
#endif
+.endm
-#ifdef CONFIG_CPU_SRSO
- ALTERNATIVE_2 "", "call srso_untrain_ret", X86_FEATURE_SRSO, \
- "call srso_untrain_ret_alias", X86_FEATURE_SRSO_ALIAS
+.macro UNTRAIN_RET_VM
+#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
+ defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO)
+ VALIDATE_UNRET_END
+ ALTERNATIVE_3 "", \
+ CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \
+ "call entry_ibpb", X86_FEATURE_IBPB_ON_VMEXIT, \
+ __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
#endif
.endm
@@ -309,15 +315,10 @@
defined(CONFIG_CALL_DEPTH_TRACKING)
VALIDATE_UNRET_END
ALTERNATIVE_3 "", \
- CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \
+ CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \
"call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \
__stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH
#endif
-
-#ifdef CONFIG_CPU_SRSO
- ALTERNATIVE_2 "", "call srso_untrain_ret", X86_FEATURE_SRSO, \
- "call srso_untrain_ret_alias", X86_FEATURE_SRSO_ALIAS
-#endif
.endm
@@ -341,17 +342,24 @@ extern retpoline_thunk_t __x86_indirect_thunk_array[];
extern retpoline_thunk_t __x86_indirect_call_thunk_array[];
extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];
+#ifdef CONFIG_RETHUNK
extern void __x86_return_thunk(void);
-extern void zen_untrain_ret(void);
+#else
+static inline void __x86_return_thunk(void) {}
+#endif
+
+extern void retbleed_return_thunk(void);
+extern void srso_return_thunk(void);
+extern void srso_alias_return_thunk(void);
+
+extern void retbleed_untrain_ret(void);
extern void srso_untrain_ret(void);
-extern void srso_untrain_ret_alias(void);
+extern void srso_alias_untrain_ret(void);
+
+extern void entry_untrain_ret(void);
extern void entry_ibpb(void);
-#ifdef CONFIG_CALL_THUNKS
extern void (*x86_return_thunk)(void);
-#else
-#define x86_return_thunk (&__x86_return_thunk)
-#endif
#ifdef CONFIG_CALL_DEPTH_TRACKING
extern void __x86_return_skl(void);
@@ -478,9 +486,6 @@ enum ssb_mitigation {
SPEC_STORE_BYPASS_SECCOMP,
};
-extern char __indirect_thunk_start[];
-extern char __indirect_thunk_end[];
-
static __always_inline
void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
{
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 66c806784c52..5b4a1ce3d368 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -164,6 +164,7 @@ static __always_inline void sev_es_nmi_complete(void)
__sev_es_nmi_complete();
}
extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd);
+extern void sev_enable(struct boot_params *bp);
static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs)
{
@@ -210,12 +211,15 @@ bool snp_init(struct boot_params *bp);
void __init __noreturn snp_abort(void);
int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
+u64 snp_get_unsupported_features(u64 status);
+u64 sev_get_status(void);
#else
static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; }
static inline void sev_es_nmi_complete(void) { }
static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; }
+static inline void sev_enable(struct boot_params *bp) { }
static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; }
static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; }
static inline void setup_ghcb(void) { }
@@ -235,6 +239,8 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in
}
static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
+static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
+static inline u64 sev_get_status(void) { return 0; }
#endif
#endif
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index caf41c4869a0..3235ba1e5b06 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -136,10 +136,11 @@ static inline int topology_max_smt_threads(void)
return __max_smt_threads;
}
+#include <linux/cpu_smt.h>
+
int topology_update_package_map(unsigned int apicid, unsigned int cpu);
int topology_update_die_map(unsigned int dieid, unsigned int cpu);
int topology_phys_to_logical_pkg(unsigned int pkg);
-bool topology_smt_supported(void);
extern struct cpumask __cpu_primary_thread_mask;
#define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask)
@@ -162,7 +163,6 @@ static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
static inline int topology_max_die_per_package(void) { return 1; }
static inline int topology_max_smt_threads(void) { return 1; }
static inline bool topology_is_primary_thread(unsigned int cpu) { return true; }
-static inline bool topology_smt_supported(void) { return false; }
#endif /* !CONFIG_SMP */
static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2dcf3a06af09..099d58d02a26 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -687,10 +687,6 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
#ifdef CONFIG_RETHUNK
-#ifdef CONFIG_CALL_THUNKS
-void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk;
-#endif
-
/*
* Rewrite the compiler generated return thunk tail-calls.
*
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 035a3db5330b..356de955e78d 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -24,6 +24,8 @@
#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5
#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8
#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8
+#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a
+#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507
#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
@@ -39,6 +41,7 @@
#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4
#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4
#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc
+#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4
#define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4
/* Protect the PCI config register pairs used for SMN. */
@@ -56,6 +59,8 @@ static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) },
{}
};
@@ -85,6 +90,8 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) },
{}
};
@@ -106,6 +113,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) },
{}
};
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 70f9d56f9305..7eca6a8abbb1 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1329,3 +1329,4 @@ void noinstr amd_clear_divider(void)
asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0)
:: "a" (0), "d" (0), "r" (1));
}
+EXPORT_SYMBOL_GPL(amd_clear_divider);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d02f73c5339d..f081d26616ac 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -63,6 +63,8 @@ EXPORT_SYMBOL_GPL(x86_pred_cmd);
static DEFINE_MUTEX(spec_ctrl_mutex);
+void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk;
+
/* Update SPEC_CTRL MSR and its cached copy unconditionally */
static void update_spec_ctrl(u64 val)
{
@@ -165,6 +167,11 @@ void __init cpu_select_mitigations(void)
md_clear_select_mitigation();
srbds_select_mitigation();
l1d_flush_select_mitigation();
+
+ /*
+ * srso_select_mitigation() depends on and must run after
+ * retbleed_select_mitigation().
+ */
srso_select_mitigation();
gds_select_mitigation();
}
@@ -1035,6 +1042,9 @@ do_cmd_auto:
setup_force_cpu_cap(X86_FEATURE_RETHUNK);
setup_force_cpu_cap(X86_FEATURE_UNRET);
+ if (IS_ENABLED(CONFIG_RETHUNK))
+ x86_return_thunk = retbleed_return_thunk;
+
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
pr_err(RETBLEED_UNTRAIN_MSG);
@@ -1044,6 +1054,7 @@ do_cmd_auto:
case RETBLEED_MITIGATION_IBPB:
setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
mitigate_smt = true;
break;
@@ -2417,9 +2428,10 @@ static void __init srso_select_mitigation(void)
* Zen1/2 with SMT off aren't vulnerable after the right
* IBPB microcode has been applied.
*/
- if ((boot_cpu_data.x86 < 0x19) &&
- (!cpu_smt_possible() || (cpu_smt_control == CPU_SMT_DISABLED)))
+ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
+ return;
+ }
}
if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
@@ -2448,11 +2460,15 @@ static void __init srso_select_mitigation(void)
* like ftrace, static_call, etc.
*/
setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_UNRET);
- if (boot_cpu_data.x86 == 0x19)
+ if (boot_cpu_data.x86 == 0x19) {
setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
- else
+ x86_return_thunk = srso_alias_return_thunk;
+ } else {
setup_force_cpu_cap(X86_FEATURE_SRSO);
+ x86_return_thunk = srso_return_thunk;
+ }
srso_mitigation = SRSO_MITIGATION_SAFE_RET;
} else {
pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
@@ -2696,6 +2712,9 @@ static ssize_t retbleed_show_state(char *buf)
static ssize_t srso_show_state(char *buf)
{
+ if (boot_cpu_has(X86_FEATURE_SRSO_NO))
+ return sysfs_emit(buf, "Mitigation: SMT disabled\n");
+
return sysfs_emit(buf, "%s%s\n",
srso_strings[srso_mitigation],
(cpu_has_ibpb_brtype_microcode() ? "" : ", no microcode"));
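
With x86_return_thunk now defined in bugs.c and assigned per mitigation, the return-thunk choice is a plain function pointer selected once at boot (the actual return sites are rewritten by the alternatives machinery rather than dispatched through the pointer at runtime). A standalone C sketch of just that selection pattern, with illustrative names:

#include <stdio.h>

/* Default and mitigation-specific "thunks" for this sketch. */
static void default_return_thunk(void)   { puts("default thunk"); }
static void retbleed_thunk(void)         { puts("retbleed thunk"); }
static void srso_thunk(void)             { puts("srso thunk"); }
static void srso_alias_thunk(void)       { puts("srso alias thunk"); }

/* Mirrors x86_return_thunk: a single hook, set once during init. */
static void (*return_thunk)(void) = default_return_thunk;

/* Mirrors the bugs.c selection: pick the thunk from the detected family. */
static void select_mitigation(int cpu_family, int srso)
{
        if (srso)
                return_thunk = (cpu_family == 0x19) ? srso_alias_thunk
                                                    : srso_thunk;
        else
                return_thunk = retbleed_thunk;
}

int main(void)
{
        select_mitigation(0x19, 1);
        return_thunk();                 /* prints "srso alias thunk" */
        return 0;
}
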
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e3a65e9fc750..281fc3f6ea6b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2343,7 +2343,7 @@ void __init arch_cpu_finalize_init(void)
* identify_boot_cpu() initialized SMT support information, let the
* core code know.
*/
- cpu_smt_check_topology();
+ cpu_smt_set_num_threads(smp_num_siblings, smp_num_siblings);
if (!IS_ENABLED(CONFIG_SMP)) {
pr_info("CPU: ");
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 89e2aab5d34d..6f35f724cc14 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -843,6 +843,26 @@ static noinstr bool quirk_skylake_repmov(void)
}
/*
+ * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
+ * errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
+ *
+ * However, the context is still valid, so save the "cs" register for later use.
+ *
+ * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
+ *
+ * The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
+ */
+static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 1)
+ return;
+ if (!(m->status & MCI_STATUS_POISON))
+ return;
+
+ m->cs = regs->cs;
+}
+
+/*
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
@@ -861,6 +881,9 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo
if (mce_flags.snb_ifu_quirk)
quirk_sandybridge_ifu(i, m, regs);
+ if (mce_flags.zen_ifu_quirk)
+ quirk_zen_ifu(i, m, regs);
+
m->bank = i;
if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
mce_read_aux(m, i);
@@ -1608,6 +1631,13 @@ static void __start_timer(struct timer_list *t, unsigned long interval)
local_irq_restore(flags);
}
+static void mc_poll_banks_default(void)
+{
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+}
+
+void (*mc_poll_banks)(void) = mc_poll_banks_default;
+
static void mce_timer_fn(struct timer_list *t)
{
struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
@@ -1618,7 +1648,7 @@ static void mce_timer_fn(struct timer_list *t)
iv = __this_cpu_read(mce_next_interval);
if (mce_available(this_cpu_ptr(&cpu_info))) {
- machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+ mc_poll_banks();
if (mce_intel_cmci_poll()) {
iv = mce_adjust_timer(iv);
@@ -1842,6 +1872,9 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 0x15 && c->x86_model <= 0xf)
mce_flags.overflow_recov = 1;
+ if (c->x86 >= 0x17 && c->x86 <= 0x1A)
+ mce_flags.zen_ifu_quirk = 1;
+
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 95275a5e57e0..f5323551c1a9 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -56,6 +56,13 @@ static DEFINE_PER_CPU(int, cmci_backoff_cnt);
*/
static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
+/*
+ * On systems that do support CMCI but it's disabled, polling for MCEs can
+ * cause the same event to be reported multiple times because IA32_MCi_STATUS
+ * is shared by the same package.
+ */
+static DEFINE_SPINLOCK(cmci_poll_lock);
+
#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
#define CMCI_STORM_INTERVAL (HZ)
@@ -426,12 +433,22 @@ void cmci_disable_bank(int bank)
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
+/* Bank polling function when CMCI is disabled. */
+static void cmci_mc_poll_banks(void)
+{
+ spin_lock(&cmci_poll_lock);
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+ spin_unlock(&cmci_poll_lock);
+}
+
void intel_init_cmci(void)
{
int banks;
- if (!cmci_supported(&banks))
+ if (!cmci_supported(&banks)) {
+ mc_poll_banks = cmci_mc_poll_banks;
return;
+ }
mce_threshold_vector = intel_threshold_interrupt;
cmci_discover(banks);
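
mc_poll_banks (defined in mce/core.c above) is a hook with a default implementation; intel_init_cmci() swaps in cmci_mc_poll_banks() so that polling is serialized when CMCI is unavailable and the shared status registers would otherwise yield duplicate reports. A standalone userspace C sketch of that default-plus-override hook pattern, using a pthread mutex in place of the kernel spinlock (names are illustrative):

#include <pthread.h>
#include <stdio.h>

static void poll_banks_default(void)
{
        puts("polling banks (no serialization needed)");
}

static pthread_mutex_t poll_lock = PTHREAD_MUTEX_INITIALIZER;

/* Serialized variant, analogous to cmci_mc_poll_banks(): shared status
 * registers mean only one poller at a time should report events. */
static void poll_banks_serialized(void)
{
        pthread_mutex_lock(&poll_lock);
        puts("polling banks under lock");
        pthread_mutex_unlock(&poll_lock);
}

/* Mirrors mc_poll_banks: default behaviour, overridable at init time. */
static void (*poll_banks)(void) = poll_banks_default;

int main(void)
{
        int cmci_supported = 0;         /* pretend CMCI is absent/disabled */

        if (!cmci_supported)
                poll_banks = poll_banks_serialized;

        poll_banks();
        return 0;
}
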
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index d2412ce2d312..bcf1b3c66c9c 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -157,6 +157,9 @@ struct mce_vendor_flags {
*/
smca : 1,
+ /* Zen IFU quirk */
+ zen_ifu_quirk : 1,
+
/* AMD-style error thresholding banks present. */
amd_threshold : 1,
@@ -172,7 +175,7 @@ struct mce_vendor_flags {
/* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */
skx_repmov_quirk : 1,
- __reserved_0 : 56;
+ __reserved_0 : 55;
};
extern struct mce_vendor_flags mce_flags;
@@ -274,4 +277,5 @@ static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
return 0;
}
+extern void (*mc_poll_banks)(void);
#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h
index af5cbdd9bd29..f6d856bd50bc 100644
--- a/arch/x86/kernel/fpu/context.h
+++ b/arch/x86/kernel/fpu/context.h
@@ -19,8 +19,7 @@
* FPU state for a task MUST let the rest of the kernel know that the
* FPU registers are no longer valid for this task.
*
- * Either one of these invalidation functions is enough. Invalidate
- * a resource you control: CPU if using the CPU for something else
+ * Invalidate a resource you control: CPU if using the CPU for something else
* (with preemption disabled), FPU for the current task, or a task that
* is prevented from running by the current task.
*/
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 1015af1ae562..98e507cc7d34 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -679,7 +679,7 @@ static void fpu_reset_fpregs(void)
struct fpu *fpu = &current->thread.fpu;
fpregs_lock();
- fpu__drop(fpu);
+ __fpu_invalidate_fpregs_state(fpu);
/*
* This does not change the actual hardware registers. It just
* resets the memory image and sets TIF_NEED_FPU_LOAD so a
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 0bab497c9436..1afbc4866b10 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -882,6 +882,13 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
goto out_disable;
}
+ /*
+ * CPU capabilities initialization runs before FPU init. So
+ * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
+ * functional, set the feature bit so depending code works.
+ */
+ setup_force_cpu_cap(X86_FEATURE_OSXSAVE);
+
print_xstate_offset_size();
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
fpu_kernel_cfg.max_features,
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index c5b9289837dc..ea6995920b7a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -51,7 +51,9 @@ SYM_CODE_START_NOALIGN(startup_64)
* for us. These identity mapped page tables map all of the
* kernel pages and possibly all of memory.
*
- * %rsi holds a physical pointer to real_mode_data.
+ * %RSI holds the physical address of the boot_params structure
+ * provided by the bootloader. Preserve it in %R15 so C function calls
+ * will not clobber it.
*
* We come here either directly from a 64bit bootloader, or from
* arch/x86/boot/compressed/head_64.S.
@@ -62,6 +64,7 @@ SYM_CODE_START_NOALIGN(startup_64)
* compiled to run at we first fixup the physical addresses in our page
* tables and then reload them.
*/
+ mov %rsi, %r15
/* Set up the stack for verify_cpu() */
leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp
@@ -75,9 +78,7 @@ SYM_CODE_START_NOALIGN(startup_64)
shrq $32, %rdx
wrmsr
- pushq %rsi
call startup_64_setup_env
- popq %rsi
/* Now switch to __KERNEL_CS so IRET works reliably */
pushq $__KERNEL_CS
@@ -93,12 +94,10 @@ SYM_CODE_START_NOALIGN(startup_64)
* Activate SEV/SME memory encryption if supported/enabled. This needs to
* be done now, since this also includes setup of the SEV-SNP CPUID table,
* which needs to be done before any CPUID instructions are executed in
- * subsequent code.
+ * subsequent code. Pass the boot_params pointer as the first argument.
*/
- movq %rsi, %rdi
- pushq %rsi
+ movq %r15, %rdi
call sme_enable
- popq %rsi
#endif
/* Sanitize CPU configuration */
@@ -111,9 +110,8 @@ SYM_CODE_START_NOALIGN(startup_64)
* programmed into CR3.
*/
leaq _text(%rip), %rdi
- pushq %rsi
+ movq %r15, %rsi
call __startup_64
- popq %rsi
/* Form the CR3 value being sure to include the CR3 modifier */
addq $(early_top_pgt - __START_KERNEL_map), %rax
@@ -127,8 +125,6 @@ SYM_CODE_START(secondary_startup_64)
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded a mapped page table.
*
- * %rsi holds a physical pointer to real_mode_data.
- *
* We come here either from startup_64 (using physical addresses)
* or from trampoline.S (using virtual addresses).
*
@@ -153,6 +149,9 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
+ /* Clear %R15 which holds the boot_params pointer on the boot CPU */
+ xorq %r15, %r15
+
/*
* Retrieve the modifier (SME encryption mask if SME is active) to be
* added to the initial pgdir entry that will be programmed into CR3.
@@ -199,13 +198,9 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
* hypervisor could lie about the C-bit position to perform a ROP
* attack on the guest by writing to the unencrypted stack and waiting for
* the next RET instruction.
- * %rsi carries pointer to realmode data and is callee-clobbered. Save
- * and restore it.
*/
- pushq %rsi
movq %rax, %rdi
call sev_verify_cbit
- popq %rsi
/*
* Switch to new page-table
@@ -365,9 +360,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
wrmsr
/* Setup and Load IDT */
- pushq %rsi
call early_setup_idt
- popq %rsi
/* Check if nx is implemented */
movl $0x80000001, %eax
@@ -403,9 +396,8 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
pushq $0
popfq
- /* rsi is pointer to real mode structure with interesting info.
- pass it to C */
- movq %rsi, %rdi
+ /* Pass the boot_params pointer as first argument */
+ movq %r15, %rdi
.Ljump_to_C_code:
/*
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 57b0037d0a99..517821b48391 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -226,7 +226,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
}
/* Check whether insn is indirect jump */
-static int __insn_is_indirect_jump(struct insn *insn)
+static int insn_is_indirect_jump(struct insn *insn)
{
return ((insn->opcode.bytes[0] == 0xff &&
(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
@@ -260,26 +260,6 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
return (start <= target && target <= start + len);
}
-static int insn_is_indirect_jump(struct insn *insn)
-{
- int ret = __insn_is_indirect_jump(insn);
-
-#ifdef CONFIG_RETPOLINE
- /*
- * Jump to x86_indirect_thunk_* is treated as an indirect jump.
- * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with
- * older gcc may use indirect jump. So we add this check instead of
- * replace indirect-jump check.
- */
- if (!ret)
- ret = insn_jump_into_range(insn,
- (unsigned long)__indirect_thunk_start,
- (unsigned long)__indirect_thunk_end -
- (unsigned long)__indirect_thunk_start);
-#endif
- return ret;
-}
-
/* Decode whole function to ensure any instructions don't jump into target */
static int can_optimize(unsigned long paddr)
{
@@ -334,9 +314,21 @@ static int can_optimize(unsigned long paddr)
/* Recover address */
insn.kaddr = (void *)addr;
insn.next_byte = (void *)(addr + insn.length);
- /* Check any instructions don't jump into target */
- if (insn_is_indirect_jump(&insn) ||
- insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE,
+ /*
+ * Check that no instruction jumps into the target, either
+ * indirectly or directly.
+ *
+ * The indirect case is there to handle code with jump tables.
+ * When the kernel uses retpolines, the check should in theory
+ * also look for jumps to indirect thunks. However, a kernel
+ * built with retpolines or IBT has jump tables disabled, so the
+ * check can be skipped altogether.
+ */
+ if (!IS_ENABLED(CONFIG_RETPOLINE) &&
+ !IS_ENABLED(CONFIG_X86_KERNEL_IBT) &&
+ insn_is_indirect_jump(&insn))
+ return 0;
+ if (insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE,
DISP32_SIZE))
return 0;
addr += insn.length;
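
The range check used above, insn_jump_into_range(), boils down to "start <= target && target <= start + len"; can_optimize() applies it to the region an optimized probe would overwrite so that no other instruction jumps into the middle of it. A trivial stand-alone sketch of that test, taking the target directly instead of decoding it, with made-up addresses (this is not the kernel's instruction decoder):

#include <stdio.h>

/* Same shape as the kernel's range test. */
static int jump_into_range(unsigned long target, unsigned long start, int len)
{
        return start <= target && target <= start + len;
}

int main(void)
{
        unsigned long paddr = 0x1000;   /* hypothetical probed address */
        int window = 5;                 /* roughly a DISP32_SIZE-style window */

        /* A jump landing inside the window would block optimization. */
        printf("%d\n", jump_into_range(0x1003, paddr + 1, window));    /* 1 */
        /* A jump landing far away is fine. */
        printf("%d\n", jump_into_range(0x2000, paddr + 1, window));    /* 0 */
        return 0;
}
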
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ac10b46c5832..23d4d7114473 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -79,6 +79,11 @@ void __init native_pv_lock_init(void)
static_branch_disable(&virt_spin_lock_key);
}
+static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+ tlb_remove_page(tlb, table);
+}
+
unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr,
unsigned int len)
{
@@ -295,8 +300,7 @@ struct paravirt_patch_template pv_ops = {
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
- .mmu.tlb_remove_table =
- (void (*)(struct mmu_gather *, void *))tlb_remove_page,
+ .mmu.tlb_remove_table = native_tlb_remove_table,
.mmu.exit_mmap = paravirt_nop,
.mmu.notify_page_enc_status_changed = paravirt_nop,
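
The hunk above replaces a cast of tlb_remove_page() to a function-pointer type whose second parameter is void * with a small typed wrapper. Calling a function through a pointer of a mismatched type is undefined behaviour in C and is the kind of thing indirect-call checking (e.g. CFI) can flag, so the wrapper is the safer form. A stand-alone illustration of the pattern with made-up types; gather, page and remove_table here are placeholders, not the kernel's mmu_gather API:

#include <stdio.h>

struct gather { int pages_freed; };
struct page   { int id; };

/* The "real" callee takes a struct page *, not a void *. */
static void remove_page(struct gather *g, struct page *p)
{
        g->pages_freed++;
        printf("freed page %d\n", p->id);
}

/* Typed wrapper with exactly the signature the hook expects. */
static void remove_table(struct gather *g, void *table)
{
        remove_page(g, table);          /* void * converts to struct page * */
}

/* Hook slot declared as (struct gather *, void *). */
static void (*tlb_remove_table_hook)(struct gather *, void *) = remove_table;

int main(void)
{
        struct gather g = { 0 };
        struct page p = { 42 };

        /*
         * Calling through the wrapper is well defined; casting remove_page
         * to the hook's type and calling it through that cast would not be.
         */
        tlb_remove_table_hook(&g, &p);
        return 0;
}
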
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e1aa2cd7734b..d4e897b820c4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -327,14 +327,6 @@ static void notrace start_secondary(void *unused)
}
/**
- * topology_smt_supported - Check whether SMT is supported by the CPUs
- */
-bool topology_smt_supported(void)
-{
- return smp_num_siblings > 1;
-}
-
-/**
* topology_phys_to_logical_pkg - Map a physical package id to a logical
* @phys_pkg: The physical package id to map
*
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index b70670a98597..77a9316da435 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -186,6 +186,19 @@ EXPORT_SYMBOL_GPL(arch_static_call_transform);
*/
bool __static_call_fixup(void *tramp, u8 op, void *dest)
{
+ unsigned long addr = (unsigned long)tramp;
+ /*
+ * Not every .return_sites entry is a static_call trampoline (most are
+ * not). Check whether the 3 bytes after the RET are still kernel text;
+ * if not, this is definitely not a trampoline and we need not worry
+ * further.
+ *
+ * This avoids the memcmp() below tripping over page faults etc.
+ */
+ if (((addr >> PAGE_SHIFT) != ((addr + 7) >> PAGE_SHIFT)) &&
+ !kernel_text_address(addr + 7))
+ return false;
+
if (memcmp(tramp+5, tramp_ud, 3)) {
/* Not a trampoline site, not our problem. */
return false;
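
The added guard only falls back to kernel_text_address() when the trampoline's tail bytes might sit on a different page than tramp itself, i.e. when addr and addr + 7 have different page numbers; otherwise the memcmp() cannot fault and no lookup is needed. A stand-alone sketch of just that page-crossing arithmetic, with PAGE_SHIFT fixed at 12 purely for illustration:

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages, hypothetical fixed value */

/* Does the byte at addr + 7 live on a different page than addr? */
static int crosses_page(unsigned long addr)
{
        return (addr >> PAGE_SHIFT) != ((addr + 7) >> PAGE_SHIFT);
}

int main(void)
{
        printf("%d\n", crosses_page(0x1000));   /* 0: window fully inside one page */
        printf("%d\n", crosses_page(0x1ff8));   /* 0: 0x1ff8 + 7 = 0x1fff, same page */
        printf("%d\n", crosses_page(0x1ffa));   /* 1: 0x1ffa + 7 = 0x2001, next page */
        return 0;
}
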
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1885326a8f65..4a817d20ce3b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -206,8 +206,6 @@ DEFINE_IDTENTRY(exc_divide_error)
{
do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE,
FPE_INTDIV, error_get_trap_addr(regs));
-
- amd_clear_divider();
}
DEFINE_IDTENTRY(exc_overflow)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 3425c6a943e4..15f97c0abc9d 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1258,7 +1258,7 @@ static void __init check_system_tsc_reliable(void)
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
- nr_online_nodes <= 2)
+ nr_online_nodes <= 4)
tsc_disable_clocksource_watchdog();
}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index ef06211bae4c..83d41c2601d7 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -133,27 +133,25 @@ SECTIONS
KPROBES_TEXT
SOFTIRQENTRY_TEXT
#ifdef CONFIG_RETPOLINE
- __indirect_thunk_start = .;
- *(.text.__x86.indirect_thunk)
- *(.text.__x86.return_thunk)
- __indirect_thunk_end = .;
+ *(.text..__x86.indirect_thunk)
+ *(.text..__x86.return_thunk)
#endif
STATIC_CALL_TEXT
ALIGN_ENTRY_TEXT_BEGIN
#ifdef CONFIG_CPU_SRSO
- *(.text.__x86.rethunk_untrain)
+ *(.text..__x86.rethunk_untrain)
#endif
ENTRY_TEXT
#ifdef CONFIG_CPU_SRSO
/*
- * See the comment above srso_untrain_ret_alias()'s
+ * See the comment above srso_alias_untrain_ret()'s
* definition.
*/
- . = srso_untrain_ret_alias | (1 << 2) | (1 << 8) | (1 << 14) | (1 << 20);
- *(.text.__x86.rethunk_safe)
+ . = srso_alias_untrain_ret | (1 << 2) | (1 << 8) | (1 << 14) | (1 << 20);
+ *(.text..__x86.rethunk_safe)
#endif
ALIGN_ENTRY_TEXT_END
*(.gnu.warning)
@@ -523,7 +521,7 @@ INIT_PER_CPU(irq_stack_backing_store);
#endif
#ifdef CONFIG_RETHUNK
-. = ASSERT((__ret & 0x3f) == 0, "__ret not cacheline-aligned");
+. = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
#endif
@@ -538,8 +536,8 @@ INIT_PER_CPU(irq_stack_backing_store);
* Instead do: (A | B) - (A & B) in order to compute the XOR
* of the two function addresses:
*/
-. = ASSERT(((ABSOLUTE(srso_untrain_ret_alias) | srso_safe_ret_alias) -
- (ABSOLUTE(srso_untrain_ret_alias) & srso_safe_ret_alias)) == ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20)),
+. = ASSERT(((ABSOLUTE(srso_alias_untrain_ret) | srso_alias_safe_ret) -
+ (ABSOLUTE(srso_alias_untrain_ret) & srso_alias_safe_ret)) == ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20)),
"SRSO function pair won't alias");
#endif
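
The ASSERT above avoids a direct XOR, presumably because the linker's expression handling here cannot do one, and instead relies on the identity A ^ B == (A | B) - (A & B), requiring the result to be exactly (1 << 2) | (1 << 8) | (1 << 14) | (1 << 20) so the two SRSO symbols alias in the BTB. A quick stand-alone check of that identity with a hypothetical address; only the mask and the identity come from the linker script:

#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned long long mask = (1ULL << 2) | (1ULL << 8) | (1ULL << 14) | (1ULL << 20);
        unsigned long long a = 0xffffffff82000040ULL;   /* hypothetical untrain address */
        unsigned long long b = a | mask;                /* alias bits set; a has them clear */

        /* (A | B) - (A & B) equals A ^ B for any A and B. */
        assert(((a | b) - (a & b)) == (a ^ b));
        /* And here the XOR is exactly the required alias mask. */
        assert((a ^ b) == mask);

        printf("alias mask: 0x%llx\n", a ^ b);
        return 0;
}
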
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 03e852dedcc1..d4bfdc607fe7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4006,6 +4006,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
guest_state_enter_irqoff();
+ amd_clear_divider();
+
if (sev_es_guest(vcpu->kvm))
__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
else
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 265452fc9ebe..ef2ebabb059c 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -222,10 +222,7 @@ SYM_FUNC_START(__svm_vcpu_run)
* because interrupt handlers won't sanitize 'ret' if the return is
* from the kernel.
*/
- UNTRAIN_RET
-
- /* SRSO */
- ALTERNATIVE "", "call entry_ibpb", X86_FEATURE_IBPB_ON_VMEXIT
+ UNTRAIN_RET_VM
/*
* Clear all general purpose registers except RSP and RAX to prevent
@@ -362,7 +359,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
* because interrupt handlers won't sanitize RET if the return is
* from the kernel.
*/
- UNTRAIN_RET
+ UNTRAIN_RET_VM
/* "Pop" @spec_ctrl_intercepted. */
pop %_ASM_BX
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 2cff585f22f2..cd86aeb5fdd3 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -13,7 +13,7 @@
#include <asm/frame.h>
#include <asm/nops.h>
- .section .text.__x86.indirect_thunk
+ .section .text..__x86.indirect_thunk
.macro POLINE reg
@@ -133,75 +133,106 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array)
#ifdef CONFIG_RETHUNK
/*
- * srso_untrain_ret_alias() and srso_safe_ret_alias() are placed at
+ * srso_alias_untrain_ret() and srso_alias_safe_ret() are placed at
* special addresses:
*
- * - srso_untrain_ret_alias() is 2M aligned
- * - srso_safe_ret_alias() is also in the same 2M page but bits 2, 8, 14
+ * - srso_alias_untrain_ret() is 2M aligned
+ * - srso_alias_safe_ret() is also in the same 2M page but bits 2, 8, 14
* and 20 in its virtual address are set (while those bits in the
- * srso_untrain_ret_alias() function are cleared).
+ * srso_alias_untrain_ret() function are cleared).
*
* This guarantees that those two addresses will alias in the branch
* target buffer of Zen3/4 generations, leading to any potentially
* poisoned entries at that BTB slot being evicted.
*
- * As a result, srso_safe_ret_alias() becomes a safe return.
+ * As a result, srso_alias_safe_ret() becomes a safe return.
*/
#ifdef CONFIG_CPU_SRSO
- .section .text.__x86.rethunk_untrain
+ .section .text..__x86.rethunk_untrain
-SYM_START(srso_untrain_ret_alias, SYM_L_GLOBAL, SYM_A_NONE)
+SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
+ UNWIND_HINT_FUNC
ANNOTATE_NOENDBR
ASM_NOP2
lfence
- jmp __x86_return_thunk
-SYM_FUNC_END(srso_untrain_ret_alias)
-__EXPORT_THUNK(srso_untrain_ret_alias)
-
- .section .text.__x86.rethunk_safe
+ jmp srso_alias_return_thunk
+SYM_FUNC_END(srso_alias_untrain_ret)
+__EXPORT_THUNK(srso_alias_untrain_ret)
+
+ .section .text..__x86.rethunk_safe
+#else
+/* dummy definition for alternatives */
+SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+SYM_FUNC_END(srso_alias_untrain_ret)
#endif
-/* Needs a definition for the __x86_return_thunk alternative below. */
-SYM_START(srso_safe_ret_alias, SYM_L_GLOBAL, SYM_A_NONE)
-#ifdef CONFIG_CPU_SRSO
- add $8, %_ASM_SP
+SYM_START(srso_alias_safe_ret, SYM_L_GLOBAL, SYM_A_NONE)
+ lea 8(%_ASM_SP), %_ASM_SP
UNWIND_HINT_FUNC
-#endif
ANNOTATE_UNRET_SAFE
ret
int3
-SYM_FUNC_END(srso_safe_ret_alias)
+SYM_FUNC_END(srso_alias_safe_ret)
- .section .text.__x86.return_thunk
+ .section .text..__x86.return_thunk
+
+SYM_CODE_START(srso_alias_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ call srso_alias_safe_ret
+ ud2
+SYM_CODE_END(srso_alias_return_thunk)
+
+/*
+ * Some generic notes on the untraining sequences:
+ *
+ * They are interchangeable when it comes to flushing potentially wrong
+ * RET predictions from the BTB.
+ *
+ * The SRSO Zen1/2 (MOVABS) untraining sequence is longer than the
+ * Retbleed sequence because the return sequence done there
+ * (srso_safe_ret()) is longer and the return sequence must fully nest
+ * (end before) the untraining sequence. Therefore, the untraining
+ * sequence must fully overlap the return sequence.
+ *
+ * Regarding alignment - the instructions which need to be untrained
+ * must all start at a cacheline boundary for Zen1/2 generations. That
+ * is, instruction sequences starting at srso_safe_ret() and
+ * the respective instruction sequences at retbleed_return_thunk()
+ * must start at a cacheline boundary.
+ */
/*
* Safety details here pertain to the AMD Zen{1,2} microarchitecture:
- * 1) The RET at __x86_return_thunk must be on a 64 byte boundary, for
+ * 1) The RET at retbleed_return_thunk must be on a 64 byte boundary, for
* alignment within the BTB.
- * 2) The instruction at zen_untrain_ret must contain, and not
+ * 2) The instruction at retbleed_untrain_ret must contain, and not
* end with, the 0xc3 byte of the RET.
* 3) STIBP must be enabled, or SMT disabled, to prevent the sibling thread
* from re-poisoning the BTB prediction.
*/
.align 64
- .skip 64 - (__ret - zen_untrain_ret), 0xcc
-SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
+ .skip 64 - (retbleed_return_thunk - retbleed_untrain_ret), 0xcc
+SYM_START(retbleed_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
ANNOTATE_NOENDBR
/*
- * As executed from zen_untrain_ret, this is:
+ * As executed from retbleed_untrain_ret, this is:
*
* TEST $0xcc, %bl
* LFENCE
- * JMP __x86_return_thunk
+ * JMP retbleed_return_thunk
*
* Executing the TEST instruction has a side effect of evicting any BTB
* prediction (potentially attacker controlled) attached to the RET, as
- * __x86_return_thunk + 1 isn't an instruction boundary at the moment.
+ * retbleed_return_thunk + 1 isn't an instruction boundary at the moment.
*/
.byte 0xf6
/*
- * As executed from __x86_return_thunk, this is a plain RET.
+ * As executed from retbleed_return_thunk, this is a plain RET.
*
* As part of the TEST above, RET is the ModRM byte, and INT3 the imm8.
*
@@ -213,13 +244,13 @@ SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
* With SMT enabled and STIBP active, a sibling thread cannot poison
* RET's prediction to a type of its choice, but can evict the
* prediction due to competitive sharing. If the prediction is
- * evicted, __x86_return_thunk will suffer Straight Line Speculation
+ * evicted, retbleed_return_thunk will suffer Straight Line Speculation
* which will be contained safely by the INT3.
*/
-SYM_INNER_LABEL(__ret, SYM_L_GLOBAL)
+SYM_INNER_LABEL(retbleed_return_thunk, SYM_L_GLOBAL)
ret
int3
-SYM_CODE_END(__ret)
+SYM_CODE_END(retbleed_return_thunk)
/*
* Ensure the TEST decoding / BTB invalidation is complete.
@@ -230,16 +261,16 @@ SYM_CODE_END(__ret)
* Jump back and execute the RET in the middle of the TEST instruction.
* INT3 is for SLS protection.
*/
- jmp __ret
+ jmp retbleed_return_thunk
int3
-SYM_FUNC_END(zen_untrain_ret)
-__EXPORT_THUNK(zen_untrain_ret)
+SYM_FUNC_END(retbleed_untrain_ret)
+__EXPORT_THUNK(retbleed_untrain_ret)
/*
- * SRSO untraining sequence for Zen1/2, similar to zen_untrain_ret()
+ * SRSO untraining sequence for Zen1/2, similar to retbleed_untrain_ret()
* above. On kernel entry, srso_untrain_ret() is executed which is a
*
- * movabs $0xccccccc308c48348,%rax
+ * movabs $0xccccc30824648d48,%rax
*
* and when the return thunk executes the inner label srso_safe_ret()
* later, it is a stack manipulation and a RET which is mispredicted and
@@ -251,22 +282,44 @@ SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
ANNOTATE_NOENDBR
.byte 0x48, 0xb8
+/*
+ * This forces the function return instruction to speculate into a trap
+ * (UD2 in srso_return_thunk() below). This RET will then mispredict
+ * and execution will continue at the return site read from the top of
+ * the stack.
+ */
SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL)
- add $8, %_ASM_SP
+ lea 8(%_ASM_SP), %_ASM_SP
ret
int3
int3
- int3
+ /* end of movabs */
lfence
call srso_safe_ret
- int3
+ ud2
SYM_CODE_END(srso_safe_ret)
SYM_FUNC_END(srso_untrain_ret)
__EXPORT_THUNK(srso_untrain_ret)
-SYM_FUNC_START(__x86_return_thunk)
- ALTERNATIVE_2 "jmp __ret", "call srso_safe_ret", X86_FEATURE_SRSO, \
- "call srso_safe_ret_alias", X86_FEATURE_SRSO_ALIAS
+SYM_CODE_START(srso_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ call srso_safe_ret
+ ud2
+SYM_CODE_END(srso_return_thunk)
+
+SYM_FUNC_START(entry_untrain_ret)
+ ALTERNATIVE_2 "jmp retbleed_untrain_ret", \
+ "jmp srso_untrain_ret", X86_FEATURE_SRSO, \
+ "jmp srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS
+SYM_FUNC_END(entry_untrain_ret)
+__EXPORT_THUNK(entry_untrain_ret)
+
+SYM_CODE_START(__x86_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ ANNOTATE_UNRET_SAFE
+ ret
int3
SYM_CODE_END(__x86_return_thunk)
EXPORT_SYMBOL(__x86_return_thunk)
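
For reference, the corrected immediate in the comment above, 0xccccc30824648d48, is just the little-endian reading of the eight bytes that follow the 0x48 0xb8 (movabs) opcode: 48 8d 64 24 08 is lea 8(%rsp),%rsp, c3 is ret, and the two trailing cc bytes are the int3s; the old value 0xccccccc308c48348 encoded the earlier add $8,%rsp sequence with three int3s. A tiny stand-alone check of that byte/immediate correspondence (assumes a little-endian host):

#include <stdio.h>
#include <string.h>

int main(void)
{
        /* lea 8(%rsp),%rsp; ret; int3; int3 -- the bytes the movabs hides. */
        unsigned char seq[8] = { 0x48, 0x8d, 0x64, 0x24, 0x08, 0xc3, 0xcc, 0xcc };
        unsigned long long imm;

        memcpy(&imm, seq, sizeof(imm));                 /* little-endian view */
        printf("movabs immediate: 0x%llx\n", imm);      /* 0xccccc30824648d48 */
        return 0;
}
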