From 1da5c9bc119d3a749b519596b93f9b2667e93c4a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 23 Jun 2023 14:14:04 +0300 Subject: x86: Introduce ia32_enabled() IA32 support on 64bit kernels depends on whether CONFIG_IA32_EMULATION is selected or not. As it is a compile time option it doesn't provide the flexibility to have distributions set their own policy for IA32 support and give the user the flexibility to override it. As a first step introduce ia32_enabled() which abstracts whether IA32 compat is turned on or off. Upcoming patches will implement the ability to set IA32 compat state at boot time. Signed-off-by: Nikolay Borisov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230623111409.3047467-2-nik.borisov@suse.com --- arch/x86/entry/common.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 6c2826417b33..cfbd3aec3ddc 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -96,6 +96,10 @@ static __always_inline int syscall_32_enter(struct pt_regs *regs) return (int)regs->orig_ax; } +#ifdef CONFIG_IA32_EMULATION +bool __ia32_enabled __ro_after_init = true; +#endif + /* * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. */ -- cgit v1.2.3 From f71e1d2ff8e6a183bd4004bc97c453ba527b7dc6 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 23 Jun 2023 14:14:05 +0300 Subject: x86/entry: Rename ignore_sysret() The SYSCALL instruction cannot really be disabled in compatibility mode. The best that can be done is to configure the CSTAR msr to point to a minimal handler. Currently this handler has a rather misleading name - ignore_sysret() as it's not really doing anything with sysret. Give it a more descriptive name. Signed-off-by: Nikolay Borisov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230623111409.3047467-3-nik.borisov@suse.com --- arch/x86/entry/entry_64.S | 4 ++-- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/cpu/common.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 43606de22511..3be71accb7a0 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1516,12 +1516,12 @@ SYM_CODE_END(asm_exc_nmi) * This handles SYSCALL from 32-bit code. There is no way to program * MSRs to fully disable 32-bit SYSCALL. */ -SYM_CODE_START(ignore_sysret) +SYM_CODE_START(entry_SYSCALL32_ignore) UNWIND_HINT_END_OF_STACK ENDBR mov $-ENOSYS, %eax sysretl -SYM_CODE_END(ignore_sysret) +SYM_CODE_END(entry_SYSCALL32_ignore) #endif .pushsection .text, "ax" diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0086920cda06..93cd28d6f317 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -399,7 +399,7 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -extern asmlinkage void ignore_sysret(void); +extern asmlinkage void entry_SYSCALL32_ignore(void); /* Save actual FS/GS selectors and bases to current->thread */ void current_save_fsgs(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 382d4e6b848d..b3f8cba21353 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2087,7 +2087,7 @@ void syscall_init(void) (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else - wrmsrl_cstar((unsigned long)ignore_sysret); + wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); -- cgit v1.2.3 From 370dcd58548a360bbf8a65b89b410d09f56bf0c6 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 23 Jun 2023 14:14:06 +0300 Subject: x86/entry: Compile entry_SYSCALL32_ignore() unconditionally To limit the IA32 exposure on 64bit kernels while keeping the flexibility for the user to enable it when required, the compile time enable/disable via CONFIG_IA32_EMULATION is not good enough and will be complemented with a kernel command line option. Right now entry_SYSCALL32_ignore() is only compiled when CONFIG_IA32_EMULATION=n, but boot-time enable- / disablement obviously requires it to be unconditionally available. Remove the #ifndef CONFIG_IA32_EMULATION guard. Signed-off-by: Nikolay Borisov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230623111409.3047467-4-nik.borisov@suse.com --- arch/x86/entry/entry_64.S | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3be71accb7a0..f71664dea1a3 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1511,7 +1511,6 @@ nmi_restore: iretq SYM_CODE_END(asm_exc_nmi) -#ifndef CONFIG_IA32_EMULATION /* * This handles SYSCALL from 32-bit code. There is no way to program * MSRs to fully disable 32-bit SYSCALL. @@ -1522,7 +1521,6 @@ SYM_CODE_START(entry_SYSCALL32_ignore) mov $-ENOSYS, %eax sysretl SYM_CODE_END(entry_SYSCALL32_ignore) -#endif .pushsection .text, "ax" __FUNC_ALIGN -- cgit v1.2.3 From a11e097504ac1889b35b6858f495565838325f88 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 23 Jun 2023 14:14:09 +0300 Subject: x86: Make IA32_EMULATION boot time configurable Distributions would like to reduce their attack surface as much as possible but at the same time they'd want to retain flexibility to cater to a variety of legacy software. This stems from the conjecture that compat layer is likely rarely tested and could have latent security bugs. Ideally distributions will set their default policy and also give users the ability to override it as appropriate. To enable this use case, introduce CONFIG_IA32_EMULATION_DEFAULT_DISABLED compile time option, which controls whether 32bit processes/syscalls should be allowed or not. This option is aimed mainly at distributions to set their preferred default behavior in their kernels. To allow users to override the distro's policy, introduce the 'ia32_emulation' parameter which allows overriding CONFIG_IA32_EMULATION_DEFAULT_DISABLED state at boot time. Signed-off-by: Nikolay Borisov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230623111409.3047467-7-nik.borisov@suse.com --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ arch/x86/Kconfig | 9 +++++++++ arch/x86/entry/common.c | 9 ++++++++- 3 files changed, 23 insertions(+), 1 deletion(-) (limited to 'arch/x86/entry') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0a1731a0f0ef..45e34be4ed56 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1893,6 +1893,12 @@ 0 -- machine default 1 -- force brightness inversion + ia32_emulation= [X86-64] + Format: + When true, allows loading 32-bit programs and executing 32-bit + syscalls, essentially overriding IA32_EMULATION_DEFAULT_DISABLED at + boot time. When false, unconditionally disables IA32 emulation. + icn= [HW,ISDN] Format: [,[,[,]]] diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 982b777eadc7..c130bf3176fe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2954,6 +2954,15 @@ config IA32_EMULATION 64-bit kernel. You should likely turn this on, unless you're 100% sure that you don't have any 32-bit programs left. +config IA32_EMULATION_DEFAULT_DISABLED + bool "IA32 emulation disabled by default" + default n + depends on IA32_EMULATION + help + Make IA32 emulation disabled by default. This prevents loading 32-bit + processes and access to 32-bit syscalls. If unsure, leave it to its + default value. + config X86_X32_ABI bool "x32 ABI for 64-bit mode" depends on X86_64 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index cfbd3aec3ddc..a34e1a1adcf8 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef CONFIG_XEN_PV #include @@ -97,7 +98,13 @@ static __always_inline int syscall_32_enter(struct pt_regs *regs) } #ifdef CONFIG_IA32_EMULATION -bool __ia32_enabled __ro_after_init = true; +bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED); + +static int ia32_emulation_override_cmdline(char *arg) +{ + return kstrtobool(arg, &__ia32_enabled); +} +early_param("ia32_emulation", ia32_emulation_override_cmdline); #endif /* -- cgit v1.2.3 From eb43c9b1517b48e2ff0d3a584aca197338987d7b Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 21 Jul 2023 12:10:12 -0400 Subject: x86/entry/64: Remove obsolete comment on tracing vs. SYSRET This comment comes from a time when the kernel attempted to use SYSRET on all returns to userspace, including interrupts and exceptions. Ever since commit fffbb5dc ("Move opportunistic sysret code to syscall code path"), SYSRET is only used for returning from system calls. The specific tracing issue listed in this comment is not possible anymore. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Josh Poimboeuf Link: https://lore.kernel.org/r/20230721161018.50214-2-brgerst@gmail.com --- arch/x86/entry/entry_64.S | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f71664dea1a3..7574639cb0c4 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -166,22 +166,9 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) jne swapgs_restore_regs_and_return_to_usermode /* - * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot - * restore RF properly. If the slowpath sets it for whatever reason, we - * need to restore it correctly. - * - * SYSRET can restore TF, but unlike IRET, restoring TF results in a - * trap from userspace immediately after SYSRET. This would cause an - * infinite loop whenever #DB happens with register state that satisfies - * the opportunistic SYSRET conditions. For example, single-stepping - * this user code: - * - * movq $stuck_here, %rcx - * pushfq - * popq %r11 - * stuck_here: - * - * would never get past 'stuck_here'. + * SYSRET cannot restore RF. It can restore TF, but unlike IRET, + * restoring TF results in a trap from userspace immediately after + * SYSRET. */ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 jnz swapgs_restore_regs_and_return_to_usermode -- cgit v1.2.3 From eec62f61e134d6711f98d4005c6439f24d03d54f Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 21 Jul 2023 12:10:14 -0400 Subject: x86/entry/compat: Combine return value test from syscall handler Move the sysret32_from_system_call label to remove a duplicate test of the return value from the syscall handler. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230721161018.50214-4-brgerst@gmail.com --- arch/x86/entry/entry_64_compat.S | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 70150298f8bd..b16272395f1a 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -118,9 +118,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) movq %rsp, %rdi call do_SYSENTER_32 - /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ - "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV jmp sysret32_from_system_call .Lsysenter_fix_flags: @@ -212,13 +209,15 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) movq %rsp, %rdi call do_fast_syscall_32 + +sysret32_from_system_call: /* XEN PV guests always use IRET path */ ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV - /* Opportunistic SYSRET */ -sysret32_from_system_call: /* + * Opportunistic SYSRET + * * We are not going to return to userspace from the trampoline * stack. So let's erase the thread stack right now. */ -- cgit v1.2.3 From 0d3109ad2e6125add5b3c88e8de3fb7bfd4e8c49 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 21 Jul 2023 12:10:15 -0400 Subject: x86/entry/32: Convert do_fast_syscall_32() to bool return type Doesn't have to be 'long' - this simplifies the code a bit. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230721161018.50214-5-brgerst@gmail.com --- arch/x86/entry/common.c | 10 +++++----- arch/x86/entry/entry_32.S | 2 +- arch/x86/entry/entry_64_compat.S | 2 +- arch/x86/include/asm/syscall.h | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 328990dd8632..f1f3bf150b57 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -193,8 +193,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) return true; } -/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ -__visible noinstr long do_fast_syscall_32(struct pt_regs *regs) +/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ +__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs) { /* * Called using the internal vDSO SYSENTER/SYSCALL32 calling @@ -212,7 +212,7 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) /* Invoke the syscall. If it failed, keep it simple: use IRET. */ if (!__do_fast_syscall_32(regs)) - return 0; + return false; #ifdef CONFIG_X86_64 /* @@ -245,8 +245,8 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) #endif } -/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ -__visible noinstr long do_SYSENTER_32(struct pt_regs *regs) +/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ +__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs) { /* SYSENTER loses RSP, but the vDSO saved it in RBP. */ regs->sp = regs->bp; diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 6e6af42e044a..c73047bf9f4b 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -837,7 +837,7 @@ SYM_FUNC_START(entry_SYSENTER_32) movl %esp, %eax call do_SYSENTER_32 - testl %eax, %eax + testb %al, %al jz .Lsyscall_32_done STACKLEAK_ERASE diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index b16272395f1a..27c05d08558a 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -212,7 +212,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) sysret32_from_system_call: /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ + ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV /* diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 4fb36fba4b5a..c7e25c940f1a 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -131,7 +131,7 @@ void do_syscall_64(struct pt_regs *regs, int nr); #endif /* CONFIG_X86_32 */ void do_int80_syscall_32(struct pt_regs *regs); -long do_fast_syscall_32(struct pt_regs *regs); -long do_SYSENTER_32(struct pt_regs *regs); +bool do_fast_syscall_32(struct pt_regs *regs); +bool do_SYSENTER_32(struct pt_regs *regs); #endif /* _ASM_X86_SYSCALL_H */ -- cgit v1.2.3 From bab9fa6dc5e4483749838877deebe038de3ce97e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 21 Jul 2023 12:10:16 -0400 Subject: x86/entry/32: Remove SEP test for SYSEXIT SEP must be already be present in order for do_fast_syscall_32() to be called on native 32-bit, so checking it again is unnecessary. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230721161018.50214-6-brgerst@gmail.com --- arch/x86/entry/common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index f1f3bf150b57..0551bcb197fb 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -238,8 +238,7 @@ __visible noinstr bool do_fast_syscall_32(struct pt_regs *regs) * We don't allow syscalls at all from VM86 mode, but we still * need to check VM, because we might be returning from sys_vm86. */ - return static_cpu_has(X86_FEATURE_SEP) && - regs->cs == __USER_CS && regs->ss == __USER_DS && + return regs->cs == __USER_CS && regs->ss == __USER_DS && regs->ip == landing_pad && (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; #endif -- cgit v1.2.3 From ca282b486a570a0bfda5c1a4595ace7fa14243bf Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 11 Oct 2023 18:43:49 -0400 Subject: x86/entry/64: Convert SYSRET validation tests to C No change in functionality expected. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Josh Poimboeuf Cc: Uros Bizjak Link: https://lore.kernel.org/r/20231011224351.130935-2-brgerst@gmail.com --- arch/x86/entry/common.c | 43 +++++++++++++++++++++++++++++++++- arch/x86/entry/entry_64.S | 53 ++---------------------------------------- arch/x86/include/asm/syscall.h | 2 +- 3 files changed, 45 insertions(+), 53 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 0551bcb197fb..90214652ff06 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -71,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) return false; } -__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) +/* Returns true to return using SYSRET, or false to use IRET */ +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) { add_random_kstack_offset(); nr = syscall_enter_from_user_mode(regs, nr); @@ -85,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) instrumentation_end(); syscall_exit_to_user_mode(regs); + + /* + * Check that the register state is valid for using SYSRET to exit + * to userspace. Otherwise use the slower but fully capable IRET + * exit path. + */ + + /* XEN PV guests always use the IRET path */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* SYSRET requires RCX == RIP and R11 == EFLAGS */ + if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) + return false; + + /* CS and SS must match the values set in MSR_STAR */ + if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) + return false; + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. + * + * Change top bits to match the most significant bit (47th or 56th bit + * depending on paging mode) in the address. + */ + if (unlikely(!__is_canonical_address(regs->ip, __VIRTUAL_MASK_SHIFT + 1))) + return false; + + /* + * SYSRET cannot restore RF. It can restore TF, but unlike IRET, + * restoring TF results in a trap from userspace immediately after + * SYSRET. + */ + if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF))) + return false; + + /* Use SYSRET to exit to userspace */ + return true; } #endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 7574639cb0c4..173064004b5c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -126,57 +126,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) * In the Xen PV case we must use iret anyway. */ - ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \ - X86_FEATURE_XENPV - - movq RCX(%rsp), %rcx - movq RIP(%rsp), %r11 - - cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ - jne swapgs_restore_regs_and_return_to_usermode - - /* - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. - * - * If width of "canonical tail" ever becomes variable, this will need - * to be updated to remain correct on both old and new CPUs. - * - * Change top bits to match most significant bit (47th or 56th bit - * depending on paging mode) in the address. - */ -#ifdef CONFIG_X86_5LEVEL - ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \ - "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57 -#else - shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx - sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx -#endif - - /* If this changed %rcx, it was not canonical */ - cmpq %rcx, %r11 - jne swapgs_restore_regs_and_return_to_usermode - - cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ - jne swapgs_restore_regs_and_return_to_usermode - - movq R11(%rsp), %r11 - cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ - jne swapgs_restore_regs_and_return_to_usermode - - /* - * SYSRET cannot restore RF. It can restore TF, but unlike IRET, - * restoring TF results in a trap from userspace immediately after - * SYSRET. - */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz swapgs_restore_regs_and_return_to_usermode - - /* nothing to check for RSP */ - - cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ - jne swapgs_restore_regs_and_return_to_usermode + ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \ + "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV /* * We win! This label is here just for ease of understanding diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index c7e25c940f1a..f44e2f9ab65d 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -126,7 +126,7 @@ static inline int syscall_get_arch(struct task_struct *task) ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } -void do_syscall_64(struct pt_regs *regs, int nr); +bool do_syscall_64(struct pt_regs *regs, int nr); #endif /* CONFIG_X86_32 */ -- cgit v1.2.3 From 58978b44df7276f7c75a2c6aad6c201421cd4daa Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 11 Oct 2023 18:43:50 -0400 Subject: x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test Using shifts to determine if an address is canonical is difficult for the compiler to optimize when the virtual address width is variable (LA57 feature) without using inline assembly. Instead, compare RIP against TASK_SIZE_MAX. The only user executable address outside of that range is the deprecated vsyscall page, which can fall back to using IRET. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Josh Poimboeuf Cc: Uros Bizjak Link: https://lore.kernel.org/r/20231011224351.130935-3-brgerst@gmail.com --- arch/x86/entry/common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 90214652ff06..4c7154d097c1 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -110,10 +110,10 @@ __visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) * in kernel space. This essentially lets the user take over * the kernel, since userspace controls RSP. * - * Change top bits to match the most significant bit (47th or 56th bit - * depending on paging mode) in the address. + * TASK_SIZE_MAX covers all user-accessible addresses other than + * the deprecated vsyscall page. */ - if (unlikely(!__is_canonical_address(regs->ip, __VIRTUAL_MASK_SHIFT + 1))) + if (unlikely(regs->ip >= TASK_SIZE_MAX)) return false; /* -- cgit v1.2.3 From 1a09a27153f91cd7676b2d4ca574577572a8c999 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 11 Oct 2023 18:43:51 -0400 Subject: x86/entry/32: Clean up syscall fast exit tests Merge compat and native code and clarify comments. No change in functionality expected. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Josh Poimboeuf Cc: Uros Bizjak Link: https://lore.kernel.org/r/20231011224351.130935-4-brgerst@gmail.com --- arch/x86/entry/common.c | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 4c7154d097c1..d813160b14d8 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -255,34 +255,30 @@ __visible noinstr bool do_fast_syscall_32(struct pt_regs *regs) if (!__do_fast_syscall_32(regs)) return false; -#ifdef CONFIG_X86_64 /* - * Opportunistic SYSRETL: if possible, try to return using SYSRETL. - * SYSRETL is available on all 64-bit CPUs, so we don't need to - * bother with SYSEXIT. - * - * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, - * because the ECX fixup above will ensure that this is essentially - * never the case. + * Check that the register state is valid for using SYSRETL/SYSEXIT + * to exit to userspace. Otherwise use the slower but fully capable + * IRET exit path. */ - return regs->cs == __USER32_CS && regs->ss == __USER_DS && - regs->ip == landing_pad && - (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0; -#else - /* - * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT. - * - * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, - * because the ECX fixup above will ensure that this is essentially - * never the case. - * - * We don't allow syscalls at all from VM86 mode, but we still - * need to check VM, because we might be returning from sys_vm86. - */ - return regs->cs == __USER_CS && regs->ss == __USER_DS && - regs->ip == landing_pad && - (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; -#endif + + /* XEN PV guests always use the IRET path */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* EIP must point to the VDSO landing pad */ + if (unlikely(regs->ip != landing_pad)) + return false; + + /* CS and SS must match the values set in MSR_STAR */ + if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS)) + return false; + + /* If the TF, RF, or VM flags are set, use IRET */ + if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM))) + return false; + + /* Use SYSRETL/SYSEXIT to exit to userspace */ + return true; } /* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ -- cgit v1.2.3