From 5a7a64779e7ad21f71824faee234b33654df57bb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 19 Nov 2022 00:37:47 +0000 Subject: KVM: VMX: Access @flags as a 32-bit value in __vmx_vcpu_run() Access @flags using 32-bit operands when saving and testing @flags for VMX_RUN_VMRESUME, as using 8-bit operands is unnecessarily fragile due to relying on VMX_RUN_VMRESUME being in bits 0-7. The behavior of treating @flags as a single byte is a holdover from when the param was "bool launched", i.e. it is not deliberate. Cc: Alexey Dobriyan Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20221119003747.2615229-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmenter.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 766c6b3ef5ed..cd2f75360bf3 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -69,8 +69,8 @@ SYM_FUNC_START(__vmx_vcpu_run) */ push %_ASM_ARG2 - /* Copy @flags to BL, _ASM_ARG3 is volatile. */ - mov %_ASM_ARG3B, %bl + /* Copy @flags to EBX, _ASM_ARG3 is volatile. */ + mov %_ASM_ARG3L, %ebx lea (%_ASM_SP), %_ASM_ARG2 call vmx_update_host_rsp @@ -106,7 +106,7 @@ SYM_FUNC_START(__vmx_vcpu_run) mov (%_ASM_SP), %_ASM_AX /* Check if vmlaunch or vmresume is needed */ - testb $VMX_RUN_VMRESUME, %bl + test $VMX_RUN_VMRESUME, %ebx /* Load guest registers. Don't clobber flags. */ mov VCPU_RCX(%_ASM_AX), %_ASM_CX @@ -128,7 +128,7 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX - /* Check EFLAGS.ZF from 'testb' above */ + /* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */ jz .Lvmlaunch /* -- cgit v1.2.3 From e8733482f59e23a8bf79526b7f83ed0d0ff5a9b2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 18 Nov 2022 20:05:21 +0300 Subject: KVM: VMX: don't use "unsigned long" in vmx_vcpu_enter_exit() __vmx_vcpu_run_flags() returns "unsigned int" and uses only 2 bits of it, so using "unsigned long" is very much pointless. Furthermore, __vmx_vcpu_run() and vmx_spec_ctrl_restore_host() take an "unsigned int", i.e. actually relying on an "unsigned long" value won't work. Signed-off-by: Alexey Dobriyan Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/Y3e7UW0WNV2AZmsZ@p183 Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c788aa382611..506c324d46b1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7172,7 +7172,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx, - unsigned long flags) + unsigned int flags) { guest_state_enter_irqoff(); -- cgit v1.2.3 From fc9465be8aad2042978590d44c01350534c1ac11 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Dec 2022 06:09:06 +0000 Subject: KVM: x86: Make vmx_get_exit_qual() and vmx_get_intr_info() noinstr-friendly Add an extra special noinstr-friendly helper to test+mark a "register" available and use it when caching vmcs.EXIT_QUALIFICATION and vmcs.VM_EXIT_INTR_INFO. Make the caching helpers __always_inline too so that they can be used in noinstr functions. A future fix will move VMX's handling of NMI exits into the noinstr vmx_vcpu_enter_exit() so that the NMI is processed before any kind of instrumentation can trigger a fault and thus IRET, i.e. 
so that KVM doesn't invoke the NMI handler with NMIs enabled. Cc: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221213060912.654668-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/kvm_cache_regs.h | 12 ++++++++++++ arch/x86/kvm/vmx/vmx.h | 14 ++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index c09174f73a34..4c91f626c058 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -75,6 +75,18 @@ static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); } +/* + * kvm_register_test_and_mark_available() is a special snowflake that uses an + * arch bitop directly to avoid the explicit instrumentation that comes with + * the generic bitops. This allows code that cannot be instrumented (noinstr + * functions), e.g. the low level VM-Enter/VM-Exit paths, to cache registers. + */ +static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + return arch___test_and_set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + /* * The "raw" register helpers are only for cases where the full 64 bits of a * register are read/written irrespective of current vCPU mode. In other words, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a3da84f4ea45..bb720a2f11ab 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -669,25 +669,23 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu); -static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) +static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) { - kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - } + return vmx->exit_qualification; } -static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) +static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) { - kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - } + return vmx->exit_intr_info; } -- cgit v1.2.3 From 8578f59657c505982e1d05232272c6bf304cf8aa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Dec 2022 06:09:07 +0000 Subject: KVM: VMX: Allow VM-Fail path of VMREAD helper to be instrumented Allow instrumentation in the VM-Fail path of __vmcs_readl() so that the helper can be used in noinstr functions, e.g. to get the exit reason in vmx_vcpu_enter_exit() in order to handle NMI VM-Exits in the noinstr section. While allowing instrumentation isn't technically safe, KVM has much bigger problems if VMREAD fails in a noinstr section. Note, all other VMX instructions also allow instrumentation in their VM-Fail paths for similar reasons; VMREAD was simply omitted by commit 3ebccdf373c2 ("x86/kvm/vmx: Move guest enter/exit into .noinstr.text") because VMREAD wasn't used in a noinstr section at the time. 
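For reference, the resulting shape of the helper is roughly the sketch below (illustrative only; read_field_from_hw() is a made-up stand-in for the real VMREAD asm goto, not a kernel symbol):

	static __always_inline unsigned long vmread_sketch(unsigned long field)
	{
		unsigned long value;

		/* Hot path: must stay instrumentation-free for noinstr callers. */
		if (likely(read_field_from_hw(field, &value)))
			return value;

		/*
		 * VM-Fail path: explicitly re-allow instrumentation so that the
		 * WARN/printk machinery can be used. Tolerable only because this
		 * path should never be hit at runtime.
		 */
		instrumentation_begin();
		WARN_ONCE(1, "vmread failed: field=%lx\n", field);
		instrumentation_end();
		return 0;
	}
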
Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221213060912.654668-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx_ops.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index a5282014616c..db95bde52998 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -100,8 +100,10 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) return value; do_fail: + instrumentation_begin(); WARN_ONCE(1, KBUILD_MODNAME ": vmread failed: field=%lx\n", field); pr_warn_ratelimited(KBUILD_MODNAME ": vmread failed: field=%lx\n", field); + instrumentation_end(); return 0; do_exception: -- cgit v1.2.3 From 11633f69506d038120925691626f2851203af241 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Dec 2022 06:09:08 +0000 Subject: KVM: VMX: Always inline eVMCS read/write helpers Tag all evmcs_{read,write}() helpers __always_inline so that they can be freely used in noinstr sections, e.g. to get the VM-Exit reason in vmx_vcpu_enter_exit() (in a future patch). For consistency and to avoid more spot fixes in the future, e.g. see commit 010050a86393 ("x86/kvm: Always inline evmcs_write64()"), tag all accessors even though evmcs_read32() is the only anticipated use case in the near future. In practice, non-KASAN builds are all but guaranteed to inline the helpers anyway. vmlinux.o: warning: objtool: vmx_vcpu_enter_exit+0x107: call to evmcs_read32() leaves .noinstr.text section Reported-by: kernel test robot Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221213060912.654668-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/hyperv.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index ab08a9b9ab7d..caf658726169 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -196,7 +196,7 @@ static __always_inline void evmcs_write64(unsigned long field, u64 value) current_evmcs->hv_clean_fields &= ~clean_field; } -static inline void evmcs_write32(unsigned long field, u32 value) +static __always_inline void evmcs_write32(unsigned long field, u32 value) { u16 clean_field; int offset = get_evmcs_offset(field, &clean_field); @@ -208,7 +208,7 @@ static inline void evmcs_write32(unsigned long field, u32 value) current_evmcs->hv_clean_fields &= ~clean_field; } -static inline void evmcs_write16(unsigned long field, u16 value) +static __always_inline void evmcs_write16(unsigned long field, u16 value) { u16 clean_field; int offset = get_evmcs_offset(field, &clean_field); @@ -220,7 +220,7 @@ static inline void evmcs_write16(unsigned long field, u16 value) current_evmcs->hv_clean_fields &= ~clean_field; } -static inline u64 evmcs_read64(unsigned long field) +static __always_inline u64 evmcs_read64(unsigned long field) { int offset = get_evmcs_offset(field, NULL); @@ -230,7 +230,7 @@ static inline u64 evmcs_read64(unsigned long field) return *(u64 *)((char *)current_evmcs + offset); } -static inline u32 evmcs_read32(unsigned long field) +static __always_inline u32 evmcs_read32(unsigned long field) { int offset = get_evmcs_offset(field, NULL); @@ -240,7 +240,7 @@ static inline u32 evmcs_read32(unsigned long field) return *(u32 *)((char *)current_evmcs + offset); } -static inline u16 evmcs_read16(unsigned long field) +static __always_inline u16 evmcs_read16(unsigned long field) { int offset = get_evmcs_offset(field, NULL); @@ 
-274,11 +274,11 @@ static inline void evmcs_load(u64 phys_addr) void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ static __always_inline void evmcs_write64(unsigned long field, u64 value) {} -static inline void evmcs_write32(unsigned long field, u32 value) {} -static inline void evmcs_write16(unsigned long field, u16 value) {} -static inline u64 evmcs_read64(unsigned long field) { return 0; } -static inline u32 evmcs_read32(unsigned long field) { return 0; } -static inline u16 evmcs_read16(unsigned long field) { return 0; } +static __always_inline void evmcs_write32(unsigned long field, u32 value) {} +static __always_inline void evmcs_write16(unsigned long field, u16 value) {} +static __always_inline u64 evmcs_read64(unsigned long field) { return 0; } +static __always_inline u32 evmcs_read32(unsigned long field) { return 0; } +static __always_inline u16 evmcs_read16(unsigned long field) { return 0; } static inline void evmcs_load(u64 phys_addr) {} static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ -- cgit v1.2.3 From 432727f1cb6e35cb3416d5aeea3dd281e460cbab Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Dec 2022 06:09:09 +0000 Subject: KVM: VMX: Always inline to_vmx() and to_kvm_vmx() Tag to_vmx() and to_kvm_vmx() __always_inline as they both just reflect the passed in pointer (the embedded struct is the first field in the container), and drop the @vmx param from vmx_vcpu_enter_exit(), which likely existed purely to make noinstr validation happy. Amusingly, when the compiler decides to not inline the helpers, e.g. for KASAN builds, to_vmx() and to_kvm_vmx() may end up pointing at the same symbol, which generates very confusing objtool warnings. E.g. the use of to_vmx() in a future patch led to objtool complaining about to_kvm_vmx(), and only once all use of to_kvm_vmx() was commented out did to_vmx() pop up in the objtool report. vmlinux.o: warning: objtool: vmx_vcpu_enter_exit+0x160: call to to_kvm_vmx() leaves .noinstr.text section Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221213060912.654668-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 5 +++-- arch/x86/kvm/vmx/vmx.h | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 506c324d46b1..59a051350f8f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7171,9 +7171,10 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) } static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, - struct vcpu_vmx *vmx, unsigned int flags) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + guest_state_enter_irqoff(); /* L1D Flush includes CPU buffer clear to mitigate MDS */ @@ -7291,7 +7292,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_wait_lapic_expire(vcpu); /* The actual VMENTER/EXIT is in the .noinstr.text section. 
*/ - vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx)); + vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); /* All fields are clean at this point */ if (static_branch_unlikely(&enable_evmcs)) { diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index bb720a2f11ab..2acdc54bc34b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -640,12 +640,12 @@ BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) (1 << VCPU_EXREG_EXIT_INFO_1) | \ (1 << VCPU_EXREG_EXIT_INFO_2)) -static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) +static __always_inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { return container_of(kvm, struct kvm_vmx, kvm); } -static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) +static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_vmx, vcpu); } -- cgit v1.2.3 From 54a3b70a75dcde8173e3d8ccc60f9ecd7af7b5f2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Dec 2022 06:09:10 +0000 Subject: x86/entry: KVM: Use dedicated VMX NMI entry for 32-bit kernels too Use a dedicated entry for invoking the NMI handler from KVM VMX's VM-Exit path for 32-bit even though using a dedicated entry for 32-bit isn't strictly necessary. Exposing a single symbol will allow KVM to reference the entry point in assembly code without having to resort to more #ifdefs (or #defines). idtentry.h is intended to be included from asm files only once, and so simply including idtentry.h in KVM assembly isn't an option. Bypassing the ESP fixup and CR3 switching in the standard NMI entry code is safe as KVM always handles NMIs that occur in the guest on a kernel stack, with a kernel CR3. Cc: Andy Lutomirski Cc: Thomas Gleixner Reviewed-by: Lai Jiangshan Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221213060912.654668-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/idtentry.h | 16 ++++++---------- arch/x86/kernel/nmi.c | 8 ++++---- arch/x86/kvm/vmx/vmx.c | 4 ++-- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 72184b0b2219..b241af4ce9b4 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -582,18 +582,14 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check); /* NMI */ -#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) +#if IS_ENABLED(CONFIG_KVM_INTEL) /* - * Special NOIST entry point for VMX which invokes this on the kernel - * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI - * 'executing' marker. - * - * On 32bit this just uses the regular NMI entry point because 32-bit does - * not have ISTs. + * Special entry point for VMX which invokes this on the kernel stack, even for + * 64-bit, i.e. without using an IST. asm_exc_nmi() requires an IST to work + * correctly vs. the NMI 'executing' marker. Used for 32-bit kernels as well + * to avoid more ifdeffery. 
*/ -DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist); -#else -#define asm_exc_nmi_noist asm_exc_nmi +DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_kvm_vmx); #endif DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index cec0bfa3bc04..e37faba95bb5 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -527,14 +527,14 @@ nmi_restart: mds_user_clear_cpu_buffers(); } -#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) -DEFINE_IDTENTRY_RAW(exc_nmi_noist) +#if IS_ENABLED(CONFIG_KVM_INTEL) +DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx) { exc_nmi(regs); } -#endif #if IS_MODULE(CONFIG_KVM_INTEL) -EXPORT_SYMBOL_GPL(asm_exc_nmi_noist); +EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx); +#endif #endif void stop_nmi(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 59a051350f8f..84e808af892a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6866,7 +6866,7 @@ void vmx_do_interrupt_nmi_irqoff(unsigned long entry); static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, unsigned long entry) { - bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist; + bool is_nmi = entry == (unsigned long)asm_exc_nmi_kvm_vmx; kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ); vmx_do_interrupt_nmi_irqoff(entry); @@ -6895,7 +6895,7 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { - const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist; + const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_kvm_vmx; u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ -- cgit v1.2.3 From 4f76e86f7e0dc33af14256d30177bf65de2f9cab Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Dec 2022 06:09:11 +0000 Subject: KVM: VMX: Provide separate subroutines for invoking NMI vs. IRQ handlers Split the asm subroutines for handling NMIs versus IRQs that occur in the guest so that the NMI handler can be called from a noinstr section. As a bonus, the NMI path doesn't need an indirect branch. Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221213060912.654668-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmenter.S | 70 ++++++++++++++++++++++++++-------------------- arch/x86/kvm/vmx/vmx.c | 26 ++++++++--------- 2 files changed, 50 insertions(+), 46 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index cd2f75360bf3..f0519f0f738c 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -31,6 +31,39 @@ #define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE #endif +.macro VMX_DO_EVENT_IRQOFF call_insn call_target + /* + * Unconditionally create a stack frame, getting the correct RSP on the + * stack (for x86-64) would take two instructions anyways, and RBP can + * be used to restore RSP to make objtool happy (see below). + */ + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + +#ifdef CONFIG_X86_64 + /* + * Align RSP to a 16-byte boundary (to emulate CPU behavior) before + * creating the synthetic interrupt stack frame for the IRQ/NMI. + */ + and $-16, %rsp + push $__KERNEL_DS + push %rbp +#endif + pushf + push $__KERNEL_CS + \call_insn \call_target + + /* + * "Restore" RSP from RBP, even though IRET has already unwound RSP to + * the correct value. objtool doesn't know the callee will IRET and, + * without the explicit restore, thinks the stack is getting walloped. 
+ * Using an unwind hint is problematic due to x86-64's dynamic alignment. + */ + mov %_ASM_BP, %_ASM_SP + pop %_ASM_BP + RET +.endm + .section .noinstr.text, "ax" /** @@ -320,35 +353,10 @@ SYM_FUNC_START(vmread_error_trampoline) SYM_FUNC_END(vmread_error_trampoline) #endif -SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) - /* - * Unconditionally create a stack frame, getting the correct RSP on the - * stack (for x86-64) would take two instructions anyways, and RBP can - * be used to restore RSP to make objtool happy (see below). - */ - push %_ASM_BP - mov %_ASM_SP, %_ASM_BP +SYM_FUNC_START(vmx_do_nmi_irqoff) + VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx +SYM_FUNC_END(vmx_do_nmi_irqoff) -#ifdef CONFIG_X86_64 - /* - * Align RSP to a 16-byte boundary (to emulate CPU behavior) before - * creating the synthetic interrupt stack frame for the IRQ/NMI. - */ - and $-16, %rsp - push $__KERNEL_DS - push %rbp -#endif - pushf - push $__KERNEL_CS - CALL_NOSPEC _ASM_ARG1 - - /* - * "Restore" RSP from RBP, even though IRET has already unwound RSP to - * the correct value. objtool doesn't know the callee will IRET and, - * without the explicit restore, thinks the stack is getting walloped. - * Using an unwind hint is problematic due to x86-64's dynamic alignment. - */ - mov %_ASM_BP, %_ASM_SP - pop %_ASM_BP - RET -SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff) +SYM_FUNC_START(vmx_do_interrupt_irqoff) + VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1 +SYM_FUNC_END(vmx_do_interrupt_irqoff) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 84e808af892a..1d9e3fd6584e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6861,17 +6861,8 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); } -void vmx_do_interrupt_nmi_irqoff(unsigned long entry); - -static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, - unsigned long entry) -{ - bool is_nmi = entry == (unsigned long)asm_exc_nmi_kvm_vmx; - - kvm_before_interrupt(vcpu, is_nmi ? 
KVM_HANDLING_NMI : KVM_HANDLING_IRQ); - vmx_do_interrupt_nmi_irqoff(entry); - kvm_after_interrupt(vcpu); -} +void vmx_do_interrupt_irqoff(unsigned long entry); +void vmx_do_nmi_irqoff(void); static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) { @@ -6895,7 +6886,6 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { - const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_kvm_vmx; u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ @@ -6908,8 +6898,11 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) else if (is_machine_check(intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - else if (is_nmi(intr_info)) - handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry); + else if (is_nmi(intr_info)) { + kvm_before_interrupt(&vmx->vcpu, KVM_HANDLING_NMI); + vmx_do_nmi_irqoff(); + kvm_after_interrupt(&vmx->vcpu); + } } static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) @@ -6922,7 +6915,10 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) "unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; - handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); + kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); + vmx_do_interrupt_irqoff(gate_offset(desc)); + kvm_after_interrupt(vcpu); + vcpu->arch.at_instruction_boundary = true; } -- cgit v1.2.3 From 11df586d774f4aab1835144fd2a8dc3cb2add8d4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Dec 2022 06:09:12 +0000 Subject: KVM: VMX: Handle NMI VM-Exits in noinstr region Move VMX's handling of NMI VM-Exits into vmx_vcpu_enter_exit() so that the NMI is handled prior to leaving the safety of noinstr. Handling the NMI after leaving noinstr exposes the kernel to potential ordering problems as an instrumentation-induced fault, e.g. #DB, #BP, #PF, etc. will unblock NMIs when IRETing back to the faulting instruction. 
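In heavily simplified form, the resulting entry/exit flow looks like the sketch below (illustrative only; run_guest() and exit_was_nmi() are hypothetical placeholders for the real VM-Enter assembly and exit-reason check, not kernel symbols):

	static noinstr void vcpu_enter_exit_sketch(struct kvm_vcpu *vcpu)
	{
		guest_state_enter_irqoff();

		run_guest(vcpu);		/* VM-Enter ... VM-Exit */

		/*
		 * Handle an NMI VM-Exit while still in the noinstr region:
		 * nothing instrumented has run yet, so no #DB/#BP/#PF can
		 * have IRET'd and unblocked NMIs behind KVM's back.
		 */
		if (exit_was_nmi(vcpu)) {
			kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
			vmx_do_nmi_irqoff();
			kvm_after_interrupt(vcpu);
		}

		guest_state_exit_irqoff();	/* instrumentation allowed again */
	}
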
Reported-by: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221213060912.654668-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmcs.h | 4 ++-- arch/x86/kvm/vmx/vmenter.S | 8 ++++---- arch/x86/kvm/vmx/vmx.c | 34 +++++++++++++++++++++------------- arch/x86/kvm/x86.h | 6 +++--- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index ac290a44a693..7c1996b433e2 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -75,7 +75,7 @@ struct loaded_vmcs { struct vmcs_controls_shadow controls_shadow; }; -static inline bool is_intr_type(u32 intr_info, u32 type) +static __always_inline bool is_intr_type(u32 intr_info, u32 type) { const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK; @@ -146,7 +146,7 @@ static inline bool is_icebp(u32 intr_info) return is_intr_type(intr_info, INTR_TYPE_PRIV_SW_EXCEPTION); } -static inline bool is_nmi(u32 intr_info) +static __always_inline bool is_nmi(u32 intr_info) { return is_intr_type(intr_info, INTR_TYPE_NMI_INTR); } diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index f0519f0f738c..f550540ed54e 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -299,6 +299,10 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) SYM_FUNC_END(__vmx_vcpu_run) +SYM_FUNC_START(vmx_do_nmi_irqoff) + VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx +SYM_FUNC_END(vmx_do_nmi_irqoff) + .section .text, "ax" @@ -353,10 +357,6 @@ SYM_FUNC_START(vmread_error_trampoline) SYM_FUNC_END(vmread_error_trampoline) #endif -SYM_FUNC_START(vmx_do_nmi_irqoff) - VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx -SYM_FUNC_END(vmx_do_nmi_irqoff) - SYM_FUNC_START(vmx_do_interrupt_irqoff) VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1 SYM_FUNC_END(vmx_do_interrupt_irqoff) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1d9e3fd6584e..664994e3e909 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5170,8 +5170,13 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) vect_info = vmx->idt_vectoring_info; intr_info = vmx_get_intr_info(vcpu); + /* + * Machine checks are handled by handle_exception_irqoff(), or by + * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by + * vmx_vcpu_enter_exit(). + */ if (is_machine_check(intr_info) || is_nmi(intr_info)) - return 1; /* handled by handle_exception_nmi_irqoff() */ + return 1; /* * Queue the exception here instead of in handle_nm_fault_irqoff(). 
@@ -6884,7 +6889,7 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } -static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) +static void handle_exception_irqoff(struct vcpu_vmx *vmx) { u32 intr_info = vmx_get_intr_info(&vmx->vcpu); @@ -6897,12 +6902,6 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) /* Handle machine checks before interrupts are enabled */ else if (is_machine_check(intr_info)) kvm_machine_check(); - /* We need to handle NMIs before interrupts are enabled */ - else if (is_nmi(intr_info)) { - kvm_before_interrupt(&vmx->vcpu, KVM_HANDLING_NMI); - vmx_do_nmi_irqoff(); - kvm_after_interrupt(&vmx->vcpu); - } } static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) @@ -6932,7 +6931,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) - handle_exception_nmi_irqoff(vmx); + handle_exception_irqoff(vmx); } /* @@ -7194,6 +7193,18 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vmx_enable_fb_clear(vmx); + if (unlikely(vmx->fail)) + vmx->exit_reason.full = 0xdead; + else + vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); + + if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && + is_nmi(vmx_get_intr_info(vcpu))) { + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + vmx_do_nmi_irqoff(); + kvm_after_interrupt(vcpu); + } + guest_state_exit_irqoff(); } @@ -7335,12 +7346,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->idt_vectoring_info = 0; - if (unlikely(vmx->fail)) { - vmx->exit_reason.full = 0xdead; + if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; - } - vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 9de72586f406..44d1827f0a30 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -382,13 +382,13 @@ enum kvm_intr_type { KVM_HANDLING_NMI, }; -static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu, - enum kvm_intr_type intr) +static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu, + enum kvm_intr_type intr) { WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr); } -static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) +static __always_inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) { WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0); } -- cgit v1.2.3 From 41acdd41973548aec573381e1166b5a388708d5b Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Wed, 9 Nov 2022 15:54:12 +0800 Subject: KVM: VMX: Do not trap VMFUNC instructions for L1 guests. Explicitly disable VMFUNC in vmcs01 to document that KVM doesn't support any VM-Functions for L1. 
WARN in the dedicated VMFUNC handler if an exit occurs while L1 is active, but keep the existing handlers as fallbacks to avoid killing the VM, as an unexpected VMFUNC VM-Exit isn't fatal. Signed-off-by: Yu Zhang Link: https://lore.kernel.org/r/20221109075413.1405803-2-yu.c.zhang@linux.intel.com [sean: don't kill the VM on an unexpected VMFUNC from L1, reword changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 7 +++---- arch/x86/kvm/vmx/vmx.c | 6 ++++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 557b9c468734..3c226de4b562 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5864,11 +5864,10 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) u32 function = kvm_rax_read(vcpu); /* - * VMFUNC is only supported for nested guests, but we always enable the - * secondary control for simplicity; for non-nested mode, fake that we - * didn't by injecting #UD. + * VMFUNC should never execute cleanly while L1 is active; KVM supports + * VMFUNC for nested VMs, but not for L1. */ - if (!is_guest_mode(vcpu)) { + if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 664994e3e909..8a9911ae1240 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4590,6 +4590,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + /* + * KVM doesn't support VMFUNC for L1, but the control is set in KVM's + * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2. + */ + exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; + /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, * in vmx_set_cr4. */ exec_control &= ~SECONDARY_EXEC_DESC; -- cgit v1.2.3 From 496c917b0989a7a20f9804de14ab2b3cbd6747a1 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Wed, 9 Nov 2022 15:54:13 +0800 Subject: KVM: nVMX: Simplify the setting of SECONDARY_EXEC_ENABLE_VMFUNC for nested. The base settings for the nested proc-based VM-Execution control MSR come from the non-nested ones. For the SECONDARY_EXEC_ENABLE_VMFUNC flag, KVM currently a) first masks it off from vmcs_conf->cpu_based_2nd_exec_ctrl; b) then checks it against the same source; and c) sets it again if the host supports it. Simplify this by not masking off SECONDARY_EXEC_ENABLE_VMFUNC in the first place. No functional change. 
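In simplified pseudocode, the change amounts to the following (illustrative only; host_ctls and KEEP_MASK are stand-ins for vmcs_conf->cpu_based_2nd_exec_ctrl and the allowed-1 bit list, not KVM symbols):

	/* Before: VMFUNC masked off up front, then OR'd back in. */
	msrs->secondary_ctls_high = host_ctls & KEEP_MASK;
	if (cpu_has_vmx_vmfunc())
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_VMFUNC;

	/* After: VMFUNC is part of the keep-list, so the single mask
	 * already reflects host support. */
	msrs->secondary_ctls_high = host_ctls &
				    (KEEP_MASK | SECONDARY_EXEC_ENABLE_VMFUNC);
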
Signed-off-by: Yu Zhang Link: https://lore.kernel.org/r/20221109075413.1405803-3-yu.c.zhang@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 3c226de4b562..7c4f5ca405c7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6880,6 +6880,7 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_RDSEED_EXITING | SECONDARY_EXEC_XSAVES | SECONDARY_EXEC_TSC_SCALING | @@ -6912,18 +6913,13 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) SECONDARY_EXEC_ENABLE_PML; msrs->ept_caps |= VMX_EPT_AD_BIT; } - } - if (cpu_has_vmx_vmfunc()) { - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_VMFUNC; /* - * Advertise EPTP switching unconditionally - * since we emulate it + * Advertise EPTP switching irrespective of hardware support, + * KVM emulates it in software so long as VMFUNC is supported. */ - if (enable_ept) - msrs->vmfunc_controls = - VMX_VMFUNC_EPTP_SWITCHING; + if (cpu_has_vmx_vmfunc()) + msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; } /* -- cgit v1.2.3 From 93827a0a36396f2fd6368a54a020f420c8916e9b Mon Sep 17 00:00:00 2001 From: Alexandru Matei Date: Tue, 24 Jan 2023 00:12:08 +0200 Subject: KVM: VMX: Fix crash due to uninitialized current_vmcs KVM enables 'Enlightened VMCS' and 'Enlightened MSR Bitmap' when running as a nested hypervisor on top of Hyper-V. When the MSR bitmap is updated, evmcs_touch_msr_bitmap() uses the current_vmcs per-CPU variable to mark that the MSR bitmap was changed. vmx_vcpu_create() modifies the MSR bitmap via vmx_disable_intercept_for_msr() -> vmx_msr_bitmap_l01_changed(), which in the end calls this function. The function checks whether current_vmcs is NULL, but the check is insufficient because current_vmcs may not have been initialized yet. Because of this, the code might incorrectly write to the structure pointed at by a stale current_vmcs value left behind by another task. Preemption is not disabled, so the current task can be preempted and moved to another CPU while current_vmcs is accessed multiple times from evmcs_touch_msr_bitmap(), which leads to a crash. Callers only manipulate the MSR bitmap of vmcs01, so the fix is to use vmx->vmcs01.vmcs instead of current_vmcs. BUG: kernel NULL pointer dereference, address: 0000000000000338 PGD 4e1775067 P4D 0 Oops: 0002 [#1] PREEMPT SMP NOPTI ... RIP: 0010:vmx_msr_bitmap_l01_changed+0x39/0x50 [kvm_intel] ... 
Call Trace: vmx_disable_intercept_for_msr+0x36/0x260 [kvm_intel] vmx_vcpu_create+0xe6/0x540 [kvm_intel] kvm_arch_vcpu_create+0x1d1/0x2e0 [kvm] kvm_vm_ioctl_create_vcpu+0x178/0x430 [kvm] kvm_vm_ioctl+0x53f/0x790 [kvm] __x64_sys_ioctl+0x8a/0xc0 do_syscall_64+0x5c/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: ceef7d10dfb6 ("KVM: x86: VMX: hyper-v: Enlightened MSR-Bitmap support") Cc: stable@vger.kernel.org Suggested-by: Sean Christopherson Signed-off-by: Alexandru Matei Link: https://lore.kernel.org/r/20230123221208.4964-1-alexandru.matei@uipath.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/hyperv.h | 11 ----------- arch/x86/kvm/vmx/vmx.c | 9 +++++++-- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index caf658726169..78d17667e7ec 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -250,16 +250,6 @@ static __always_inline u16 evmcs_read16(unsigned long field) return *(u16 *)((char *)current_evmcs + offset); } -static inline void evmcs_touch_msr_bitmap(void) -{ - if (unlikely(!current_evmcs)) - return; - - if (current_evmcs->hv_enlightenments_control.msr_bitmap) - current_evmcs->hv_clean_fields &= - ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; -} - static inline void evmcs_load(u64 phys_addr) { struct hv_vp_assist_page *vp_ap = @@ -280,7 +270,6 @@ static __always_inline u64 evmcs_read64(unsigned long field) { return 0; } static __always_inline u32 evmcs_read32(unsigned long field) { return 0; } static __always_inline u16 evmcs_read16(unsigned long field) { return 0; } static inline void evmcs_load(u64 phys_addr) {} -static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ #define EVMPTR_INVALID (-1ULL) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8a9911ae1240..33614ee2cd67 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3936,8 +3936,13 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR * bitmap has changed. */ - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); + if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) { + struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; + + if (evmcs->hv_enlightenments_control.msr_bitmap) + evmcs->hv_clean_fields &= + ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + } vmx->nested.force_msr_bitmap_recalc = true; } -- cgit v1.2.3