From 9dbb029b9c44a84968317d462eaa5a748317d8fb Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 15 Aug 2023 00:08:35 +0200 Subject: KVM: x86: Remove redundant vcpu->arch.cr0 assignments Drop the vcpu->arch.cr0 assignment after static_call(kvm_x86_set_cr0). CR0 was already set by {vmx,svm}_set_cr0(). Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20230814222358.707877-2-mhal@rbox.co Signed-off-by: Sean Christopherson --- arch/x86/kvm/smm.c | 1 - arch/x86/kvm/x86.c | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index b42111a24cc2..dc3d95fdca7d 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -324,7 +324,6 @@ void enter_smm(struct kvm_vcpu *vcpu) cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); static_call(kvm_x86_set_cr0)(vcpu, cr0); - vcpu->arch.cr0 = cr0; static_call(kvm_x86_set_cr4)(vcpu, 0); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f18b06bbda6..d960d4094a0b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11521,7 +11521,6 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0); - vcpu->arch.cr0 = sregs->cr0; *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4); -- cgit v1.2.3 From 4346db6e6e7a63c25a2f6dd93f4613d6a17dbd4e Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 15 Aug 2023 00:08:36 +0200 Subject: KVM: x86: Force TLB flush on userspace changes to special registers Userspace can directly modify the content of vCPU's CR0, CR3, and CR4 via KVM_SYNC_X86_SREGS and KVM_SET_SREGS{,2}. Make sure that KVM flushes guest TLB entries and paging-structure caches if a (partial) guest TLB flush is architecturally required based on the CRn changes. To keep things simple, flush whenever KVM resets the MMU context, i.e. if any bits in CR0, CR3, CR4, or EFER are modified. This is extreme overkill, but stuffing state from userspace is not such a hot path that preserving guest TLB state is a priority. 
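As a userspace-side illustration of the path this change hardens, below is a minimal, hedged sketch of updating a control register via KVM_SET_SREGS; the vCPU fd, the chosen CR4 bit, and the error handling are assumptions for the example, not part of the patch.

/* Illustrative only: a KVM_SET_SREGS update that, with this change, is
 * guaranteed to be followed by a guest TLB flush whenever CR0/CR3/CR4/EFER
 * are modified.  Assumes an already-created vCPU fd.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

#define X86_CR4_SMEP_BIT (1UL << 20)	/* CR4.SMEP, used here purely as an example */

static int toggle_guest_smep(int vcpu_fd)
{
	struct kvm_sregs sregs;

	if (ioctl(vcpu_fd, KVM_GET_SREGS, &sregs) < 0) {
		perror("KVM_GET_SREGS");
		return -1;
	}

	sregs.cr4 ^= X86_CR4_SMEP_BIT;	/* any CR0/CR3/CR4/EFER change qualifies */

	/*
	 * KVM now resets the MMU context *and* requests a guest TLB flush for
	 * this vCPU, so no stale guest translations survive the update.
	 */
	if (ioctl(vcpu_fd, KVM_SET_SREGS, &sregs) < 0) {
		perror("KVM_SET_SREGS");
		return -1;
	}
	return 0;
}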
Suggested-by: Paolo Bonzini Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20230814222358.707877-3-mhal@rbox.co [sean: call out that the flushing on MMU context resets is for simplicity] Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d960d4094a0b..ec728aef9acc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11564,8 +11564,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) if (ret) return ret; - if (mmu_reset_needed) + if (mmu_reset_needed) { kvm_mmu_reset_context(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + } max_bits = KVM_NR_INTERRUPTS; pending_vec = find_first_bit( @@ -11606,8 +11608,10 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) mmu_reset_needed = 1; vcpu->arch.pdptrs_from_userspace = true; } - if (mmu_reset_needed) + if (mmu_reset_needed) { kvm_mmu_reset_context(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + } return 0; } -- cgit v1.2.3 From f10a570b093e60c6bd3f210ae909f014f421352a Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Thu, 24 Aug 2023 16:52:46 -0500 Subject: KVM: x86: Add CONFIG_KVM_MAX_NR_VCPUS to allow up to 4096 vCPUs Add a Kconfig entry to set the maximum number of vCPUs per KVM guest and set the default value to 4096 when MAXSMP is enabled, as there are use cases that want to create more than the currently allowed 1024 vCPUs and are more than happy to eat the memory overhead. The Hyper-V TLFS doesn't allow more than 64 sparse banks, i.e. allows a maximum of 4096 virtual CPUs. Cap KVM's maximum number of virtual CPUs to 4096 to avoid exceeding Hyper-V's limit as KVM support for Hyper-V is unconditional, and alternatives like dynamically disabling Hyper-V enlightenments that rely on sparse banks would require non-trivial code changes. Suggested-by: Sean Christopherson Signed-off-by: Kyle Meyer Link: https://lore.kernel.org/r/20230824215244.3897419-1-kyle.meyer@hpe.com [sean: massage changelog with --verbose, document #ifdef mess] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 8 ++++++++ arch/x86/kvm/Kconfig | 11 +++++++++++ 2 files changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 17715cb8731d..c356aa27fbee 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -39,7 +39,15 @@ #define __KVM_HAVE_ARCH_VCPU_DEBUGFS +/* + * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n, provide a dummy max if + * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS). + */ +#ifdef CONFIG_KVM_MAX_NR_VCPUS +#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS +#else #define KVM_MAX_VCPUS 1024 +#endif /* * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ed90f148140d..950c12868d30 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -154,4 +154,15 @@ config KVM_PROVE_MMU config KVM_EXTERNAL_WRITE_TRACKING bool +config KVM_MAX_NR_VCPUS + int "Maximum number of vCPUs per KVM guest" + depends on KVM + range 1024 4096 + default 4096 if MAXSMP + default 1024 + help + Set the maximum number of vCPUs per KVM guest. Larger values will increase + the memory footprint of each KVM guest, regardless of how many vCPUs are + created for a given VM. 
+ endif # VIRTUALIZATION -- cgit v1.2.3 From bc3d7c5570a03ab45bde4bae83697c80900fb714 Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Thu, 7 Sep 2023 09:24:49 -0700 Subject: KVM: SVM: Update SEV-ES shutdown intercepts with more metadata Currently if an SEV-ES VM shuts down userspace sees KVM_RUN struct with only errno=EINVAL. This is a very limited amount of information to debug the situation. Instead return KVM_EXIT_SHUTDOWN to alert userspace the VM is shutting down and is not usable any further. Signed-off-by: Peter Gonda Suggested-by: Sean Christopherson Suggested-by: Tom Lendacky Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Tom Lendacky Cc: Joerg Roedel Cc: Borislav Petkov Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20230907162449.1739785-1-pgonda@google.com [sean: tweak changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9507df93f410..f36a6d819280 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2203,12 +2203,6 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; struct vcpu_svm *svm = to_svm(vcpu); - /* - * The VM save area has already been encrypted so it - * cannot be reinitialized - just terminate. - */ - if (sev_es_guest(vcpu->kvm)) - return -EINVAL; /* * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put @@ -2217,9 +2211,14 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) * userspace. At a platform view, INIT is acceptable behavior as * there exist bare metal platforms that automatically INIT the CPU * in response to shutdown. + * + * The VM save area for SEV-ES guests has already been encrypted so it + * cannot be reinitialized, i.e. synthesizing INIT is futile. */ - clear_page(svm->vmcb); - kvm_vcpu_reset(vcpu, true); + if (!sev_es_guest(vcpu->kvm)) { + clear_page(svm->vmcb); + kvm_vcpu_reset(vcpu, true); + } kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; -- cgit v1.2.3 From 9cfec6d097c607e36199cf0cfbb8cf5acbd8e9b2 Mon Sep 17 00:00:00 2001 From: Haitao Shan Date: Tue, 12 Sep 2023 16:55:45 -0700 Subject: KVM: x86: Fix lapic timer interrupt lost after loading a snapshot. When running android emulator (which is based on QEMU 2.12) on certain Intel hosts with kernel version 6.3-rc1 or above, guest will freeze after loading a snapshot. This is almost 100% reproducible. By default, the android emulator will use snapshot to speed up the next launching of the same android guest. So this breaks the android emulator badly. I tested QEMU 8.0.4 from Debian 12 with an Ubuntu 22.04 guest by running command "loadvm" after "savevm". The same issue is observed. At the same time, none of our AMD platforms is impacted. More experiments show that loading the KVM module with "enable_apicv=false" can workaround it. The issue started to show up after commit 8e6ed96cdd50 ("KVM: x86: fire timer when it is migrated and expired, and in oneshot mode"). However, as is pointed out by Sean Christopherson, it is introduced by commit 967235d32032 ("KVM: vmx: clear pending interrupts on KVM_SET_LAPIC"). commit 8e6ed96cdd50 ("KVM: x86: fire timer when it is migrated and expired, and in oneshot mode") just makes it easier to hit the issue. 
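Relating to the SEV-ES shutdown change above, a hedged sketch of how a VMM run loop might consume the new exit follows; the fd, the logging, and the teardown policy are assumptions for illustration.

/* Illustrative run-loop fragment: with the change above, an SEV-ES guest's
 * shutdown surfaces as KVM_EXIT_SHUTDOWN instead of KVM_RUN failing with
 * -EINVAL, so userspace can tear the VM down cleanly.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

static int run_vcpu(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
			perror("KVM_RUN");	/* pre-change, SEV-ES shutdown ended up here */
			return -1;
		}

		switch (run->exit_reason) {
		case KVM_EXIT_SHUTDOWN:
			fprintf(stderr, "guest shut down, stopping vCPU\n");
			return 0;	/* the VM is not usable any further */
		case KVM_EXIT_IO:
		case KVM_EXIT_MMIO:
			/* device emulation elided */
			break;
		default:
			fprintf(stderr, "unhandled exit %u\n", run->exit_reason);
			return -1;
		}
	}
}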
Having both commits, the oneshot lapic timer gets fired immediately inside the KVM_SET_LAPIC call when loading the snapshot. On Intel platforms with APIC virtualization and posted interrupt processing, this eventually leads to setting the corresponding PIR bit. However, the whole PIR bits get cleared later in the same KVM_SET_LAPIC call by apicv_post_state_restore. This leads to timer interrupt lost. The fix is to move vmx_apicv_post_state_restore to the beginning of the KVM_SET_LAPIC call and rename to vmx_apicv_pre_state_restore. What vmx_apicv_post_state_restore does is actually clearing any former apicv state and this behavior is more suitable to carry out in the beginning. Fixes: 967235d32032 ("KVM: vmx: clear pending interrupts on KVM_SET_LAPIC") Cc: stable@vger.kernel.org Suggested-by: Sean Christopherson Signed-off-by: Haitao Shan Link: https://lore.kernel.org/r/20230913000215.478387-1-hshan@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/lapic.c | 4 ++++ arch/x86/kvm/vmx/vmx.c | 4 ++-- 4 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index e3054e3e46d5..9b419f0de713 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -108,6 +108,7 @@ KVM_X86_OP_OPTIONAL(vcpu_blocking) KVM_X86_OP_OPTIONAL(vcpu_unblocking) KVM_X86_OP_OPTIONAL(pi_update_irte) KVM_X86_OP_OPTIONAL(pi_start_assignment) +KVM_X86_OP_OPTIONAL(apicv_pre_state_restore) KVM_X86_OP_OPTIONAL(apicv_post_state_restore) KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) KVM_X86_OP_OPTIONAL(set_hv_timer) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 17715cb8731d..f77568c6a326 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1709,6 +1709,7 @@ struct kvm_x86_ops { int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*pi_start_assignment)(struct kvm *kvm); + void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index dcd60b39e794..c6a5a5945021 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2670,6 +2670,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) u64 msr_val; int i; + static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu); + if (!init_event) { msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_reset_bsp(vcpu)) @@ -2977,6 +2979,8 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) struct kvm_lapic *apic = vcpu->arch.apic; int r; + static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu); + kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 72e3943f3693..9bba5352582c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6912,7 +6912,7 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } -static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) +static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) { struct vcpu_vmx 
*vmx = to_vmx(vcpu); @@ -8286,7 +8286,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, .load_eoi_exitmap = vmx_load_eoi_exitmap, - .apicv_post_state_restore = vmx_apicv_post_state_restore, + .apicv_pre_state_restore = vmx_apicv_pre_state_restore, .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, -- cgit v1.2.3 From 629d3698f6958ee6f8131ea324af794f973b12ac Mon Sep 17 00:00:00 2001 From: Tao Su Date: Thu, 14 Sep 2023 13:55:04 +0800 Subject: KVM: x86: Clear bit12 of ICR after APIC-write VM-exit When IPI virtualization is enabled, a WARN is triggered if bit12 of ICR MSR is set after APIC-write VM-exit. The reason is kvm_apic_send_ipi() thinks the APIC_ICR_BUSY bit should be cleared because KVM has no delay, but kvm_apic_write_nodecode() doesn't clear the APIC_ICR_BUSY bit. Under the x2APIC section, regarding ICR, the SDM says: It remains readable only to aid in debugging; however, software should not assume the value returned by reading the ICR is the last written value. I.e. the guest is allowed to set bit 12. However, the SDM also gives KVM free reign to do whatever it wants with the bit, so long as KVM's behavior doesn't confuse userspace or break KVM's ABI. Clear bit 12 so that it reads back as '0'. This approach is safer than "do nothing" and is consistent with the case where IPI virtualization is disabled or not supported, i.e., handle_fastpath_set_x2apic_icr_irqoff() -> kvm_x2apic_icr_write() Opportunistically replace the TODO with a comment calling out that eating the write is likely faster than a conditional branch around the busy bit. Link: https://lore.kernel.org/all/ZPj6iF0Q7iynn62p@google.com/ Fixes: 5413bcba7ed5 ("KVM: x86: Add support for vICR APIC-write VM-Exits in x2APIC mode") Cc: stable@vger.kernel.org Signed-off-by: Tao Su Tested-by: Yi Lai Reviewed-by: Chao Gao Link: https://lore.kernel.org/r/20230914055504.151365-1-tao1.su@linux.intel.com [sean: tweak changelog, replace TODO with comment, drop local "val"] Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c6a5a5945021..953700e68cf6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2444,22 +2444,22 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { struct kvm_lapic *apic = vcpu->arch.apic; - u64 val; /* - * ICR is a single 64-bit register when x2APIC is enabled. For legacy - * xAPIC, ICR writes need to go down the common (slightly slower) path - * to get the upper half from ICR2. + * ICR is a single 64-bit register when x2APIC is enabled, all others + * registers hold 32-bit values. For legacy xAPIC, ICR writes need to + * go down the common path to get the upper half from ICR2. + * + * Note, using the write helpers may incur an unnecessary write to the + * virtual APIC state, but KVM needs to conditionally modify the value + * in certain cases, e.g. to clear the ICR busy bit. The cost of extra + * conditional branches is likely a wash relative to the cost of the + * maybe-unecessary write, and both are in the noise anyways. 
*/ - if (apic_x2apic_mode(apic) && offset == APIC_ICR) { - val = kvm_lapic_get_reg64(apic, APIC_ICR); - kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32)); - trace_kvm_apic_write(APIC_ICR, val); - } else { - /* TODO: optimize to just emulate side effect w/o one more write */ - val = kvm_lapic_get_reg(apic, offset); - kvm_lapic_reg_write(apic, offset, (u32)val); - } + if (apic_x2apic_mode(apic) && offset == APIC_ICR) + kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR)); + else + kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); } EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); -- cgit v1.2.3 From ee11ab6bb04e1093d3cc5f5bea9779799ce1d2c7 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Tue, 5 Sep 2023 09:07:09 +0800 Subject: KVM: X86: Reduce size of kvm_vcpu_arch structure when CONFIG_KVM_XEN=n When CONFIG_KVM_XEN=n, the size of kvm_vcpu_arch can be reduced from 5100+ to 4400+ by adding macro control. Signed-off-by: Peng Hao Link: https://lore.kernel.org/all/CAPm50aKwbZGeXPK5uig18Br8CF1hOS71CE2j_dLX+ub7oJdpGg@mail.gmail.com [sean: fix whitespace damage] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 5 ++++- arch/x86/kvm/cpuid.c | 2 ++ arch/x86/kvm/x86.c | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 17715cb8731d..d2a15abbcefd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -680,6 +680,7 @@ struct kvm_hypervisor_cpuid { u32 limit; }; +#ifdef CONFIG_KVM_XEN /* Xen HVM per vcpu emulation context */ struct kvm_vcpu_xen { u64 hypercall_rip; @@ -702,6 +703,7 @@ struct kvm_vcpu_xen { struct timer_list poll_timer; struct kvm_hypervisor_cpuid cpuid; }; +#endif struct kvm_queued_exception { bool pending; @@ -930,8 +932,9 @@ struct kvm_vcpu_arch { bool hyperv_enabled; struct kvm_vcpu_hv *hyperv; +#ifdef CONFIG_KVM_XEN struct kvm_vcpu_xen xen; - +#endif cpumask_var_t wbinvd_dirty_mask; unsigned long last_retry_eip; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0544e30b4946..48f5308c4556 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -456,7 +456,9 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, vcpu->arch.cpuid_nent = nent; vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); +#ifdef CONFIG_KVM_XEN vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE); +#endif kvm_vcpu_after_set_cpuid(vcpu); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f18b06bbda6..a3a02d62aa6a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3232,11 +3232,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (vcpu->pv_time.active) kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0); +#ifdef CONFIG_KVM_XEN if (vcpu->xen.vcpu_info_cache.active) kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache, offsetof(struct compat_vcpu_info, time)); if (vcpu->xen.vcpu_time_info_cache.active) kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0); +#endif kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } -- cgit v1.2.3 From 77c9b9dea4fb3e51e0d850db7f21cb1156d987bd Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 30 Sep 2023 14:58:35 +0100 Subject: KVM: x86/xen: Use fast path for Xen timer delivery Most of the time there's no need to kick the vCPU and deliver the timer event through kvm_xen_inject_timer_irqs(). 
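Circling back to the ICR change above, a tiny runnable model of the user-visible effect is below; the constant and the main() harness are illustrative and are not kernel code.

/* Model of the ICR handling described above: the value the guest wrote is
 * handed to the IPI path with bit 12 (the busy/delivery-status bit)
 * stripped, so a subsequent read of the vICR reports bit 12 as '0'.
 */
#include <stdint.h>
#include <stdio.h>

#define ICR_BUSY	(UINT64_C(1) << 12)

static uint64_t stored_icr_after_write(uint64_t guest_written)
{
	return guest_written & ~ICR_BUSY;	/* what a later ICR read returns */
}

int main(void)
{
	uint64_t written = (UINT64_C(0x01) << 32) | ICR_BUSY | 0xfe;	/* dest 1, vector 0xfe */

	printf("guest wrote %#llx, vICR now holds %#llx\n",
	       (unsigned long long)written,
	       (unsigned long long)stored_icr_after_write(written));
	return 0;
}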
Use kvm_xen_set_evtchn_fast() directly from the timer callback, and only fall back to the slow path if delivering the timer would block, i.e. if kvm_xen_set_evtchn_fast() returns -EWOULDBLOCK. If delivery fails for any other reason, do nothing and just let it fail silently, as that is what the slow path would end up doing anyways. This gives a significant improvement in timer latency testing (using nanosleep() for various periods and then measuring the actual time elapsed). However, there was a reason[1] the fast path was dropped when this support was first added. The current code holds vcpu->mutex for all operations on the kvm->arch.timer_expires field, and the fast path introduces a potential race condition. Avoid that race by ensuring the hrtimer is (temporarily) cancelled before making changes in kvm_xen_start_timer(), and also when reading the values out for KVM_XEN_VCPU_ATTR_TYPE_TIMER. [1] https://lore.kernel.org/kvm/846caa99-2e42-4443-1070-84e49d2f11d2@redhat.com Signed-off-by: David Woodhouse Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/f21ee3bd852761e7808240d4ecaec3013c649dc7.camel@infradead.org [sean: massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 40edf4d1974c..75586da134b3 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -134,9 +134,23 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) { struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu, arch.xen.timer); + struct kvm_xen_evtchn e; + int rc; + if (atomic_read(&vcpu->arch.xen.timer_pending)) return HRTIMER_NORESTART; + e.vcpu_id = vcpu->vcpu_id; + e.vcpu_idx = vcpu->vcpu_idx; + e.port = vcpu->arch.xen.timer_virq; + e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm); + if (rc != -EWOULDBLOCK) { + vcpu->arch.xen.timer_expires = 0; + return HRTIMER_NORESTART; + } + atomic_inc(&vcpu->arch.xen.timer_pending); kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); @@ -146,6 +160,14 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns) { + /* + * Avoid races with the old timer firing. Checking timer_expires + * to avoid calling hrtimer_cancel() will only have false positives + * so is fine. + */ + if (vcpu->arch.xen.timer_expires) + hrtimer_cancel(&vcpu->arch.xen.timer); + atomic_set(&vcpu->arch.xen.timer_pending, 0); vcpu->arch.xen.timer_expires = guest_abs; @@ -1019,9 +1041,36 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; case KVM_XEN_VCPU_ATTR_TYPE_TIMER: + /* + * Ensure a consistent snapshot of state is captured, with a + * timer either being pending, or the event channel delivered + * to the corresponding bit in the shared_info. Not still + * lurking in the timer_pending flag for deferred delivery. + * Purely as an optimisation, if the timer_expires field is + * zero, that means the timer isn't active (or even in the + * timer_pending flag) and there is no need to cancel it. 
+ */ + if (vcpu->arch.xen.timer_expires) { + hrtimer_cancel(&vcpu->arch.xen.timer); + kvm_xen_inject_timer_irqs(vcpu); + } + data->u.timer.port = vcpu->arch.xen.timer_virq; data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; data->u.timer.expires_ns = vcpu->arch.xen.timer_expires; + + /* + * The hrtimer may trigger and raise the IRQ immediately, + * while the returned state causes it to be set up and + * raised again on the destination system after migration. + * That's fine, as the guest won't even have had a chance + * to run and handle the interrupt. Asserting an already + * pending event channel is idempotent. + */ + if (vcpu->arch.xen.timer_expires) + hrtimer_start_expires(&vcpu->arch.xen.timer, + HRTIMER_MODE_ABS_HARD); + r = 0; break; -- cgit v1.2.3 From aeb904f6b9f1de588cf3130dc8a2c458b236704e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Aug 2023 18:36:20 -0700 Subject: KVM: x86: Refactor can_emulate_instruction() return to be more expressive Refactor and rename can_emulate_instruction() to allow vendor code to return more than true/false, e.g. to explicitly differentiate between "retry", "fault", and "unhandleable". For now, just do the plumbing, a future patch will expand SVM's implementation to signal outright failure if KVM attempts EMULTYPE_SKIP on an SEV guest. No functional change intended (or rather, none that are visible to the guest or userspace). Link: https://lore.kernel.org/r/20230825013621.2845700-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-ops.h | 2 +- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/svm/svm.c | 31 +++++++++++++++++-------------- arch/x86/kvm/vmx/vmx.c | 12 ++++++------ arch/x86/kvm/x86.c | 15 +++++++++------ 5 files changed, 35 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index e3054e3e46d5..ee2404a559af 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -126,7 +126,7 @@ KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) KVM_X86_OP(get_msr_feature) -KVM_X86_OP(can_emulate_instruction) +KVM_X86_OP(check_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) KVM_X86_OP_OPTIONAL(migrate_timers) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 17715cb8731d..89583b410527 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1734,8 +1734,8 @@ struct kvm_x86_ops { int (*get_msr_feature)(struct kvm_msr_entry *entry); - bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len); + int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f36a6d819280..c4e700f945f8 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -364,8 +364,8 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; } -static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len); +static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); static int 
__svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, bool commit_side_effects) @@ -391,7 +391,7 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, * right thing and treats "can't emulate" as outright failure * for EMULTYPE_SKIP. */ - if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0)) + if (svm_check_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0) != X86EMUL_CONTINUE) return 0; if (unlikely(!commit_side_effects)) @@ -4727,15 +4727,15 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) } #endif -static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len) +static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { bool smep, smap, is_user; u64 error_code; /* Emulation is always possible when KVM has access to all guest state. */ if (!sev_guest(vcpu->kvm)) - return true; + return X86EMUL_CONTINUE; /* #UD and #GP should never be intercepted for SEV guests. */ WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD | @@ -4747,14 +4747,14 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * to guest register state. */ if (sev_es_guest(vcpu->kvm)) - return false; + return X86EMUL_RETRY_INSTR; /* * Emulation is possible if the instruction is already decoded, e.g. * when completing I/O after returning from userspace. */ if (emul_type & EMULTYPE_NO_DECODE) - return true; + return X86EMUL_CONTINUE; /* * Emulation is possible for SEV guests if and only if a prefilled @@ -4780,9 +4780,11 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * success (and in practice it will work the vast majority of the time). */ if (unlikely(!insn)) { - if (!(emul_type & EMULTYPE_SKIP)) - kvm_queue_exception(vcpu, UD_VECTOR); - return false; + if (emul_type & EMULTYPE_SKIP) + return X86EMUL_RETRY_INSTR; + + kvm_queue_exception(vcpu, UD_VECTOR); + return X86EMUL_PROPAGATE_FAULT; } /* @@ -4793,7 +4795,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * table used to translate CS:RIP resides in emulated MMIO. */ if (likely(insn_len)) - return true; + return X86EMUL_CONTINUE; /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. @@ -4851,6 +4853,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, kvm_inject_gp(vcpu, 0); else kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + return X86EMUL_PROPAGATE_FAULT; } resume_guest: @@ -4868,7 +4871,7 @@ resume_guest: * doesn't explicitly define "ignored", i.e. doing nothing and letting * the guest spin is technically "ignoring" the access. 
*/ - return false; + return X86EMUL_RETRY_INSTR; } static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) @@ -5028,7 +5031,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, .vm_move_enc_context_from = sev_vm_move_enc_context_from, - .can_emulate_instruction = svm_can_emulate_instruction, + .check_emulate_instruction = svm_check_emulate_instruction, .apic_init_signal_blocked = svm_apic_init_signal_blocked, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 72e3943f3693..4e453ba28320 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1657,8 +1657,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } -static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len) +static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { /* * Emulation of instructions in SGX enclaves is impossible as RIP does @@ -1669,9 +1669,9 @@ static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, */ if (to_vmx(vcpu)->exit_reason.enclave_mode) { kvm_queue_exception(vcpu, UD_VECTOR); - return false; + return X86EMUL_PROPAGATE_FAULT; } - return true; + return X86EMUL_CONTINUE; } static int skip_emulated_instruction(struct kvm_vcpu *vcpu) @@ -5792,7 +5792,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { gpa_t gpa; - if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) + if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) return 1; /* @@ -8341,7 +8341,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .enable_smi_window = vmx_enable_smi_window, #endif - .can_emulate_instruction = vmx_can_emulate_instruction, + .check_emulate_instruction = vmx_check_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, .migrate_timers = vmx_migrate_timers, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f18b06bbda6..104e6b4520a9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7474,11 +7474,11 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); -static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len) +static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { - return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type, - insn, insn_len); + return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type, + insn, insn_len); } int handle_ud(struct kvm_vcpu *vcpu) @@ -7488,8 +7488,10 @@ int handle_ud(struct kvm_vcpu *vcpu) int emul_type = EMULTYPE_TRAP_UD; char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; + int r; - if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0))) + r = kvm_check_emulate_insn(vcpu, emul_type, NULL, 0); + if (r != X86EMUL_CONTINUE) return 1; if (fep_flags && @@ -8871,7 +8873,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; - if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len))) + r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len); + if (r != X86EMUL_CONTINUE) return 1; vcpu->arch.l1tf_flush_l1d = true; -- cgit v1.2.3 From 00682995409696866fe43984c74c8688bdf8f0a5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Aug 2023 18:36:21 -0700 Subject: 
KVM: SVM: Treat all "skip" emulation for SEV guests as outright failures Treat EMULTYPE_SKIP failures on SEV guests as unhandleable emulation instead of simply resuming the guest, and drop the hack-a-fix which effects that behavior for the INT3/INTO injection path. If KVM can't skip an instruction for which KVM has already done partial emulation, resuming the guest is undesirable as doing so may corrupt guest state. Link: https://lore.kernel.org/r/20230825013621.2845700-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 12 +----------- arch/x86/kvm/x86.c | 9 +++++++-- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c4e700f945f8..b7472ad183b9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -364,8 +364,6 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; } -static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len); static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, bool commit_side_effects) @@ -386,14 +384,6 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, } if (!svm->next_rip) { - /* - * FIXME: Drop this when kvm_emulate_instruction() does the - * right thing and treats "can't emulate" as outright failure - * for EMULTYPE_SKIP. - */ - if (svm_check_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0) != X86EMUL_CONTINUE) - return 0; - if (unlikely(!commit_side_effects)) old_rflags = svm->vmcb->save.rflags; @@ -4781,7 +4771,7 @@ static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, */ if (unlikely(!insn)) { if (emul_type & EMULTYPE_SKIP) - return X86EMUL_RETRY_INSTR; + return X86EMUL_UNHANDLEABLE; kvm_queue_exception(vcpu, UD_VECTOR); return X86EMUL_PROPAGATE_FAULT; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 104e6b4520a9..cc7d29e9104b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8874,8 +8874,13 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool writeback = true; r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len); - if (r != X86EMUL_CONTINUE) - return 1; + if (r != X86EMUL_CONTINUE) { + if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT) + return 1; + + WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE); + return handle_emulation_failure(vcpu, emulation_type); + } vcpu->arch.l1tf_flush_l1d = true; -- cgit v1.2.3 From 6f0f23ef76be11bc6ace10df5e807f3542da8772 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 25 Aug 2023 00:01:35 -0700 Subject: KVM: x86: Add IBPB_BRTYPE support Add support for the IBPB_BRTYPE CPUID flag, which indicates that IBPB includes branch type prediction flushing. Note, like SRSO_NO, advertise support for IBPB_BRTYPE even if it's not enumerated by in the raw CPUID, i.e. bypass the cpuid_count() in __kvm_cpu_cap_mask(). Some CPUs that gained support via a uCode patch don't report IBPB_BRTYPE via CPUID (the kernel forces the flag). Opportunistically use kvm_cpu_cap_check_and_set() for SRSO_NO instead of manually querying host support (cpu_feature_enabled() and boot_cpu_has() yield the same end result in this case). 
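Jumping back to the check_emulate_instruction() refactor above, the sketch below is a standalone model of the return-code contract it introduces; the enum and the dispatcher merely mirror the X86EMUL_* names and are not the kernel's definitions.

/*
 * Standalone model (not kernel code) of how x86_emulate_instruction() reacts
 * to each verdict a vendor .check_emulate_instruction() hook can return.
 */
#include <stdio.h>

enum emul_check {
	EMUL_CONTINUE,		/* emulation may proceed */
	EMUL_RETRY_INSTR,	/* resume the guest and let it retry */
	EMUL_PROPAGATE_FAULT,	/* vendor code already queued an exception */
	EMUL_UNHANDLEABLE,	/* report an emulation failure */
};

static int emulate_instruction(enum emul_check vendor_verdict)
{
	switch (vendor_verdict) {
	case EMUL_CONTINUE:
		printf("decode and emulate the instruction\n");
		return 1;
	case EMUL_RETRY_INSTR:
	case EMUL_PROPAGATE_FAULT:
		printf("return to the guest (any fault is already queued)\n");
		return 1;
	case EMUL_UNHANDLEABLE:
	default:
		printf("emulation failure -> exit to userspace\n");
		return 0;
	}
}

int main(void)
{
	return !emulate_instruction(EMUL_UNHANDLEABLE);
}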
Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/79d5f5914fb42c2c62418ffbcd78f138645ded21.1692919072.git.jpoimboe@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0544e30b4946..8f26a929d510 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -764,8 +764,8 @@ void kvm_set_cpu_caps(void) F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ ); - if (cpu_feature_enabled(X86_FEATURE_SRSO_NO)) - kvm_cpu_cap_set(X86_FEATURE_SRSO_NO); + kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE); + kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO); kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX, F(PERFMON_V2) -- cgit v1.2.3 From e47d86083c66525b89c7fc66cdd64d5937725563 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 25 Aug 2023 00:01:36 -0700 Subject: KVM: x86: Add SBPB support Add support for the AMD Selective Branch Predictor Barrier (SBPB) by advertising the CPUID bit and handling PRED_CMD writes accordingly. Note, like SRSO_NO and IBPB_BRTYPE before it, advertise support for SBPB even if it's not enumerated by in the raw CPUID. Some CPUs that gained support via a uCode patch don't report SBPB via CPUID (the kernel forces the flag). Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/a4ab1e7fe50096d50fde33e739ed2da40b41ea6a.1692919072.git.jpoimboe@kernel.org Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 1 + arch/x86/kvm/cpuid.h | 3 ++- arch/x86/kvm/x86.c | 29 ++++++++++++++++++++++++----- 3 files changed, 27 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8f26a929d510..552dc4f3899b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -764,6 +764,7 @@ void kvm_set_cpu_caps(void) F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ ); + kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB); kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE); kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 284fa4704553..0b90532b6e26 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -174,7 +174,8 @@ static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu) static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu) { return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || - guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)); + guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB) || + guest_cpuid_has(vcpu, X86_FEATURE_SBPB)); } static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ec728aef9acc..2b5b325e19f2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3670,17 +3670,36 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.perf_capabilities = data; kvm_pmu_refresh(vcpu); break; - case MSR_IA32_PRED_CMD: - if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu)) - return 1; + case MSR_IA32_PRED_CMD: { + u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB); + + if (!msr_info->host_initiated) { + if ((!guest_has_pred_cmd_msr(vcpu))) + return 1; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) + reserved_bits |= PRED_CMD_IBPB; - if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB)) + if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB)) + 
reserved_bits |= PRED_CMD_SBPB; + } + + if (!boot_cpu_has(X86_FEATURE_IBPB)) + reserved_bits |= PRED_CMD_IBPB; + + if (!boot_cpu_has(X86_FEATURE_SBPB)) + reserved_bits |= PRED_CMD_SBPB; + + if (data & reserved_bits) return 1; + if (!data) break; - wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); + wrmsrl(MSR_IA32_PRED_CMD, data); break; + } case MSR_IA32_FLUSH_CMD: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)) -- cgit v1.2.3 From 409f2e92a27a210fc768c5569851b4a419e6a232 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 4 Oct 2023 17:46:28 +0000 Subject: KVM: x86/xen: ignore the VCPU_SSHOTTMR_future flag Upstream Xen now ignores _VCPU_SSHOTTMR_future[1], since the only guest kernel ever to use it was buggy. By ignoring the flag the guest will always get a callback if it sets a negative timeout which upstream Xen has determined not to cause problems for any guest setting the flag. [1] https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=19c6cbd909 Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20231004174628.2073263-1-paul@xen.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 75586da134b3..c539f18e0b60 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1423,12 +1423,8 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, return true; } + /* A delta <= 0 results in an immediate callback, which is what we want */ delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm); - if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) { - *r = -ETIME; - return true; - } - kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta); *r = 0; return true; -- cgit v1.2.3 From 5d6d6a7d7e66a98bb3432478d226c68c219aaba3 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Oct 2023 10:16:10 +0100 Subject: KVM: x86: Refine calculation of guest wall clock to use a single TSC read MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When populating the guest's PV wall clock information, KVM currently does a simple 'kvm_get_real_ns() - get_kvmclock_ns(kvm)'. This is an antipattern which should be avoided; when working with the relationship between two clocks, it's never correct to obtain one of them "now" and then the other at a slightly different "now" after an unspecified period of preemption (which might not even be under the control of the kernel, if this is an L1 hosting an L2 guest under nested virtualization). Add a kvm_get_wall_clock_epoch() function to return the guest wall clock epoch in nanoseconds using the same method as __get_kvmclock() — by using kvm_get_walltime_and_clockread() to calculate both the wall clock and KVM clock time from a *single* TSC reading. The condition using get_cpu_tsc_khz() is equivalent to the version in __get_kvmclock() which separately checks for the CONSTANT_TSC feature or the per-CPU cpu_tsc_khz. Which is what get_cpu_tsc_khz() does anyway. 
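To make the epoch calculation concrete, a toy model follows; every constant in it is invented, the arithmetic is deliberately coarse, and it is not the kernel implementation.

/*
 * Toy model (not kernel code) of the calculation described above: sample the
 * TSC once, evaluate both the wall clock and the kvmclock at that one sample,
 * and subtract, so preemption between two separate "now" reads can no longer
 * skew the guest's epoch.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ull

static const uint64_t tsc_hz = 2000000000ull;			/* assumed invariant TSC */
static const uint64_t boot_wall_ns = 1700000000ull * NSEC_PER_SEC;

static uint64_t wall_clock_ns_at(uint64_t tsc)			/* CLOCK_REALTIME model */
{
	return boot_wall_ns + (tsc / tsc_hz) * NSEC_PER_SEC;	/* coarse on purpose */
}

static uint64_t kvmclock_ns_at(uint64_t tsc)			/* guest clock, zero at boot */
{
	return (tsc / tsc_hz) * NSEC_PER_SEC;
}

int main(void)
{
	uint64_t tsc = 123456789012ull;		/* ONE sample feeds both clocks */

	printf("guest epoch = %llu ns since 1970-01-01\n",
	       (unsigned long long)(wall_clock_ns_at(tsc) - kvmclock_ns_at(tsc)));
	return 0;
}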
Signed-off-by: David Woodhouse Link: https://lore.kernel.org/r/bfc6d3d7cfb88c47481eabbf5a30a264c58c7789.camel@infradead.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++----- arch/x86/kvm/x86.h | 1 + arch/x86/kvm/xen.c | 4 +-- 3 files changed, 81 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b5b325e19f2..42b33ef997a5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2331,14 +2331,9 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_o if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version))) return; - /* - * The guest calculates current wall clock time by adding - * system time (updated by kvm_guest_time_update below) to the - * wall clock specified here. We do the reverse here. - */ - wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); + wall_nsec = kvm_get_wall_clock_epoch(kvm); - wc.nsec = do_div(wall_nsec, 1000000000); + wc.nsec = do_div(wall_nsec, NSEC_PER_SEC); wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ wc.version = version; @@ -3241,6 +3236,82 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) return 0; } +/* + * The pvclock_wall_clock ABI tells the guest the wall clock time at + * which it started (i.e. its epoch, when its kvmclock was zero). + * + * In fact those clocks are subtly different; wall clock frequency is + * adjusted by NTP and has leap seconds, while the kvmclock is a + * simple function of the TSC without any such adjustment. + * + * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between + * that and kvmclock, but even that would be subject to change over + * time. + * + * Attempt to calculate the epoch at a given moment using the *same* + * TSC reading via kvm_get_walltime_and_clockread() to obtain both + * wallclock and kvmclock times, and subtracting one from the other. + * + * Fall back to using their values at slightly different moments by + * calling ktime_get_real_ns() and get_kvmclock_ns() separately. + */ +uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + struct pvclock_vcpu_time_info hv_clock; + struct kvm_arch *ka = &kvm->arch; + unsigned long seq, local_tsc_khz; + struct timespec64 ts; + uint64_t host_tsc; + + do { + seq = read_seqcount_begin(&ka->pvclock_sc); + + local_tsc_khz = 0; + if (!ka->use_master_clock) + break; + + /* + * The TSC read and the call to get_cpu_tsc_khz() must happen + * on the same CPU. + */ + get_cpu(); + + local_tsc_khz = get_cpu_tsc_khz(); + + if (local_tsc_khz && + !kvm_get_walltime_and_clockread(&ts, &host_tsc)) + local_tsc_khz = 0; /* Fall back to old method */ + + put_cpu(); + + /* + * These values must be snapshotted within the seqcount loop. + * After that, it's just mathematics which can happen on any + * CPU at any time. + */ + hv_clock.tsc_timestamp = ka->master_cycle_now; + hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; + + } while (read_seqcount_retry(&ka->pvclock_sc, seq)); + + /* + * If the conditions were right, and obtaining the wallclock+TSC was + * successful, calculate the KVM clock at the corresponding time and + * subtract one from the other to get the guest's epoch in nanoseconds + * since 1970-01-01. 
+ */ + if (local_tsc_khz) { + kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC, + &hv_clock.tsc_shift, + &hv_clock.tsc_to_system_mul); + return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec - + __pvclock_read_cycles(&hv_clock, host_tsc); + } +#endif + return ktime_get_real_ns() - get_kvmclock_ns(kvm); +} + /* * kvmclock updates which are isolated to a given vcpu, such as * vcpu->cpu migration, should not allow system_timestamp from diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 1e7be1f6ab29..5184fde1dc54 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -293,6 +293,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); u64 get_kvmclock_ns(struct kvm *kvm); +uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm); int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 40edf4d1974c..b946d9f28030 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -59,7 +59,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) * This code mirrors kvm_write_wall_clock() except that it writes * directly through the pfn cache and doesn't mark the page dirty. */ - wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); + wall_nsec = kvm_get_wall_clock_epoch(kvm); /* It could be invalid again already, so we need to check */ read_lock_irq(&gpc->lock); @@ -98,7 +98,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) wc_version = wc->version = (wc->version + 1) | 1; smp_wmb(); - wc->nsec = do_div(wall_nsec, 1000000000); + wc->nsec = do_div(wall_nsec, NSEC_PER_SEC); wc->sec = (u32)wall_nsec; *wc_sec_hi = wall_nsec >> 32; smp_wmb(); -- cgit v1.2.3 From 598a790fc20f06e5582c939a4c5864ff1105c477 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 29 Sep 2023 16:02:44 -0700 Subject: KVM: x86: Allow HWCR.McStatusWrEn to be cleared once set When HWCR is set to 0, store 0 in vcpu->arch.msr_hwcr. Fixes: 191c8137a939 ("x86/kvm: Implement HWCR support") Signed-off-by: Jim Mattson Link: https://lore.kernel.org/r/20230929230246.1954854-2-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42b33ef997a5..6f4534046803 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3791,12 +3791,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~(u64)0x8; /* ignore TLB cache disable */ /* Handle McStatusWrEn */ - if (data == BIT_ULL(18)) { - vcpu->arch.msr_hwcr = data; - } else if (data != 0) { + if (data & ~BIT_ULL(18)) { kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } + vcpu->arch.msr_hwcr = data; break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { -- cgit v1.2.3 From 8b0e00fba93449ecdda2c641e90c9b1f25f46669 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 29 Sep 2023 16:02:45 -0700 Subject: KVM: x86: Virtualize HWCR.TscFreqSel[bit 24] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On certain CPUs, Linux guests expect HWCR.TscFreqSel[bit 24] to be set. If it isn't set, they complain: [Firmware Bug]: TSC doesn't count with P0 frequency! Allow userspace (and the guest) to set this bit in the virtual HWCR to eliminate the above complaint. Allow the guest to write the bit even though its is R/O on *some* CPUs. 
Like many bits in HWCR, TscFreqSel is not architectural at all. On
Family 10h[1], it was R/W and powered on as 0. In Family 15h, one of
the "changes relative to Family 10H Revision D processors"[2] was:

  • MSRC001_0015 [Hardware Configuration (HWCR)]:
  • Dropped TscFreqSel; TSC can no longer be selected to run at NB
    P0-state.

Despite the "Dropped" above, that same document later describes
HWCR[bit 24] as follows:

  TscFreqSel: TSC frequency select. Read-only. Reset: 1. 1=The TSC
  increments at the P0 frequency

If the guest clears the bit, the worst case scenario is the guest will
be no worse off than it is today, e.g. the whining may return after a
guest clears the bit and kexec()'s into a new kernel.

[1] https://www.amd.com/content/dam/amd/en/documents/archived-tech-docs/programmer-references/31116.pdf
[2] https://www.amd.com/content/dam/amd/en/documents/archived-tech-docs/programmer-references/42301_15h_Mod_00h-0Fh_BKDG.pdf

Signed-off-by: Jim Mattson
Link: https://lore.kernel.org/r/20230929230246.1954854-3-jmattson@google.com
[sean: elaborate on why the bit is writable by the guest]
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/x86.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6f4534046803..6c3485215994 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3790,8 +3790,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
 		data &= ~(u64)0x8;	/* ignore TLB cache disable */
 
-		/* Handle McStatusWrEn */
-		if (data & ~BIT_ULL(18)) {
+		/*
+		 * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
+		 * through at least v6.6 whine if TscFreqSel is clear,
+		 * depending on F/M/S.
+		 */
+		if (data & ~(BIT_ULL(18) | BIT_ULL(24))) {
 			kvm_pr_unimpl_wrmsr(vcpu, msr, data);
 			return 1;
 		}
-- cgit v1.2.3


From 1affe455d66de4e1743514afdeb99cb598a25aa2 Mon Sep 17 00:00:00 2001
From: Yan Zhao
Date: Fri, 14 Jul 2023 14:50:06 +0800
Subject: KVM: x86/mmu: Add helpers to return if KVM honors guest MTRRs

Add helpers to check if KVM honors guest MTRRs instead of open coding
the logic in kvm_tdp_page_fault(). Future fixes and cleanups will also
need to determine if KVM should honor guest MTRRs, e.g. for CR0.CD
toggling and non-coherent DMA transitions.

Provide an inner helper, __kvm_mmu_honors_guest_mtrrs(), so that KVM
can check if guest MTRRs were honored when stopping non-coherent DMA.

Note, there is no need to explicitly check that TDP is enabled, KVM
clears shadow_memtype_mask when TDP is disabled, i.e. it's non-zero if
and only if EPT is enabled.
Suggested-by: Sean Christopherson Signed-off-by: Yan Zhao Link: https://lore.kernel.org/r/20230714065006.20201-1-yan.y.zhao@intel.com Link: https://lore.kernel.org/r/20230714065043.20258-1-yan.y.zhao@intel.com [sean: squash into a one patch, drop explicit TDP check massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu.h | 7 +++++++ arch/x86/kvm/mmu/mmu.c | 25 ++++++++++++++++--------- 2 files changed, 23 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 253fb2093d5d..bb8c86eefac0 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -237,6 +237,13 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } +bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma); + +static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm) +{ + return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm)); +} + void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f7901cb4d2fa..5d3dc7119e57 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4479,21 +4479,28 @@ out_unlock: } #endif -int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma) { /* - * If the guest's MTRRs may be used to compute the "real" memtype, - * restrict the mapping level to ensure KVM uses a consistent memtype - * across the entire mapping. If the host MTRRs are ignored by TDP - * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA - * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype - * from the guest's MTRRs so that guest accesses to memory that is - * DMA'd aren't cached against the guest's wishes. + * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the + * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is + * to honor the memtype from the guest's MTRRs so that guest accesses + * to memory that is DMA'd aren't cached against the guest's wishes. * * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs, * e.g. KVM will force UC memtype for host MMIO. */ - if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + return vm_has_noncoherent_dma && shadow_memtype_mask; +} + +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +{ + /* + * If the guest's MTRRs may be used to compute the "real" memtype, + * restrict the mapping level to ensure KVM uses a consistent memtype + * across the entire mapping. + */ + if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) { for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); gfn_t base = gfn_round_for_level(fault->gfn, -- cgit v1.2.3 From 7a18c7c2b69a2e47e95e2a381f9b01b9aae36747 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Fri, 14 Jul 2023 14:51:22 +0800 Subject: KVM: x86/mmu: Zap SPTEs when CR0.CD is toggled iff guest MTRRs are honored Zap SPTEs when CR0.CD is toggled if and only if KVM's MMU is honoring guest MTRRs, which is the only time that KVM incorporates the guest's CR0.CD into the final memtype. 
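A simplified standalone model of why CR0.CD only matters when guest MTRRs are honored is sketched below; the enum, the helper, and the quirk flag are illustrative stand-ins, not vmx_get_mt_mask() itself.

/*
 * Simplified model: with no non-coherent DMA the effective memtype is forced
 * to WB and ignores both CR0.CD and the guest MTRRs, so toggling CR0.CD
 * leaves nothing stale to zap.  Only when non-coherent DMA is present do
 * CR0.CD (subject to the CD/NW quirk) and the MTRRs feed into the memtype.
 */
#include <stdbool.h>
#include <stdio.h>

enum memtype { MT_WB, MT_UC, MT_FROM_GUEST_MTRR };

static enum memtype effective_memtype(bool noncoherent_dma, bool cr0_cd,
				      bool cd_nw_quirk)
{
	if (!noncoherent_dma)
		return MT_WB;			/* guest MTRRs ignored entirely */
	if (cr0_cd)
		return cd_nw_quirk ? MT_WB : MT_UC;
	return MT_FROM_GUEST_MTRR;		/* per-range guest MTRR type */
}

int main(void)
{
	printf("no DMA, CD=1 -> %d\n", effective_memtype(false, true, false));
	printf("DMA,    CD=1 -> %d\n", effective_memtype(true, true, false));
	return 0;
}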
Suggested-by: Chao Gao Signed-off-by: Yan Zhao Link: https://lore.kernel.org/r/20230714065122.20315-1-yan.y.zhao@intel.com [sean: rephrase shortlog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f18b06bbda6..dc88b510db56 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -962,7 +962,7 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon kvm_mmu_reset_context(vcpu); if (((cr0 ^ old_cr0) & X86_CR0_CD) && - kvm_arch_has_noncoherent_dma(vcpu->kvm) && + kvm_mmu_honors_guest_mtrrs(vcpu->kvm) && !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); } -- cgit v1.2.3 From 9a3768191d95a9cc9a74f67ae971764def45acb4 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Fri, 14 Jul 2023 14:51:56 +0800 Subject: KVM: x86/mmu: Zap SPTEs on MTRR update iff guest MTRRs are honored When guest MTRRs are updated, zap SPTEs and do zap range calcluation if and only if KVM's MMU is honoring guest MTRRs, which is the only time that KVM incorporates the guest's MTRR type into the final memtype. Suggested-by: Chao Gao Suggested-by: Sean Christopherson Cc: Kai Huang Signed-off-by: Yan Zhao Link: https://lore.kernel.org/r/20230714065156.20375-1-yan.y.zhao@intel.com [sean: rephrase shortlog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/mtrr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 3eb6e7f47e96..a67c28a56417 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -320,7 +320,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; gfn_t start, end; - if (!tdp_enabled || !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) return; if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType) -- cgit v1.2.3 From bf328e22e47242695445a33da6887ba2ef9aec19 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sun, 8 Oct 2023 10:53:35 +0800 Subject: KVM: x86: Don't sync user-written TSC against startup values The legacy API for setting the TSC is fundamentally broken, and only allows userspace to set a TSC "now", without any way to account for time lost between the calculation of the value, and the kernel eventually handling the ioctl. To work around this, KVM has a hack which, if a TSC is set with a value which is within a second's worth of the last TSC "written" to any vCPU in the VM, assumes that userspace actually intended the two TSC values to be in sync and adjusts the newly-written TSC value accordingly. Thus, when a VMM restores a guest after suspend or migration using the legacy API, the TSCs aren't necessarily *right*, but at least they're in sync. This trick falls down when restoring a guest which genuinely has been running for less time than the 1 second of imprecision KVM allows for in in the legacy API. On *creation*, the first vCPU starts its TSC counting from zero, and the subsequent vCPUs synchronize to that. But then when the VMM tries to restore a vCPU's intended TSC, because the VM has been alive for less than 1 second and KVM's default TSC value for new vCPU's is '0', the intended TSC is within a second of the last "written" TSC and KVM incorrectly adjusts the intended TSC in an attempt to synchronize. 
But further hacks can be piled onto KVM's existing hackish ABI, and declare that the *first* value written by *userspace* (on any vCPU) should not be subject to this "correction", i.e. KVM can assume that the first write from userspace is not an attempt to sync up with TSC values that only come from the kernel's default vCPU creation. To that end: Add a flag, kvm->arch.user_set_tsc, protected by kvm->arch.tsc_write_lock, to record that a TSC for at least one vCPU in the VM *has* been set by userspace, and make the 1-second slop hack only trigger if user_set_tsc is already set. Note that userspace can explicitly request a *synchronization* of the TSC by writing zero. For the purpose of user_set_tsc, an explicit synchronization counts as "setting" the TSC, i.e. if userspace then subsequently writes an explicit non-zero value which happens to be within 1 second of the previous value, the new value will be "corrected". This behavior is deliberate, as treating explicit synchronization as "setting" the TSC preserves KVM's existing behaviour inasmuch as possible (KVM always applied the 1-second "correction" regardless of whether the write came from userspace vs. the kernel). Reported-by: Yong He Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217423 Suggested-by: Oliver Upton Original-by: Oliver Upton Original-by: Sean Christopherson Signed-off-by: Like Xu Tested-by: Yong He Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20231008025335.7419-1-likexu@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 34 ++++++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c356aa27fbee..da497cc56e84 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1332,6 +1332,7 @@ struct kvm_arch { int nr_vcpus_matched_tsc; u32 default_tsc_khz; + bool user_set_tsc; seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6c3485215994..756bdc1127ff 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2709,8 +2709,9 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, kvm_track_tsc_matching(vcpu); } -static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) +static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) { + u64 data = user_value ? *user_value : 0; struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; @@ -2725,25 +2726,37 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) if (vcpu->arch.virtual_tsc_khz) { if (data == 0) { /* - * detection of vcpu initialization -- need to sync - * with other vCPUs. This particularly helps to keep - * kvm_clock stable after CPU hotplug + * Force synchronization when creating a vCPU, or when + * userspace explicitly writes a zero value. */ synchronizing = true; - } else { + } else if (kvm->arch.user_set_tsc) { u64 tsc_exp = kvm->arch.last_tsc_write + nsec_to_cycles(vcpu, elapsed); u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; /* - * Special case: TSC write with a small delta (1 second) - * of virtual cycle time against real time is - * interpreted as an attempt to synchronize the CPU. 
+ * Here lies UAPI baggage: when a user-initiated TSC write has + * a small delta (1 second) of virtual cycle time against the + * previously set vCPU, we assume that they were intended to be + * in sync and the delta was only due to the racy nature of the + * legacy API. + * + * This trick falls down when restoring a guest which genuinely + * has been running for less time than the 1 second of imprecision + * which we allow for in the legacy API. In this case, the first + * value written by userspace (on any vCPU) should not be subject + * to this 'correction' to make it sync up with values that only + * come from the kernel's default vCPU creation. Make the 1-second + * slop hack only trigger if the user_set_tsc flag is already set. */ synchronizing = data < tsc_exp + tsc_hz && data + tsc_hz > tsc_exp; } } + if (user_value) + kvm->arch.user_set_tsc = true; + /* * For a reliable TSC, we can match TSC offsets, and for an unstable * TSC, we add elapsed time in this computation. We could let the @@ -3870,7 +3883,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_TSC: if (msr_info->host_initiated) { - kvm_synchronize_tsc(vcpu, data); + kvm_synchronize_tsc(vcpu, &data); } else { u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; adjust_tsc_offset_guest(vcpu, adj); @@ -5629,6 +5642,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; ns = get_kvmclock_base_ns(); + kvm->arch.user_set_tsc = true; __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); @@ -12055,7 +12069,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) if (mutex_lock_killable(&vcpu->mutex)) return; vcpu_load(vcpu); - kvm_synchronize_tsc(vcpu, 0); + kvm_synchronize_tsc(vcpu, NULL); vcpu_put(vcpu); /* poll control enabled by default */ -- cgit v1.2.3 From 362ff6dca5416e20646badf09394b9a8bfc000db Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Fri, 14 Jul 2023 14:52:23 +0800 Subject: KVM: x86/mmu: Zap KVM TDP when noncoherent DMA assignment starts/stops Zap KVM TDP when noncoherent DMA assignment starts (noncoherent dma count transitions from 0 to 1) or stops (noncoherent dma count transitions from 1 to 0). Before the zap, test if guest MTRR is to be honored after the assignment starts or was honored before the assignment stops. When there's no noncoherent DMA device, EPT memory type is ((MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT) When there're noncoherent DMA devices, EPT memory type needs to honor guest CR0.CD and MTRR settings. So, if noncoherent DMA count transitions between 0 and 1, EPT leaf entries need to be zapped to clear stale memory type. This issue might be hidden when the device is statically assigned with VFIO adding/removing MMIO regions of the noncoherent DMA devices for several times during guest boot, and current KVM MMU will call kvm_mmu_zap_all_fast() on the memslot removal. But if the device is hot-plugged, or if the guest has mmio_always_on for the device, the MMIO regions of it may only be added for once, then there's no path to do the EPT entries zapping to clear stale memory type. Therefore do the EPT zapping when noncoherent assignment starts/stops to ensure stale entries cleaned away. 
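The essence of the counting scheme -- only the 0<->1 transitions matter -- can be modeled with C11 atomics standing in for the kernel's atomic_t. This is a sketch of the transition detection only; the zap is stubbed out and none of the names below are KVM's.

  #include <stdatomic.h>
  #include <stdio.h>

  static atomic_int noncoherent_dma_count;

  static void zap_all_tdp(void)
  {
          puts("zap TDP leaf entries (effective memtype may have changed)");
  }

  static void register_noncoherent_dma(void)
  {
          /* atomic_fetch_add returns the old value: old == 0 means 0 -> 1. */
          if (atomic_fetch_add(&noncoherent_dma_count, 1) == 0)
                  zap_all_tdp();
  }

  static void unregister_noncoherent_dma(void)
  {
          /* old == 1 means 1 -> 0, i.e. the last non-coherent device is gone. */
          if (atomic_fetch_sub(&noncoherent_dma_count, 1) == 1)
                  zap_all_tdp();
  }

  int main(void)
  {
          register_noncoherent_dma();     /* zaps */
          register_noncoherent_dma();     /* count 1 -> 2, no zap */
          unregister_noncoherent_dma();   /* count 2 -> 1, no zap */
          unregister_noncoherent_dma();   /* zaps */
          return 0;
  }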
Signed-off-by: Yan Zhao Link: https://lore.kernel.org/r/20230714065223.20432-1-yan.y.zhao@intel.com [sean: fix misspelled words in comment and changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dc88b510db56..76d1a6421d46 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13188,15 +13188,30 @@ bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); +static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm) +{ + /* + * Non-coherent DMA assignment and de-assignment will affect + * whether KVM honors guest MTRRs and cause changes in memtypes + * in TDP. + * So, pass %true unconditionally to indicate non-coherent DMA was, + * or will be involved, and that zapping SPTEs might be necessary. + */ + if (__kvm_mmu_honors_guest_mtrrs(true)) + kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL)); +} + void kvm_arch_register_noncoherent_dma(struct kvm *kvm) { - atomic_inc(&kvm->arch.noncoherent_dma_count); + if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1) + kvm_noncoherent_dma_assignment_start_or_stop(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) { - atomic_dec(&kvm->arch.noncoherent_dma_count); + if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count)) + kvm_noncoherent_dma_assignment_start_or_stop(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); -- cgit v1.2.3 From c9f65a3f2d92e0de336177d0151dabaf3ed004e5 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Fri, 14 Jul 2023 14:53:26 +0800 Subject: KVM: VMX: drop IPAT in memtype when CD=1 for KVM_X86_QUIRK_CD_NW_CLEARED For KVM_X86_QUIRK_CD_NW_CLEARED is on, remove the IPAT (ignore PAT) bit in EPT memory types when cache is disabled and non-coherent DMA are present. To correctly emulate CR0.CD=1, UC + IPAT are required as memtype in EPT. However, as with commit fb279950ba02 ("KVM: vmx: obey KVM_QUIRK_CD_NW_CLEARED"), WB + IPAT are now returned to workaround a BIOS issue that guest MTRRs are enabled too late. Without this workaround, a super slow guest boot-up is expected during the pre-guest-MTRR-enabled period due to UC as the effective memory type for all guest memory. Absent emulating CR0.CD=1 with UC, it makes no sense to set IPAT when KVM is honoring the guest memtype. Removing the IPAT bit in this patch allows effective memory type to honor PAT values as well, as WB is the weakest memtype. It means if a guest explicitly claims UC as the memtype in PAT, the effective memory is UC instead of previous WB. If, for some unknown reason, a guest meets a slow boot-up issue with the removal of IPAT, it's desired to fix the blamed PAT in the guest. Returning guest MTRR type as if CR0.CD=0 is also not preferred because KVMs ABI for the quirk also requires KVM to force WB memtype regardless of guest MTRRs to workaround the slow guest boot-up issue. In the future, honoring guest PAT will also allow KVM to more precisely zap SPTEs when the effective memtype changes. E.g. by not forcing WB when CR0.CD=1, instead of zapping SPTEs when guest MTRRs change, KVM can skip MTRR-induced zaps if CR0.CD=1 and zap SPTEs for non-WB MTRR ranges when CR0.CD is toggled (WB MTRR SPTEs can be kept because they're WB regardless of CR0.CD). 
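The resulting CR0.CD=1 memtype selection is small enough to show as a standalone sketch. The constants below mirror the kernel's definitions to the best of my knowledge and should be treated as assumptions; the function is a model of the post-patch behavior, not the actual vmx_get_mt_mask().

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define MTRR_TYPE_UNCACHABLE    0               /* assumed to match the kernel */
  #define MTRR_TYPE_WRBACK        6
  #define VMX_EPT_MT_EPTE_SHIFT   3
  #define VMX_EPT_IPAT_BIT        (1ull << 6)

  /* With the quirk: WB *without* IPAT, so a guest PAT of UC still wins.
   * Without the quirk: UC with IPAT, faithfully emulating CR0.CD=1. */
  static uint64_t ept_memtype_for_cd1(bool quirk_cd_nw_cleared)
  {
          if (quirk_cd_nw_cleared)
                  return (uint64_t)MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;

          return ((uint64_t)MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
                 VMX_EPT_IPAT_BIT;
  }

  int main(void)
  {
          printf("quirk on : 0x%llx\n", (unsigned long long)ept_memtype_for_cd1(true));
          printf("quirk off: 0x%llx\n", (unsigned long long)ept_memtype_for_cd1(false));
          return 0;
  }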
The change of removing IPAT has been verified with normal boot-up time on old OVMF of commit c9e5618f84b0cb54a9ac2d7604f7b7e7859b45a7 as well, dated back to Apr 14 2015. Suggested-by: Sean Christopherson Signed-off-by: Yan Zhao Link: https://lore.kernel.org/r/20230714065326.20557-1-yan.y.zhao@intel.com [sean: massage changelog to apply patch without full series] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 72e3943f3693..36724e839269 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7579,8 +7579,6 @@ static int vmx_vm_init(struct kvm *kvm) static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { - u8 cache; - /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in * memory aliases with conflicting memory types and sometimes MCEs. * We have to be careful as to what are honored and when. @@ -7607,11 +7605,10 @@ static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) { if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - cache = MTRR_TYPE_WRBACK; + return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT; else - cache = MTRR_TYPE_UNCACHABLE; - - return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; + return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) | + VMX_EPT_IPAT_BIT; } return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; -- cgit v1.2.3 From 26951ec8623e915823985e86d2c428213f110659 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Fri, 13 Oct 2023 19:30:20 +0800 Subject: KVM: x86: Use octal for file permission Convert all module params to octal permissions to improve code readability and to make checkpatch happy: WARNING: Symbolic permissions 'S_IRUGO' are not preferred. Consider using octal permissions '0444'. 
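For reference, S_IRUGO is simply read permission for user, group and other, so the octal spellings checkpatch prefers can be reproduced from the standard permission bits. S_IRUGO itself is a kernel-internal convenience macro, so the snippet below ORs its components explicitly.

  #include <stdio.h>
  #include <sys/stat.h>

  int main(void)
  {
          /* S_IRUGO == S_IRUSR | S_IRGRP | S_IROTH */
          printf("S_IRUGO            = %04o\n", S_IRUSR | S_IRGRP | S_IROTH);
          /* S_IRUGO | S_IWUSR, i.e. the root-writable variant */
          printf("S_IRUGO | S_IWUSR  = %04o\n", S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR);
          return 0;
  }

Running it prints 0444 and 0644, matching the conversions in the diff below.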
Signed-off-by: Peng Hao Link: https://lore.kernel.org/r/20231013113020.77523-1-flyingpeng@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 20 ++++++++++---------- arch/x86/kvm/x86.c | 18 +++++++++--------- 3 files changed, 20 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9507df93f410..3872c4f3689c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -199,7 +199,7 @@ module_param_named(npt, npt_enabled, bool, 0444); /* allow nested virtualization in KVM/SVM */ static int nested = true; -module_param(nested, int, S_IRUGO); +module_param(nested, int, 0444); /* enable/disable Next RIP Save */ int nrips = true; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 72e3943f3693..f17cb5f84f4e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -82,28 +82,28 @@ bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); static bool __read_mostly enable_vnmi = 1; -module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); +module_param_named(vnmi, enable_vnmi, bool, 0444); bool __read_mostly flexpriority_enabled = 1; -module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); +module_param_named(flexpriority, flexpriority_enabled, bool, 0444); bool __read_mostly enable_ept = 1; -module_param_named(ept, enable_ept, bool, S_IRUGO); +module_param_named(ept, enable_ept, bool, 0444); bool __read_mostly enable_unrestricted_guest = 1; module_param_named(unrestricted_guest, - enable_unrestricted_guest, bool, S_IRUGO); + enable_unrestricted_guest, bool, 0444); bool __read_mostly enable_ept_ad_bits = 1; -module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); +module_param_named(eptad, enable_ept_ad_bits, bool, 0444); static bool __read_mostly emulate_invalid_guest_state = true; -module_param(emulate_invalid_guest_state, bool, S_IRUGO); +module_param(emulate_invalid_guest_state, bool, 0444); static bool __read_mostly fasteoi = 1; -module_param(fasteoi, bool, S_IRUGO); +module_param(fasteoi, bool, 0444); -module_param(enable_apicv, bool, S_IRUGO); +module_param(enable_apicv, bool, 0444); bool __read_mostly enable_ipiv = true; module_param(enable_ipiv, bool, 0444); @@ -114,10 +114,10 @@ module_param(enable_ipiv, bool, 0444); * use VMX instructions. 
*/ static bool __read_mostly nested = 1; -module_param(nested, bool, S_IRUGO); +module_param(nested, bool, 0444); bool __read_mostly enable_pml = 1; -module_param_named(pml, enable_pml, bool, S_IRUGO); +module_param_named(pml, enable_pml, bool, 0444); static bool __read_mostly error_on_inconsistent_vmcs_config = true; module_param(error_on_inconsistent_vmcs_config, bool, 0444); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 756bdc1127ff..94ccad95f7e0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -145,21 +145,21 @@ EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); static bool __read_mostly ignore_msrs = 0; -module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); +module_param(ignore_msrs, bool, 0644); bool __read_mostly report_ignored_msrs = true; -module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); +module_param(report_ignored_msrs, bool, 0644); EXPORT_SYMBOL_GPL(report_ignored_msrs); unsigned int min_timer_period_us = 200; -module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); +module_param(min_timer_period_us, uint, 0644); static bool __read_mostly kvmclock_periodic_sync = true; -module_param(kvmclock_periodic_sync, bool, S_IRUGO); +module_param(kvmclock_periodic_sync, bool, 0444); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; -module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); +module_param(tsc_tolerance_ppm, uint, 0644); /* * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables @@ -168,13 +168,13 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); * tuning, i.e. allows privileged userspace to set an exact advancement time. */ static int __read_mostly lapic_timer_advance_ns = -1; -module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); +module_param(lapic_timer_advance_ns, int, 0644); static bool __read_mostly vector_hashing = true; -module_param(vector_hashing, bool, S_IRUGO); +module_param(vector_hashing, bool, 0444); bool __read_mostly enable_vmware_backdoor = false; -module_param(enable_vmware_backdoor, bool, S_IRUGO); +module_param(enable_vmware_backdoor, bool, 0444); EXPORT_SYMBOL_GPL(enable_vmware_backdoor); /* @@ -186,7 +186,7 @@ static int __read_mostly force_emulation_prefix; module_param(force_emulation_prefix, int, 0644); int __read_mostly pi_inject_timer = -1; -module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); +module_param(pi_inject_timer, bint, 0644); /* Enable/disable PMU virtualization */ bool __read_mostly enable_pmu = true; -- cgit v1.2.3 From 3d30bfcbdc268b41a7d9253a9cb3082281223b19 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Mon, 16 Oct 2023 15:12:28 -0700 Subject: KVM: x86/mmu: Stop kicking vCPUs to sync the dirty log when PML is disabled Stop kicking vCPUs in kvm_arch_sync_dirty_log() when PML is disabled. Kicking vCPUs when PML is disabled serves no purpose and could negatively impact guest performance. This restores KVM's behavior to prior to 5.12 commit a018eba53870 ("KVM: x86: Move MMU's PML logic to common code"), which replaced a static_call_cond(kvm_x86_flush_log_dirty) with unconditional calls to kvm_vcpu_kick(). 
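For context, kvm_arch_sync_dirty_log() is reached from the dirty-log ioctls, so the vCPU kicks removed here were being triggered by ordinary userspace polling of the dirty bitmap. The fragment below is a hedged sketch of that userspace side, assuming an existing VM file descriptor and a memslot registered with KVM_MEM_LOG_DIRTY_PAGES; error handling is minimal and the helper name is made up.

  #include <linux/kvm.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <sys/ioctl.h>

  /* vm_fd, slot and npages are assumed to come from the surrounding program. */
  static int fetch_dirty_log(int vm_fd, unsigned int slot, size_t npages)
  {
          struct kvm_dirty_log log;
          size_t bitmap_bytes = ((npages + 63) / 64) * 8;

          memset(&log, 0, sizeof(log));
          log.slot = slot;
          log.dirty_bitmap = calloc(1, bitmap_bytes);
          if (!log.dirty_bitmap)
                  return -1;

          /* This ioctl is the path that lands in kvm_arch_sync_dirty_log(). */
          if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0)
                  perror("KVM_GET_DIRTY_LOG");

          free(log.dirty_bitmap);
          return 0;
  }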
Fixes: a018eba53870 ("KVM: x86: Move MMU's PML logic to common code") Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20231016221228.1348318-1-dmatlack@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 94ccad95f7e0..051ca1978442 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6355,6 +6355,9 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) struct kvm_vcpu *vcpu; unsigned long i; + if (!kvm_x86_ops.cpu_dirty_log_size) + return; + kvm_for_each_vcpu(i, vcpu, kvm) kvm_vcpu_kick(vcpu); } -- cgit v1.2.3 From 5a989bbead4cc9cd67c3911226ad857d293643d8 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Tue, 17 Oct 2023 23:26:10 +0000 Subject: KVM: x86: Update the variable naming in kvm_x86_ops.sched_in() Update the variable with name 'kvm' in kvm_x86_ops.sched_in() to 'vcpu' to avoid confusions. Variable naming in KVM has a clear convention that 'kvm' refers to pointer of type 'struct kvm *', while 'vcpu' refers to pointer of type 'struct kvm_vcpu *'. Fix this 9-year old naming issue for fun. Signed-off-by: Mingwei Zhang Link: https://lore.kernel.org/r/20231017232610.4008690-1-mizhang@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index da497cc56e84..63e08b17a7b7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1701,7 +1701,7 @@ struct kvm_x86_ops { void (*request_immediate_exit)(struct kvm_vcpu *vcpu); - void (*sched_in)(struct kvm_vcpu *kvm, int cpu); + void (*sched_in)(struct kvm_vcpu *vcpu, int cpu); /* * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero -- cgit v1.2.3 From d6800af51c76b6dae20e6023bbdc9b3da3ab5121 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 17 Oct 2023 15:51:02 +0000 Subject: KVM: x86: hyper-v: Don't auto-enable stimer on write from user-space Don't apply the stimer's counter side effects when modifying its value from user-space, as this may trigger spurious interrupts. For example: - The stimer is configured in auto-enable mode. - The stimer's count is set and the timer enabled. - The stimer expires, an interrupt is injected. - The VM is live migrated. - The stimer config and count are deserialized, auto-enable is ON, the stimer is re-enabled. - The stimer expires right away, and injects an unwarranted interrupt. 
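The fix (see the diff below) gates the side effects on whether the write came from the guest or from the host. A minimal model of the intended behavior, with the timer reduced to three fields and no Hyper-V plumbing:

  #include <stdbool.h>
  #include <stdint.h>

  struct stimer {
          uint64_t count;
          bool enable;
          bool auto_enable;
  };

  /* Apply the enable/disable side effects only for guest-initiated writes;
   * host (user-space restore) writes just store the value. */
  static void stimer_set_count(struct stimer *s, uint64_t count, bool host)
  {
          s->count = count;

          if (!host) {
                  if (s->count == 0)
                          s->enable = false;
                  else if (s->auto_enable)
                          s->enable = true;
          }
  }

  int main(void)
  {
          struct stimer s = { .auto_enable = true };

          stimer_set_count(&s, 1000, true);       /* restore: stays disabled */
          bool after_restore = s.enable;          /* false */

          stimer_set_count(&s, 1000, false);      /* guest MSR write: auto-enables */
          return (!after_restore && s.enable) ? 0 : 1;
  }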
Cc: stable@vger.kernel.org Fixes: 1f4b34f825e8 ("kvm/x86: Hyper-V SynIC timers") Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20231017155101.40677-1-nsaenz@amazon.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/hyperv.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 7c2dac6824e2..238afd7335e4 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -727,10 +727,12 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, stimer_cleanup(stimer); stimer->count = count; - if (stimer->count == 0) - stimer->config.enable = 0; - else if (stimer->config.auto_enable) - stimer->config.enable = 1; + if (!host) { + if (stimer->count == 0) + stimer->config.enable = 0; + else if (stimer->config.auto_enable) + stimer->config.enable = 1; + } if (stimer->config.enable) stimer_mark_pending(stimer, false); -- cgit v1.2.3 From 2081a8450ef803e2e1136a80975eebc8682879b5 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Sun, 1 Oct 2023 14:36:37 -0700 Subject: KVM: x86: remove always-false condition in kvmclock_sync_fn The 'kvmclock_periodic_sync' is a readonly param that cannot change after bootup. The kvm_arch_vcpu_postcreate() is not going to schedule the kvmclock_sync_work if kvmclock_periodic_sync == false. As a result, the "if (!kvmclock_periodic_sync)" can never be true if the kvmclock_sync_work = kvmclock_sync_fn() is scheduled. Link: https://lore.kernel.org/kvm/a461bf3f-c17e-9c3f-56aa-726225e8391d@oracle.com Signed-off-by: Dongli Zhang Link: https://lore.kernel.org/r/20231001213637.76686-1-dongli.zhang@oracle.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 051ca1978442..2b1d8e22502b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3374,9 +3374,6 @@ static void kvmclock_sync_fn(struct work_struct *work) kvmclock_sync_work); struct kvm *kvm = container_of(ka, struct kvm, arch); - if (!kvmclock_periodic_sync) - return; - schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); -- cgit v1.2.3 From 329369caeccb1a0cb0dc5f864d6a671ed8190c11 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 4 Oct 2023 20:12:37 -0700 Subject: x86: KVM: Add feature flag for CPUID.80000021H:EAX[bit 1] Define an X86_FEATURE_* flag for CPUID.80000021H:EAX.[bit 1], and advertise the feature to userspace via KVM_GET_SUPPORTED_CPUID. Per AMD's "Processor Programming Reference (PPR) for AMD Family 19h Model 61h, Revision B1 Processors (56713-B1-PUB)," this CPUID bit indicates that a WRMSR to MSR_FS_BASE, MSR_GS_BASE, or MSR_KERNEL_GS_BASE is non-serializing. This is a change in previously architected behavior. Effectively, this CPUID bit is a "defeature" bit, or a reverse polarity feature bit. When this CPUID bit is clear, the feature (serialization on WRMSR to any of these three MSRs) is available. When this CPUID bit is set, the feature is not available. KVM_GET_SUPPORTED_CPUID must pass this bit through from the underlying hardware, if it is set. Leaving the bit clear claims that WRMSR to these three MSRs will be serializing in a guest running under KVM. That isn't true. Though KVM could emulate the feature by intercepting writes to the specified MSRs, it does not do so today. 
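Userspace can query the new bit directly with the compiler's <cpuid.h> helpers; the sketch below checks CPUID.80000021H:EAX[bit 1] and is only meaningful on CPUs that actually report the 0x80000021 leaf (the leaf number and bit position are taken from the changelog above).

  #include <cpuid.h>
  #include <stdio.h>

  int main(void)
  {
          unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

          if (__get_cpuid_count(0x80000021, 0, &eax, &ebx, &ecx, &edx) &&
              (eax & (1u << 1)))
                  puts("WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing");
          else
                  puts("bit clear or leaf absent: the WRMSRs are expected to serialize");
          return 0;
  }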
The guest is allowed direct read/write access to these MSRs without interception, so the innate hardware behavior is preserved under KVM. Signed-off-by: Jim Mattson Reviewed-by: Maxim Levitsky Reviewed-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231005031237.1652871-1-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kvm/cpuid.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 58cb9495e40f..4af140cf5719 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -443,6 +443,7 @@ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ #define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */ +#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* "" WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ #define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */ #define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */ #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 552dc4f3899b..9b2d65d62d78 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -761,7 +761,8 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_8000_0021_EAX, F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ | - F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ + F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ | + F(WRMSR_XX_BASE_NS) ); kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB); -- cgit v1.2.3 From 1de9992f9de0a92b6e11133aba0e2be833c11084 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Wed, 6 Sep 2023 02:20:06 +0800 Subject: KVM: x86/mmu: Remove unnecessary ‘NULL’ values from sptep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't initialize "spte" and "sptep" in fast_page_fault() as they are both guaranteed (for all intents and purposes) to be written at the start of every loop iteration. Add a sanity check that "sptep" is non-NULL after walking the shadow page tables, as encountering a NULL root would result in "spte" not being written, i.e. would lead to uninitialized data or the previous value being consumed. Signed-off-by: Li zeming Link: https://lore.kernel.org/r/20230905182006.2964-1-zeming@nfschina.com [sean: rewrite changelog with --verbose] Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5d3dc7119e57..b0f01d605617 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3425,8 +3425,8 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; - u64 spte = 0ull; - u64 *sptep = NULL; + u64 spte; + u64 *sptep; uint retry_count = 0; if (!page_fault_can_be_fast(fault)) @@ -3442,6 +3442,14 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) else sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); + /* + * It's entirely possible for the mapping to have been zapped + * by a different task, but the root page should always be + * available as the vCPU holds a reference to its root(s). 
+ */ + if (WARN_ON_ONCE(!sptep)) + spte = REMOVED_SPTE; + if (!is_shadow_present_pte(spte)) break; -- cgit v1.2.3 From 122ae01c5159eb1584e07907aec89e2470622baa Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Thu, 19 Oct 2023 12:33:36 +0800 Subject: KVM: x86: remove the unused assigned_dev_head from kvm_arch Legacy device assignment was dropped years ago. This field is not used anymore. Signed-off-by: Liang Chen Link: https://lore.kernel.org/r/20231019043336.8998-1-liangchen.linux@gmail.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 63e08b17a7b7..f980a2a9dd7e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1284,7 +1284,6 @@ struct kvm_arch { */ spinlock_t mmu_unsync_pages_lock; - struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; bool iommu_noncoherent; #define __KVM_HAVE_ARCH_NONCOHERENT_DMA diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b1d8e22502b..923eeca1c971 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12425,7 +12425,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) goto out_uninit_mmu; INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); - INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ -- cgit v1.2.3 From 2770d4722036d6bd24bcb78e9cd7f6e572077d03 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Thu, 19 Oct 2023 18:06:57 +0200 Subject: KVM: x86: Ignore MSR_AMD64_TW_CFG access Hyper-V enabled Windows Server 2022 KVM VM cannot be started on Zen1 Ryzen since it crashes at boot with SYSTEM_THREAD_EXCEPTION_NOT_HANDLED + STATUS_PRIVILEGED_INSTRUCTION (in other words, because of an unexpected #GP in the guest kernel). This is because Windows tries to set bit 8 in MSR_AMD64_TW_CFG and can't handle receiving a #GP when doing so. Give this MSR the same treatment that commit 2e32b7190641 ("x86, kvm: Add MSR_AMD64_BU_CFG2 to the list of ignored MSRs") gave MSR_AMD64_BU_CFG2 under justification that this MSR is baremetal-relevant only. Although apparently it was then needed for Linux guests, not Windows as in this case. With this change, the aforementioned guest setup is able to finish booting successfully. This issue can be reproduced either on a Summit Ridge Ryzen (with just "-cpu host") or on a Naples EPYC (with "-cpu host,stepping=1" since EPYC is ordinarily stepping 2). Alternatively, userspace could solve the problem by using MSR filters, but forcing every userspace to define a filter isn't very friendly and doesn't add much, if any, value. The only potential hiccup is if one of these "baremetal-only" MSRs ever requires actual emulation and/or has F/M/S specific behavior. But if that happens, then KVM can still punt *that* handling to userspace since userspace MSR filters "win" over KVM's default handling. Signed-off-by: Maciej S. 
Szmigiero Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1ce85d9c7c9e9632393816cf19c902e0a3f411f1.1697731406.git.maciej.szmigiero@oracle.com [sean: call out MSR filtering alternative] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kvm/x86.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1d111350197f..8bcbebb56b8f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -553,6 +553,7 @@ #define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 +#define MSR_AMD64_TW_CFG 0xc0011023 #define MSR_AMD64_DE_CFG 0xc0011029 #define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 923eeca1c971..725df1f2e2b7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3722,6 +3722,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_AMD64_PATCH_LOADER: case MSR_AMD64_BU_CFG2: case MSR_AMD64_DC_CFG: + case MSR_AMD64_TW_CFG: case MSR_F15H_EX_CFG: break; @@ -4168,6 +4169,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_AMD64_BU_CFG2: case MSR_IA32_PERF_CTL: case MSR_AMD64_DC_CFG: + case MSR_AMD64_TW_CFG: case MSR_F15H_EX_CFG: /* * Intel Sandy Bridge CPUs must support the RAPL (running average power -- cgit v1.2.3 From fad505b2cb838fb52cb72fa22830824c80330f2f Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Mon, 2 Oct 2023 04:08:39 +0000 Subject: KVM: x86: Service NMI requests after PMI requests in VM-Enter path Service NMI and SMI requests after PMI requests in vcpu_enter_guest() so that KVM does not need to cancel and redo the VM-Enter if the guest configures its PMIs to be delivered as NMIs (likely) or SMIs (unlikely). Because APIC emulation "injects" NMIs via KVM_REQ_NMI, handling PMI requests after NMI requests (the likely case) means KVM won't detect the pending NMI request until the final check for outstanding requests. Detecting requests at the final stage is costly as KVM has already loaded guest state, potentially queued events for injection, disabled IRQs, dropped SRCU, etc., most of which needs to be unwound. Note that changing the order of request processing doesn't change the end result, as KVM's final check for outstanding requests prevents entering the guest until all requests are serviced. I.e. KVM will ultimately coalesce events (or not) regardless of the ordering. Using SPEC2017 benchmark programs running along with Intel vtune in a VM demonstrates that the following code change reduces 800~1500 canceled VM-Enters per second. 
Some glory details: Probe the invocation to vmx_cancel_injection(): $ perf probe -a vmx_cancel_injection $ perf stat -a -e probe:vmx_cancel_injection -I 10000 # per 10 seconds Partial results when SPEC2017 with Intel vtune are running in the VM: On kernel without the change: 10.010018010 14254 probe:vmx_cancel_injection 20.037646388 15207 probe:vmx_cancel_injection 30.078739816 15261 probe:vmx_cancel_injection 40.114033258 15085 probe:vmx_cancel_injection 50.149297460 15112 probe:vmx_cancel_injection 60.185103088 15104 probe:vmx_cancel_injection On kernel with the change: 10.003595390 40 probe:vmx_cancel_injection 20.017855682 31 probe:vmx_cancel_injection 30.028355883 34 probe:vmx_cancel_injection 40.038686298 31 probe:vmx_cancel_injection 50.048795162 20 probe:vmx_cancel_injection 60.069057747 19 probe:vmx_cancel_injection Suggested-by: Sean Christopherson Signed-off-by: Mingwei Zhang Link: https://lore.kernel.org/r/20231002040839.2630027-1-mizhang@google.com [sean: hoist PMU/PMI above SMI too, massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41cce5031126..6c75efa7a54b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10587,16 +10587,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); + if (kvm_check_request(KVM_REQ_PMU, vcpu)) + kvm_pmu_handle_event(vcpu); + if (kvm_check_request(KVM_REQ_PMI, vcpu)) + kvm_pmu_deliver_pmi(vcpu); #ifdef CONFIG_KVM_SMM if (kvm_check_request(KVM_REQ_SMI, vcpu)) process_smi(vcpu); #endif if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); - if (kvm_check_request(KVM_REQ_PMU, vcpu)) - kvm_pmu_handle_event(vcpu); - if (kvm_check_request(KVM_REQ_PMI, vcpu)) - kvm_pmu_deliver_pmi(vcpu); if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); if (test_bit(vcpu->arch.pending_ioapic_eoi, -- cgit v1.2.3
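As a closing illustration of why the ordering matters, the toy model below is not KVM code at all: it simulates a PMI handler that re-raises an NMI (a PMI delivered as an NMI) and a final "anything still pending?" check that forces the whole pass to be redone, loosely standing in for the canceled VM-Enter. Handling PMI before NMI lets the freshly raised NMI be picked up in the same pass.

  #include <stdbool.h>
  #include <stdio.h>

  static bool req_pmi, req_nmi;

  static void handle_pmi(void) { req_pmi = false; req_nmi = true; }  /* PMI delivered as NMI */
  static void handle_nmi(void) { req_nmi = false; }

  static int passes_needed(bool nmi_first)
  {
          int passes = 0;

          req_pmi = true;
          req_nmi = false;
          do {
                  passes++;
                  if (nmi_first) {
                          if (req_nmi) handle_nmi();
                          if (req_pmi) handle_pmi();
                  } else {
                          if (req_pmi) handle_pmi();
                          if (req_nmi) handle_nmi();
                  }
          } while (req_pmi || req_nmi);   /* "final check": redo the pass if anything is pending */

          return passes;
  }

  int main(void)
  {
          printf("NMI checked before PMI: %d pass(es)\n", passes_needed(true));   /* 2 */
          printf("PMI checked before NMI: %d pass(es)\n", passes_needed(false));  /* 1 */
          return 0;
  }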