From 574c3c55e969096cea770eda3375ff35ccf91702 Mon Sep 17 00:00:00 2001
From: Ben Gardon
Date: Mon, 15 Nov 2021 13:17:04 -0800
Subject: KVM: x86/mmu: Fix TLB flush range when handling disconnected pt

When recursively clearing out disconnected pts, the range based TLB
flush in handle_removed_tdp_mmu_page uses the wrong starting GFN,
resulting in the flush mostly missing the affected range. Fix this by
using base_gfn for the flush.

In response to feedback from David Matlack on the RFC version of this
patch, also move a few definitions into the for loop in the function to
prevent unintended references to them in the future.

Fixes: a066e61f13cf ("KVM: x86/mmu: Factor out handling of removed page tables")
CC: stable@vger.kernel.org
Signed-off-by: Ben Gardon
Message-Id: <20211115211704.2621644-1-bgardon@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/tdp_mmu.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index a54c3491af42..377a96718a2e 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -317,9 +317,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
 	int level = sp->role.level;
 	gfn_t base_gfn = sp->gfn;
-	u64 old_child_spte;
-	u64 *sptep;
-	gfn_t gfn;
 	int i;
 
 	trace_kvm_mmu_prepare_zap_page(sp);
@@ -327,8 +324,9 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
 	tdp_mmu_unlink_page(kvm, sp, shared);
 
 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-		sptep = rcu_dereference(pt) + i;
-		gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+		u64 *sptep = rcu_dereference(pt) + i;
+		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+		u64 old_child_spte;
 
 		if (shared) {
 			/*
@@ -374,7 +372,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
 					    shared);
 	}
 
-	kvm_flush_remote_tlbs_with_address(kvm, gfn,
+	kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
 					   KVM_PAGES_PER_HPAGE(level + 1));
 
 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
--
cgit v1.2.3
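The arithmetic behind the fix can be seen in a standalone sketch: a removed page table covers KVM_PAGES_PER_HPAGE(level + 1) pages starting at base_gfn, while the loop-local gfn exits the loop pointing at the table's *last* child. All names and values below are assumptions for illustration, not the kernel's definitions:

#include <stdio.h>
#include <stdint.h>

#define PT64_ENT_PER_PAGE 512

/* assumed x86-like layout: each level multiplies coverage by 512 pages */
static uint64_t pages_per_hpage(int level)
{
        return 1ULL << ((level - 1) * 9);
}

int main(void)
{
        int level = 1;               /* the removed table maps 4K pages */
        uint64_t base_gfn = 0x40000; /* first GFN covered by the table */
        uint64_t gfn = 0;

        for (int i = 0; i < PT64_ENT_PER_PAGE; i++)
                gfn = base_gfn + i * pages_per_hpage(level);

        /* after the loop, gfn points at the last child, not the first */
        printf("buggy flush start: %#llx, fixed flush start: %#llx, len: %llu pages\n",
               (unsigned long long)gfn, (unsigned long long)base_gfn,
               (unsigned long long)pages_per_hpage(level + 1));
        return 0;
}

With these assumed values the buggy flush starts 511 pages into a 512-page range, which is why the flush "mostly missed" the affected region.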
From 9dba4d24cbb5524dd39ab1e08886373b17f07ff2 Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Wed, 17 Nov 2021 08:16:17 +0100
Subject: x86/kvm: remove unused ack_notifier callbacks

Commit f52447261bc8c2 ("KVM: irq ack notification") introduced
ack_notifier() callbacks in struct kvm_pic and struct kvm_ioapic
without using them anywhere. Remove those callbacks again.

Signed-off-by: Juergen Gross
Message-Id: <20211117071617.19504-1-jgross@suse.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/ioapic.h | 1 -
 arch/x86/kvm/irq.h    | 1 -
 2 files changed, 2 deletions(-)

diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index e66e620c3bed..539333ac4b38 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -81,7 +81,6 @@ struct kvm_ioapic {
 	unsigned long irq_states[IOAPIC_NUM_PINS];
 	struct kvm_io_device dev;
 	struct kvm *kvm;
-	void (*ack_notifier)(void *opaque, int irq);
 	spinlock_t lock;
 	struct rtc_status rtc_status;
 	struct delayed_work eoi_inject;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 650642b18d15..c2d7cfe82d00 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -56,7 +56,6 @@ struct kvm_pic {
 	struct kvm_io_device dev_master;
 	struct kvm_io_device dev_slave;
 	struct kvm_io_device dev_elcr;
-	void (*ack_notifier)(void *opaque, int irq);
 	unsigned long irq_states[PIC_NUM_PINS];
 };
--
cgit v1.2.3

From c7785d85b6c6cc9f3d0f1a8cab128f4062b30abb Mon Sep 17 00:00:00 2001
From: Hou Wenlong
Date: Wed, 17 Nov 2021 17:20:39 +0800
Subject: KVM: x86/mmu: Skip tlb flush if it has been done in zap_gfn_range()

If the parameter flush is set, zap_gfn_range() flushes the remote TLB
when it yields, so no TLB flush is needed by the caller afterwards. Use
the return value of zap_gfn_range() directly instead of OR-ing it into
flush in kvm_unmap_gfn_range() and kvm_tdp_mmu_unmap_gfn_range().

Fixes: 3039bcc744980 ("KVM: Move x86's MMU notifier memslot walkers to generic code")
Signed-off-by: Hou Wenlong
Message-Id: <5e16546e228877a4d974f8c0e448a93d52c7a5a9.1637140154.git.houwenlong93@linux.alibaba.com>
Reviewed-by: Sean Christopherson
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/mmu.c     | 2 +-
 arch/x86/kvm/mmu/tdp_mmu.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3be9beea838d..0a8436ea0090 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1582,7 +1582,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 		flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
 
 	if (is_tdp_mmu_enabled(kvm))
-		flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
+		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
 
 	return flush;
 }
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 377a96718a2e..1f8c9f783b78 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1032,8 +1032,8 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
 	struct kvm_mmu_page *root;
 
 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
-		flush |= zap_gfn_range(kvm, root, range->start, range->end,
-				       range->may_block, flush, false);
+		flush = zap_gfn_range(kvm, root, range->start, range->end,
+				      range->may_block, flush, false);
 
 	return flush;
 }
--
cgit v1.2.3
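The difference between |= and = matters because the callee consumes the caller's pending-flush state when it yields: if it flushed on the caller's behalf, it may legitimately return false even though flush was true on entry. A toy model of that contract (assumed callee behavior, purely for illustration):

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy model: the callee flushes any pending work (including the caller's
 * "flush" debt) whenever it yields, so its return value alone tells the
 * caller whether a flush is still owed.
 */
static bool zap_range(bool flush, bool yielded)
{
        if (yielded && flush)
                flush = false;  /* remote TLB flush performed; debt paid */
        /* further zapping could set flush again; elided in this sketch */
        return flush;
}

int main(void)
{
        bool flush = true;

        bool buggy = flush | zap_range(flush, true); /* keeps paid debt alive */
        bool fixed = zap_range(flush, true);         /* trusts the callee */

        printf("buggy=%d fixed=%d\n", buggy, fixed); /* buggy=1 fixed=0 */
        return 0;
}

The OR variant is merely redundant here (an extra flush), but it hides the callee's accounting, which is exactly what the patch removes.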
From 8ed716ca7dc91f058be0ba644a3048667a20db13 Mon Sep 17 00:00:00 2001
From: Hou Wenlong
Date: Wed, 17 Nov 2021 17:20:40 +0800
Subject: KVM: x86/mmu: Pass parameter flush as false in kvm_tdp_mmu_zap_collapsible_sptes()

Since the TLB flush has already been done for the legacy MMU before
kvm_tdp_mmu_zap_collapsible_sptes(), the parameter flush passed to
kvm_tdp_mmu_zap_collapsible_sptes() should be false.

Fixes: e2209710ccc5d ("KVM: x86/mmu: Skip rmap operations if rmaps not allocated")
Signed-off-by: Hou Wenlong
Message-Id: <21453a1d2533afb6e59fb6c729af89e771ff2e76.1637140154.git.houwenlong93@linux.alibaba.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0a8436ea0090..0c839ee1282c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5854,7 +5854,7 @@ restart:
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 				   const struct kvm_memory_slot *slot)
 {
-	bool flush = false;
+	bool flush;
 
 	if (kvm_memslots_have_rmaps(kvm)) {
 		write_lock(&kvm->mmu_lock);
@@ -5871,7 +5871,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 
 	if (is_tdp_mmu_enabled(kvm)) {
 		read_lock(&kvm->mmu_lock);
-		flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
+		flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, false);
 		if (flush)
 			kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
 		read_unlock(&kvm->mmu_lock);
--
cgit v1.2.3

From 756e1fc16505c31c9f86b602fcb8e2bc55c4b7e5 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 4 Nov 2021 16:41:06 +0000
Subject: KVM: RISC-V: Unmap stage2 mapping when deleting/moving a memslot

Unmap stage2 page tables when a memslot is being deleted or moved. It's
the architectures' responsibility to ensure existing mappings are
removed when kvm_arch_flush_shadow_memslot() returns.

Fixes: 9d05c1fee837 ("RISC-V: KVM: Implement stage2 page table programming")
Signed-off-by: Sean Christopherson
Signed-off-by: Anup Patel
---
 arch/riscv/kvm/mmu.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index d81bae8eb55e..fc058ff5f4b6 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -453,6 +453,12 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 				   struct kvm_memory_slot *slot)
 {
+	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
+	phys_addr_t size = slot->npages << PAGE_SHIFT;
+
+	spin_lock(&kvm->mmu_lock);
+	stage2_unmap_range(kvm, gpa, size, false);
+	spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
--
cgit v1.2.3
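The gpa/size computation in the new kvm_arch_flush_shadow_memslot() body is plain shift arithmetic over the memslot's frame number and page count. A standalone sketch with assumed example values:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12   /* 4K pages */

int main(void)
{
        uint64_t base_gfn = 0x80000; /* assumed memslot base frame */
        uint64_t npages   = 256;     /* assumed memslot size in pages */

        uint64_t gpa  = base_gfn << PAGE_SHIFT;
        uint64_t size = npages << PAGE_SHIFT;

        printf("unmap stage2 range [%#llx, %#llx)\n",
               (unsigned long long)gpa, (unsigned long long)(gpa + size));
        return 0;
}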
From 74c2e97b01846eb237b7819a3e2944455cfdb26a Mon Sep 17 00:00:00 2001
From: Anup Patel
Date: Wed, 17 Nov 2021 10:30:29 +0530
Subject: RISC-V: KVM: Fix incorrect KVM_MAX_VCPUS value

The KVM_MAX_VCPUS value is supposed to be aligned with the number of
VMID bits in the hgatp CSR, but the current KVM_MAX_VCPUS value is
aligned with the number of ASID bits in the satp CSR.

Fixes: 99cdc6c18c2d ("RISC-V: Add initial skeletal KVM support")
Signed-off-by: Anup Patel
Reviewed-by: Atish Patra
---
 arch/riscv/include/asm/kvm_host.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 25ba21f98504..2639b9ee48f9 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -12,14 +12,12 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
-#ifdef CONFIG_64BIT
-#define KVM_MAX_VCPUS			(1U << 16)
-#else
-#define KVM_MAX_VCPUS			(1U << 9)
-#endif
+#define KVM_MAX_VCPUS \
+	((HGATP_VMID_MASK >> HGATP_VMID_SHIFT) + 1)
 
 #define KVM_HALT_POLL_NS_DEFAULT	500000
--
cgit v1.2.3
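The new definition derives the limit from the width of the hgatp VMID field instead of hardcoding it. A sketch of the same macro with an assumed 14-bit RV64 VMID field (the real mask and shift come from the csr header, not from here):

#include <stdio.h>

/* assumed RV64 hgatp layout for illustration: 14-bit VMID at bit 44 */
#define HGATP_VMID_SHIFT        44
#define HGATP_VMID_MASK         (0x3FFFULL << HGATP_VMID_SHIFT)

#define KVM_MAX_VCPUS \
        ((HGATP_VMID_MASK >> HGATP_VMID_SHIFT) + 1)

int main(void)
{
        /* mask >> shift isolates the field; +1 turns max value into a count */
        printf("KVM_MAX_VCPUS = %llu\n",
               (unsigned long long)KVM_MAX_VCPUS); /* 16384 with these bits */
        return 0;
}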
From 83bb2c1a01d7127d5adc7d69d7aaa3f7072de2b4 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Tue, 16 Nov 2021 10:20:06 +0000
Subject: KVM: arm64: Save PSTATE early on exit

In order to be able to use primitives such as vcpu_mode_is_32bit(),
we need to synchronize the guest PSTATE. However, this is currently
done deep into the bowels of the world-switch code, and we do have
helpers evaluating this much earlier (__vgic_v3_perform_cpuif_access
and handle_aarch32_guest, for example).

Move the saving of the guest pstate into the early fixups, which
cures the first issue. The second one will be addressed separately.

Tested-by: Fuad Tabba
Reviewed-by: Fuad Tabba
Signed-off-by: Marc Zyngier
---
 arch/arm64/kvm/hyp/include/hyp/switch.h    | 6 ++++++
 arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 7 ++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 7a0af1d39303..d79fd101615f 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -429,6 +429,12 @@ static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
  */
 static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
 {
+	/*
+	 * Save PSTATE early so that we can evaluate the vcpu mode
+	 * early on.
+	 */
+	vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR);
+
 	if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
 		vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR);
 
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index de7e14c862e6..7ecca8b07851 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -70,7 +70,12 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
 static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
 {
 	ctxt->regs.pc = read_sysreg_el2(SYS_ELR);
-	ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR);
+	/*
+	 * Guest PSTATE gets saved at guest fixup time in all
+	 * cases. We still need to handle the nVHE host side here.
+	 */
+	if (!has_vhe() && ctxt->__hyp_running_vcpu)
+		ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR);
 
 	if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
 		ctxt_sys_reg(ctxt, DISR_EL1) = read_sysreg_s(SYS_VDISR_EL2);
--
cgit v1.2.3

From 7183b2b5ae6b8d77a37069566d77cf2a74060f7e Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Tue, 16 Nov 2021 12:39:35 +0000
Subject: KVM: arm64: Move pkvm's special 32bit handling into a generic infrastructure

Protected KVM is trying to turn AArch32 exceptions into an illegal
exception entry. Unfortunately, it does that in a way that is a bit
abrupt, and too early for PSTATE to be available.

Instead, move it to the fixup code, which is a more reasonable place
for it. This will also be useful for the NV code.

Reviewed-by: Fuad Tabba
Signed-off-by: Marc Zyngier
---
 arch/arm64/kvm/hyp/include/hyp/switch.h | 8 ++++++++
 arch/arm64/kvm/hyp/nvhe/switch.c        | 8 +-------
 arch/arm64/kvm/hyp/vhe/switch.c         | 4 ++++
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index d79fd101615f..96c5f3fb7838 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -403,6 +403,8 @@ typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *);
 
 static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu);
 
+static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code);
+
 /*
  * Allow the hypervisor to handle the exit with an exit handler if it has one.
  *
@@ -435,6 +437,12 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
 	 */
 	vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR);
 
+	/*
+	 * Check whether we want to repaint the state one way or
+	 * another.
+	 */
+	early_exit_filter(vcpu, exit_code);
+
 	if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
 		vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR);
 
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index c0e3fed26d93..d13115a12434 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -233,7 +233,7 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
  * Returns false if the guest ran in AArch32 when it shouldn't have, and
  * thus should exit to the host, or true if a the guest run loop can continue.
  */
-static bool handle_aarch32_guest(struct kvm_vcpu *vcpu, u64 *exit_code)
+static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
 {
 	struct kvm *kvm = kern_hyp_va(vcpu->kvm);
 
@@ -248,10 +248,7 @@ static bool handle_aarch32_guest(struct kvm_vcpu *vcpu, u64 *exit_code)
 		vcpu->arch.target = -1;
 		*exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT);
 		*exit_code |= ARM_EXCEPTION_IL;
-		return false;
 	}
-
-	return true;
 }
 
 /* Switch to the guest for legacy non-VHE systems */
@@ -316,9 +313,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 		/* Jump in the fire! */
 		exit_code = __guest_enter(vcpu);
 
-		if (unlikely(!handle_aarch32_guest(vcpu, &exit_code)))
-			break;
-
 		/* And we're baaack! */
 	} while (fixup_guest_exit(vcpu, &exit_code));
 
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 5a2cb5d9bc4b..fbb26b93c347 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -112,6 +112,10 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
 	return hyp_exit_handlers;
 }
 
+static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+}
+
 /* Switch to the guest for VHE systems running in EL2 */
 static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
 {
--
cgit v1.2.3
From 1f80d15020d7f130194821feb1432b67648c632d Mon Sep 17 00:00:00 2001
From: Catalin Marinas
Date: Thu, 25 Nov 2021 15:20:14 +0000
Subject: KVM: arm64: Avoid setting the upper 32 bits of TCR_EL2 and CPTR_EL2 to 1

Having a signed (1 << 31) constant for TCR_EL2_RES1 and CPTR_EL2_TCPAC
causes the upper 32 bits to be set to 1 when assigning them to a 64-bit
variable. Bit 32 in TCR_EL2 is no longer RES0 in ARMv8.7: with FEAT_LPA2
it changes the meaning of bits 49:48 and 9:8 in the stage 1 EL2 page
table entries. As a result of the sign-extension, a non-VHE kernel can
no longer boot on a model with ARMv8.7 enabled.

CPTR_EL2 still has the top 32 bits RES0, but we should preempt any
future problems. Make these top bit constants unsigned as per commit
df655b75c43f ("arm64: KVM: Avoid setting the upper 32 bits of VTCR_EL2
to 1").

Signed-off-by: Catalin Marinas
Reported-by: Chris January
Cc:
Cc: Will Deacon
Cc: Marc Zyngier
Signed-off-by: Marc Zyngier
Link: https://lore.kernel.org/r/20211125152014.2806582-1-catalin.marinas@arm.com
---
 arch/arm64/include/asm/kvm_arm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index a39fcf318c77..01d47c5886dc 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -91,7 +91,7 @@
 #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
 
 /* TCR_EL2 Registers bits */
-#define TCR_EL2_RES1		((1 << 31) | (1 << 23))
+#define TCR_EL2_RES1		((1U << 31) | (1 << 23))
 #define TCR_EL2_TBI		(1 << 20)
 #define TCR_EL2_PS_SHIFT	16
 #define TCR_EL2_PS_MASK		(7 << TCR_EL2_PS_SHIFT)
@@ -276,7 +276,7 @@
 #define CPTR_EL2_TFP_SHIFT 10
 
 /* Hyp Coprocessor Trap Register */
-#define CPTR_EL2_TCPAC	(1 << 31)
+#define CPTR_EL2_TCPAC	(1U << 31)
 #define CPTR_EL2_TAM	(1 << 30)
 #define CPTR_EL2_TTA	(1 << 20)
 #define CPTR_EL2_TFP	(1 << CPTR_EL2_TFP_SHIFT)
--
cgit v1.2.3
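The sign-extension pitfall is easy to reproduce in isolation: (1 << 31) is a negative int, so widening it to a 64-bit type fills bits 63:32 with ones. A minimal sketch:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t bad  = (1 << 31) | (1 << 23);  /* int, sign-extended */
        uint64_t good = (1U << 31) | (1 << 23); /* unsigned, zero-extended */

        printf("bad  = %#llx\n", (unsigned long long)bad);  /* 0xffffffff80800000 */
        printf("good = %#llx\n", (unsigned long long)good); /* 0x80800000 */
        return 0;
}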
From 8503fea6761de32b72585001ac94e5f81ce8ca44 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Mon, 22 Nov 2021 18:20:16 -0500
Subject: KVM: VMX: do not use uninitialized gfn_to_hva_cache

An uninitialized gfn_to_hva_cache has ghc->len == 0, which causes
the accessors to croak very loudly. While a BUG_ON is definitely
_too_ loud and a bug on its own, there is indeed an issue of using
the caches in such a way that they could not have been initialized,
because ghc->gpa == 0 might match and thus kvm_gfn_to_hva_cache_init
would not be called.

For the vmcs12_cache, the solution is simply to invoke
kvm_gfn_to_hva_cache_init unconditionally: we already know that the
cache does not match the current VMCS pointer. For the
shadow_vmcs12_cache, there is no similar condition that checks the
VMCS link pointer, so invalidate the cache on VMXON.

Fixes: cee66664dcd6 ("KVM: nVMX: Use a gfn_to_hva_cache for vmptrld")
Acked-by: David Woodhouse
Reported-by: syzbot+7b7db8bb4db6fd5e157b@syzkaller.appspotmail.com
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/nested.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1e2f66951566..315fa456d368 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4857,6 +4857,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 	if (!vmx->nested.cached_vmcs12)
 		goto out_cached_vmcs12;
 
+	vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
 	vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
 	if (!vmx->nested.cached_shadow_vmcs12)
 		goto out_cached_shadow_vmcs12;
@@ -5289,8 +5290,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 		struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
 		struct vmcs_hdr hdr;
 
-		if (ghc->gpa != vmptr &&
-		    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
+		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
 			/*
 			 * Reads from an unbacked page return all 1s,
 			 * which means that the 32 bits located at the
--
cgit v1.2.3

From 78311a514099932cd8434d5d2194aa94e56ab67c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Wed, 17 Nov 2021 07:35:44 -0500
Subject: KVM: x86: ignore APICv if LAPIC is not enabled

Synchronize the two calls to kvm_x86_sync_pir_to_irr. The one in the
reenter-guest fast path invoked the callback unconditionally even if
LAPIC is present but disabled. In this case, there are no interrupts
to deliver, and therefore posted interrupts can be ignored.

Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5a403d92833f..441f4769173e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9849,7 +9849,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
 			break;
 
-		if (vcpu->arch.apicv_active)
+		if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
 			static_call(kvm_x86_sync_pir_to_irr)(vcpu);
 
 		if (unlikely(kvm_vcpu_exit_request(vcpu))) {
--
cgit v1.2.3

From 30d7c5d60a886e3c89633ccf0ea4865276a759fe Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Thu, 18 Nov 2021 04:41:34 -0500
Subject: KVM: SEV: expose KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM capability

The capability, albeit present, was never exposed via KVM_CHECK_EXTENSION.

Fixes: b56639318bb2 ("KVM: SEV: Add support for SEV intra host migration")
Cc: Peter Gonda
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 441f4769173e..30c4d72bf717 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4133,6 +4133,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_SGX_ATTRIBUTE:
 #endif
 	case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+	case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
 	case KVM_CAP_SREGS2:
 	case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
 	case KVM_CAP_VCPU_ATTRIBUTES:
--
cgit v1.2.3
From 2b4a5a5d56881ece3c66b9a9a8943a6f41bd7349 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 25 Nov 2021 01:49:43 +0000
Subject: KVM: nVMX: Flush current VPID (L1 vs. L2) for KVM_REQ_TLB_FLUSH_GUEST

Flush the current VPID when handling KVM_REQ_TLB_FLUSH_GUEST instead
of always flushing vpid01. Any TLB flush that is triggered when L2 is
active is scoped to L2's VPID (if it has one), e.g. if L2 toggles
CR4.PGE and L1 doesn't intercept PGE writes, then KVM's emulation of
the TLB flush needs to be applied to L2's VPID.

Reported-by: Lai Jiangshan
Fixes: 07ffaf343e34 ("KVM: nVMX: Sync all PGDs on nested transition with shadow paging")
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson
Message-Id: <20211125014944.536398-2-seanjc@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/vmx.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ba66c171d951..18971cfadd4f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2918,6 +2918,13 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
 	}
 }
 
+static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
+{
+	if (is_guest_mode(vcpu))
+		return nested_get_vpid02(vcpu);
+	return to_vmx(vcpu)->vpid;
+}
+
 static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -2930,31 +2937,29 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
 	if (enable_ept)
 		ept_sync_context(construct_eptp(vcpu, root_hpa,
 						mmu->shadow_root_level));
-	else if (!is_guest_mode(vcpu))
-		vpid_sync_context(to_vmx(vcpu)->vpid);
 	else
-		vpid_sync_context(nested_get_vpid02(vcpu));
+		vpid_sync_context(vmx_get_current_vpid(vcpu));
 }
 
 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
 {
 	/*
-	 * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
+	 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
 	 * vmx_flush_tlb_guest() for an explanation of why this is ok.
 	 */
-	vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
+	vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
 }
 
 static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
 {
 	/*
-	 * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
-	 * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit
-	 * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+	 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
+	 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
+	 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
 	 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
 	 * i.e. no explicit INVVPID is necessary.
 	 */
-	vpid_sync_context(to_vmx(vcpu)->vpid);
+	vpid_sync_context(vmx_get_current_vpid(vcpu));
 }
 
 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
--
cgit v1.2.3
From 40e5f9080472b614eeedcc5ba678289cd98d70df Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 25 Nov 2021 01:49:43 +0000
Subject: KVM: nVMX: Abide to KVM_REQ_TLB_FLUSH_GUEST request on nested vmentry/vmexit

Like KVM_REQ_TLB_FLUSH_CURRENT, the GUEST variant needs to be serviced
at nested transitions, as KVM doesn't track requests for L1 vs L2.
E.g. if there's a pending flush when a nested VM-Exit occurs, then the
flush was requested in the context of L2 and needs to be handled before
switching to L1, otherwise the flush for L2 would effectively be lost.

Opportunistically add a helper to handle CURRENT and GUEST as a pair;
the logic for when they need to be serviced is identical as both
requests are tied to L1 vs. L2, the only difference is the scope of the
flush.

Reported-by: Lai Jiangshan
Fixes: 07ffaf343e34 ("KVM: nVMX: Sync all PGDs on nested transition with shadow paging")
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson
Message-Id: <20211125014944.536398-2-seanjc@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/nested.c |  8 +++-----
 arch/x86/kvm/x86.c        | 28 ++++++++++++++++++++++++----
 arch/x86/kvm/x86.h        |  7 +------
 3 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 315fa456d368..8e55aaef33ee 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3344,8 +3344,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 	};
 	u32 failed_index;
 
-	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
-		kvm_vcpu_flush_tlb_current(vcpu);
+	kvm_service_local_tlb_flush_requests(vcpu);
 
 	evaluate_pending_interrupts = exec_controls_get(vmx) &
 		(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
@@ -4502,9 +4501,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 		(void)nested_get_evmcs_page(vcpu);
 	}
 
-	/* Service the TLB flush request for L2 before switching to L1. */
-	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
-		kvm_vcpu_flush_tlb_current(vcpu);
+	/* Service pending TLB flush requests for L2 before switching to L1. */
+	kvm_service_local_tlb_flush_requests(vcpu);
 
 	/*
 	 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 30c4d72bf717..028151c309c9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3258,6 +3258,29 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
 	static_call(kvm_x86_tlb_flush_guest)(vcpu);
 }
 
+static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+	++vcpu->stat.tlb_flush;
+	static_call(kvm_x86_tlb_flush_current)(vcpu);
+}
+
+/*
+ * Service "local" TLB flush requests, which are specific to the current MMU
+ * context. In addition to the generic event handling in vcpu_enter_guest(),
+ * TLB flushes that are targeted at an MMU context also need to be serviced
+ * prior before nested VM-Enter/VM-Exit.
+ */
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
+{
+	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+		kvm_vcpu_flush_tlb_current(vcpu);
+
+	if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
+		kvm_vcpu_flush_tlb_guest(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
+
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
 	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
@@ -9649,10 +9672,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			/* Flushing all ASIDs flushes the current ASID... */
 			kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 		}
-		if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
-			kvm_vcpu_flush_tlb_current(vcpu);
-		if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
-			kvm_vcpu_flush_tlb_guest(vcpu);
+		kvm_service_local_tlb_flush_requests(vcpu);
 
 		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
 			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 997669ae9caa..4abcd8d9836d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -103,6 +103,7 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
 
 #define MSR_IA32_CR_PAT_DEFAULT  0x0007040600070406ULL
 
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
 int kvm_check_nested_events(struct kvm_vcpu *vcpu);
 
 static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
@@ -185,12 +186,6 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
 	return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
 }
 
-static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
-{
-	++vcpu->stat.tlb_flush;
-	static_call(kvm_x86_tlb_flush_current)(vcpu);
-}
-
 static inline int is_pae(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
--
cgit v1.2.3
From 712494de96f35f3e146b36b752c2afe0fdc0f0cc Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 25 Nov 2021 01:49:44 +0000
Subject: KVM: nVMX: Emulate guest TLB flush on nested VM-Enter with new vpid12

Fully emulate a guest TLB flush on nested VM-Enter which changes vpid12,
i.e. L2's VPID, instead of simply doing INVVPID to flush real hardware's
TLB entries for vpid02. From L1's perspective, changing L2's VPID is
effectively a TLB flush unless "hardware" has previously cached entries
for the new vpid12. Because KVM tracks only a single vpid12, KVM doesn't
know if the new vpid12 has been used in the past and so must treat it as
a brand new, never been used VPID, i.e. must assume that the new vpid12
represents a TLB flush from L1's perspective.

For example, if L1 and L2 share a CR3, the first VM-Enter to L2 (with a
VPID) is effectively a TLB flush as hardware/KVM has never seen vpid12
and thus can't have cached entries in the TLB for vpid12.

Reported-by: Lai Jiangshan
Fixes: 5c614b3583e7 ("KVM: nVMX: nested VPID emulation")
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson
Message-Id: <20211125014944.536398-3-seanjc@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/nested.c | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 8e55aaef33ee..64f2828035c2 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1162,29 +1162,26 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
 	WARN_ON(!enable_vpid);
 
 	/*
-	 * If VPID is enabled and used by vmc12, but L2 does not have a unique
-	 * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
-	 * a VPID for L2, flush the current context as the effective ASID is
-	 * common to both L1 and L2.
-	 *
-	 * Defer the flush so that it runs after vmcs02.EPTP has been set by
-	 * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
-	 * redundant flushes further down the nested pipeline.
-	 *
-	 * If a TLB flush isn't required due to any of the above, and vpid12 is
-	 * changing then the new "virtual" VPID (vpid12) will reuse the same
-	 * "real" VPID (vpid02), and so needs to be flushed. There's no direct
-	 * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
-	 * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate
-	 * guest-physical mappings, so there is no need to sync the nEPT MMU.
+	 * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
+	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
+	 * is the VPID incorporated into the MMU context. I.e. KVM must assume
+	 * that the new vpid12 has never been used and thus represents a new
+	 * guest ASID that cannot have entries in the TLB.
 	 */
-	if (!nested_has_guest_tlb_tag(vcpu)) {
-		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-	} else if (is_vmenter &&
-		   vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
 		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
-		vpid_sync_context(nested_get_vpid02(vcpu));
+		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+		return;
 	}
+
+	/*
+	 * If VPID is enabled, used by vmc12, and vpid12 is not changing but
+	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
+	 * KVM was unable to allocate a VPID for L2, flush the current context
+	 * as the effective ASID is common to both L1 and L2.
+	 */
+	if (!nested_has_guest_tlb_tag(vcpu))
+		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 }
 
 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
--
cgit v1.2.3
From feb627e8d6f69c9a319fe279710959efb3eba873 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov
Date: Mon, 22 Nov 2021 18:58:18 +0100
Subject: KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Commit 63f5a1909f9e ("KVM: x86: Alert userspace that KVM_SET_CPUID{,2}
after KVM_RUN is broken") officially deprecated KVM_SET_CPUID{,2} ioctls
after first successful KVM_RUN and promised to make this sequence
forbidden in 5.16. It's time to fulfil the promise.

Signed-off-by: Vitaly Kuznetsov
Message-Id: <20211122175818.608220-3-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/mmu.c | 28 +++++++++++-----------------
 arch/x86/kvm/x86.c     | 19 +++++++++++++++++++
 2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0c839ee1282c..0c44581721b0 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5025,6 +5025,14 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	/*
 	 * Invalidate all MMU roles to force them to reinitialize as CPUID
 	 * information is factored into reserved bit calculations.
+	 *
+	 * Correctly handling multiple vCPU models with respect to paging and
+	 * physical address properties) in a single VM would require tracking
+	 * all relevant CPUID information in kvm_mmu_page_role. That is very
+	 * undesirable as it would increase the memory requirements for
+	 * gfn_track (see struct kvm_mmu_page_role comments). For now that
+	 * problem is swept under the rug; KVM's CPUID API is horrific and
+	 * it's all but impossible to solve it without introducing a new API.
 	 */
 	vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
 	vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
@@ -5032,24 +5040,10 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	kvm_mmu_reset_context(vcpu);
 
 	/*
-	 * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
-	 * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
-	 * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
-	 * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise
-	 * sweep the problem under the rug.
-	 *
-	 * KVM's horrific CPUID ABI makes the problem all but impossible to
-	 * solve, as correctly handling multiple vCPU models (with respect to
-	 * paging and physical address properties) in a single VM would require
-	 * tracking all relevant CPUID information in kvm_mmu_page_role. That
-	 * is very undesirable as it would double the memory requirements for
-	 * gfn_track (see struct kvm_mmu_page_role comments), and in practice
-	 * no sane VMM mucks with the core vCPU model on the fly.
+	 * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
+	 * kvm_arch_vcpu_ioctl().
 	 */
-	if (vcpu->arch.last_vmentry_cpu != -1) {
-		pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n");
-		pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n");
-	}
+	KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
 }
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 028151c309c9..817898eab7c3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5148,6 +5148,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		struct kvm_cpuid __user *cpuid_arg = argp;
 		struct kvm_cpuid cpuid;
 
+		/*
+		 * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+		 * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
+		 * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+		 * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
+		 * the core vCPU model on the fly, so fail.
+		 */
+		r = -EINVAL;
+		if (vcpu->arch.last_vmentry_cpu != -1)
+			goto out;
+
 		r = -EFAULT;
 		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
 			goto out;
@@ -5158,6 +5169,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		struct kvm_cpuid2 __user *cpuid_arg = argp;
 		struct kvm_cpuid2 cpuid;
 
+		/*
+		 * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in
+		 * KVM_SET_CPUID case above.
+		 */
+		r = -EINVAL;
+		if (vcpu->arch.last_vmentry_cpu != -1)
+			goto out;
+
 		r = -EFAULT;
 		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
 			goto out;
--
cgit v1.2.3
From 12ec33a705749e18d9588b0a0e69e02821371156 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Wed, 24 Nov 2021 20:20:43 +0800
Subject: KVM: X86: Fix when shadow_root_level=5 && guest root_level<4

If there is an L1 with nNPT in 32-bit mode, the shadow walk starts with
pae_root.

Fixes: a717a780fc4e ("KVM: x86/mmu: Support shadowing NPT when 5-level paging is enabled in host")
Signed-off-by: Lai Jiangshan
Message-Id: <20211124122055.64424-2-jiangshanlai@gmail.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0c44581721b0..d7ae369ec8c2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2173,10 +2173,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
 	iterator->shadow_addr = root;
 	iterator->level = vcpu->arch.mmu->shadow_root_level;
 
-	if (iterator->level == PT64_ROOT_4LEVEL &&
+	if (iterator->level >= PT64_ROOT_4LEVEL &&
 	    vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
 	    !vcpu->arch.mmu->direct_map)
-		--iterator->level;
+		iterator->level = PT32E_ROOT_LEVEL;
 
 	if (iterator->level == PT32E_ROOT_LEVEL) {
 		/*
--
cgit v1.2.3
From 05b29633c7a956d5675f5fbba70db0d26aa5e73e Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Wed, 24 Nov 2021 20:20:46 +0800
Subject: KVM: X86: Use vcpu->arch.walk_mmu for kvm_mmu_invlpg()

INVLPG operates on guest virtual address, which are represented by
vcpu->arch.walk_mmu. In nested virtualization scenarios,
kvm_mmu_invlpg() was using the wrong MMU structure; if L2's invlpg were
emulated by L0 (in practice, it hardly happen) when nested
two-dimensional paging is enabled, the call to ->tlb_flush_gva() would
be skipped and the hardware TLB entry would not be invalidated.

Signed-off-by: Lai Jiangshan
Message-Id: <20211124122055.64424-5-jiangshanlai@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d7ae369ec8c2..5942e9c6dd6e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5363,7 +5363,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
-	kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE);
+	kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
 	++vcpu->stat.invlpg;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
--
cgit v1.2.3

From 7533377215b6ee432c06c5855f6be5d66e694e46 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Sat, 20 Nov 2021 01:50:08 +0000
Subject: KVM: x86/mmu: Use yield-safe TDP MMU root iter in MMU notifier unmapping

Use the yield-safe variant of the TDP MMU iterator when handling an
unmapping event from the MMU notifier, as most occurrences of the event
allow yielding.

Fixes: e1eed5847b09 ("KVM: x86/mmu: Allow yielding during MMU notifier unmap/zap, if possible")
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson
Message-Id: <20211120015008.3780032-1-seanjc@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/tdp_mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 1f8c9f783b78..4cd6bf7e73f0 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1031,7 +1031,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
 {
 	struct kvm_mmu_page *root;
 
-	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
+	for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
 		flush = zap_gfn_range(kvm, root, range->start, range->end,
 				      range->may_block, flush, false);
 
--
cgit v1.2.3
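The general shape of the yield-safe pattern: when the loop body may drop a lock and invalidate the current element, the iterator must sample (and, in the kernel, pin) the successor before the body runs. A generic userspace sketch of the idea, not the KVM macro itself:

#include <stdio.h>

struct node {
        int id;
        struct node *next;
};

/*
 * The body may "yield" (drop a lock) and invalidate pos; because the
 * successor was sampled beforehand, iteration still survives.
 */
#define for_each_node_yield_safe(pos, nxt, head) \
        for ((pos) = (head), (nxt) = (pos) ? (pos)->next : NULL; \
             (pos); \
             (pos) = (nxt), (nxt) = (pos) ? (pos)->next : NULL)

int main(void)
{
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
        struct node *pos, *nxt;

        for_each_node_yield_safe(pos, nxt, &a) {
                printf("visit %d\n", pos->id);
                pos->next = NULL; /* simulate the body invalidating pos */
        }
        return 0;
}

A plain iterator would stop after the first element here; the safe variant visits all three, which is why the unmap path must use it once zap_gfn_range() is allowed to yield.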
From 4b85c921cd393764d22c0cdab6d7d5d120aa0980 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Sat, 20 Nov 2021 04:50:21 +0000
Subject: KVM: x86/mmu: Remove spurious TLB flushes in TDP MMU zap collapsible path

Drop the "flush" param and return values to/from the TDP MMU's helper for
zapping collapsible SPTEs. Because the helper runs with mmu_lock held
for read, not write, it uses tdp_mmu_zap_spte_atomic(), and the atomic
zap handles the necessary remote TLB flush.

Similarly, because mmu_lock is dropped and re-acquired between zapping
legacy MMUs and zapping TDP MMUs, kvm_mmu_zap_collapsible_sptes() must
handle remote TLB flushes from the legacy MMU before calling into the
TDP MMU.

Fixes: e2209710ccc5d ("KVM: x86/mmu: Skip rmap operations if rmaps not allocated")
Signed-off-by: Sean Christopherson
Message-Id: <20211120045046.3940942-4-seanjc@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/mmu.c     |  9 ++-------
 arch/x86/kvm/mmu/tdp_mmu.c | 22 +++++++---------------
 arch/x86/kvm/mmu/tdp_mmu.h |  5 ++---
 3 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 5942e9c6dd6e..1b3a7cc9d595 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5848,8 +5848,6 @@ restart:
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 				   const struct kvm_memory_slot *slot)
 {
-	bool flush;
-
 	if (kvm_memslots_have_rmaps(kvm)) {
 		write_lock(&kvm->mmu_lock);
 		/*
@@ -5857,17 +5855,14 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 		 * logging at a 4k granularity and never creates collapsible
 		 * 2m SPTEs during dirty logging.
 		 */
-		flush = slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
-		if (flush)
+		if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
 			kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
 		write_unlock(&kvm->mmu_lock);
 	}
 
 	if (is_tdp_mmu_enabled(kvm)) {
 		read_lock(&kvm->mmu_lock);
-		flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, false);
-		if (flush)
-			kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+		kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
 		read_unlock(&kvm->mmu_lock);
 	}
 }
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 4cd6bf7e73f0..1db8496259ad 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1362,10 +1362,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
  * Clear leaf entries which could be replaced by large mappings, for
  * GFNs within the slot.
  */
-static bool zap_collapsible_spte_range(struct kvm *kvm,
+static void zap_collapsible_spte_range(struct kvm *kvm,
 				       struct kvm_mmu_page *root,
-				       const struct kvm_memory_slot *slot,
-				       bool flush)
+				       const struct kvm_memory_slot *slot)
 {
 	gfn_t start = slot->base_gfn;
 	gfn_t end = start + slot->npages;
@@ -1376,10 +1375,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
 
 	tdp_root_for_each_pte(iter, root, start, end) {
 retry:
-		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
-			flush = false;
+		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
 			continue;
-		}
 
 		if (!is_shadow_present_pte(iter.old_spte) ||
 		    !is_last_spte(iter.old_spte, iter.level))
@@ -1391,6 +1388,7 @@ retry:
 							    pfn, PG_LEVEL_NUM))
 			continue;
 
+		/* Note, a successful atomic zap also does a remote TLB flush. */
 		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
 			/*
 			 * The iter must explicitly re-read the SPTE because
@@ -1399,30 +1397,24 @@ retry:
 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
 			goto retry;
 		}
-		flush = true;
 	}
 
 	rcu_read_unlock();
-
-	return flush;
 }
 
 /*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
-bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-				       const struct kvm_memory_slot *slot,
-				       bool flush)
+void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+				       const struct kvm_memory_slot *slot)
 {
 	struct kvm_mmu_page *root;
 
 	lockdep_assert_held_read(&kvm->mmu_lock);
 
 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
-		flush = zap_collapsible_spte_range(kvm, root, slot, flush);
-
-	return flush;
+		zap_collapsible_spte_range(kvm, root, slot);
 }
 
 /*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 476b133544dd..3899004a5d91 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -64,9 +64,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 				       struct kvm_memory_slot *slot,
 				       gfn_t gfn, unsigned long mask,
 				       bool wrprot);
-bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-				       const struct kvm_memory_slot *slot,
-				       bool flush);
+void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+				       const struct kvm_memory_slot *slot);
 
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
 				   struct kvm_memory_slot *slot, gfn_t gfn,
--
cgit v1.2.3
From 28f091bc2f8c23b7eac2402956b692621be7f9f4 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Mon, 22 Nov 2021 13:01:37 -0500
Subject: KVM: MMU: shadow nested paging does not have PKU

Initialize the mask for PKU permissions as if CR4.PKE=0, avoiding
incorrect interpretations of the nested hypervisor's page tables.

Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 1b3a7cc9d595..0e017a3b7c27 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4855,7 +4855,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
 	struct kvm_mmu *context = &vcpu->arch.guest_mmu;
 	struct kvm_mmu_role_regs regs = {
 		.cr0 = cr0,
-		.cr4 = cr4,
+		.cr4 = cr4 & ~X86_CR4_PKE,
 		.efer = efer,
 	};
 	union kvm_mmu_role new_role;
@@ -4919,7 +4919,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 	context->direct_map = false;
 
 	update_permission_bitmask(context, true);
-	update_pkru_bitmask(context);
+	context->pkru_mask = 0;
 	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
 	reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
 }
--
cgit v1.2.3
From f47491d7f30b8ab084d0b1596697a7ea4561a894 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Sat, 20 Nov 2021 01:57:06 +0000
Subject: KVM: x86/mmu: Handle "default" period when selectively waking kthread

Account for the '0' being a default, "let KVM choose" period, when
determining whether or not the recovery worker needs to be awakened in
response to userspace reducing the period. Failure to do so results in
the worker not being awakened properly, e.g. when changing the period
from '0' to any small-ish value.

Fixes: 4dfe4f40d845 ("kvm: x86: mmu: Make NX huge page recovery period configurable")
Cc: stable@vger.kernel.org
Cc: Junaid Shahid
Signed-off-by: Sean Christopherson
Message-Id: <20211120015706.3830341-1-seanjc@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/mmu.c | 48 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0e017a3b7c27..6354297e92ae 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6171,23 +6171,46 @@ void kvm_mmu_module_exit(void)
 	mmu_audit_disable();
 }
 
+/*
+ * Calculate the effective recovery period, accounting for '0' meaning "let KVM
+ * select a halving time of 1 hour". Returns true if recovery is enabled.
+ */
+static bool calc_nx_huge_pages_recovery_period(uint *period)
+{
+	/*
+	 * Use READ_ONCE to get the params, this may be called outside of the
+	 * param setters, e.g. by the kthread to compute its next timeout.
+	 */
+	bool enabled = READ_ONCE(nx_huge_pages);
+	uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+	if (!enabled || !ratio)
+		return false;
+
+	*period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+	if (!*period) {
+		/* Make sure the period is not less than one second. */
+		ratio = min(ratio, 3600u);
+		*period = 60 * 60 * 1000 / ratio;
+	}
+	return true;
+}
+
 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
 {
 	bool was_recovery_enabled, is_recovery_enabled;
 	uint old_period, new_period;
 	int err;
 
-	was_recovery_enabled = nx_huge_pages_recovery_ratio;
-	old_period = nx_huge_pages_recovery_period_ms;
+	was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
 
 	err = param_set_uint(val, kp);
 	if (err)
 		return err;
 
-	is_recovery_enabled = nx_huge_pages_recovery_ratio;
-	new_period = nx_huge_pages_recovery_period_ms;
+	is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
 
-	if (READ_ONCE(nx_huge_pages) && is_recovery_enabled &&
+	if (is_recovery_enabled &&
 	    (!was_recovery_enabled || old_period > new_period)) {
 		struct kvm *kvm;
 
@@ -6251,18 +6274,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
 
 static long get_nx_lpage_recovery_timeout(u64 start_time)
 {
-	uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
-	uint period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+	bool enabled;
+	uint period;
 
-	if (!period && ratio) {
-		/* Make sure the period is not less than one second. */
-		ratio = min(ratio, 3600u);
-		period = 60 * 60 * 1000 / ratio;
-	}
+	enabled = calc_nx_huge_pages_recovery_period(&period);
 
-	return READ_ONCE(nx_huge_pages) && ratio
-		? start_time + msecs_to_jiffies(period) - get_jiffies_64()
-		: MAX_SCHEDULE_TIMEOUT;
+	return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
+		       : MAX_SCHEDULE_TIMEOUT;
 }
 
 static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
--
cgit v1.2.3
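The derived-period arithmetic is worth seeing in isolation: a ratio of N means "recover 1/N of the pages each period", a period of 0 means "derive the period from the ratio against a one-hour halving time", and the ratio is clamped so the computed period never drops below one second. A standalone sketch of the same math (sample inputs assumed):

#include <stdio.h>
#include <stdbool.h>

static unsigned int min_uint(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

/* mirrors the helper's math: returns true if recovery is enabled */
static bool calc_period(bool enabled, unsigned int ratio, unsigned int *period)
{
        if (!enabled || !ratio)
                return false;

        if (!*period) {
                ratio = min_uint(ratio, 3600u); /* floor the period at 1000 ms */
                *period = 60 * 60 * 1000 / ratio;
        }
        return true;
}

int main(void)
{
        unsigned int period = 0; /* "default" period */

        if (calc_period(true, 60, &period))
                printf("effective period = %u ms\n", period); /* 60000 */
        return 0;
}

This is exactly why comparing the raw module params was wrong: an old period of 0 with ratio 60 is effectively 60000 ms, so lowering it to, say, 10000 ms is a reduction even though 10000 > 0.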
From 7e1901f6c86c896acff6609e0176f93f756d8b2a Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Mon, 22 Nov 2021 19:43:09 -0500
Subject: KVM: VMX: prepare sync_pir_to_irr for running with APICv disabled

If APICv is disabled for this vCPU, assigned devices may still attempt
to post interrupts. In that case, we need to cancel the vmentry and
deliver the interrupt with KVM_REQ_EVENT. Extend the existing code that
handles injection of L1 interrupts into L2 to cover this case as well.

vmx_hwapic_irr_update is only called when APICv is active so it would be
confusing to add a check for vcpu->arch.apicv_active in there. Instead,
just use vmx_set_rvi directly in vmx_sync_pir_to_irr.

Cc: stable@vger.kernel.org
Reviewed-by: Maxim Levitsky
Reviewed-by: David Matlack
Reviewed-by: Sean Christopherson
Message-Id: <20211123004311.2954158-3-pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/vmx.c | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 18971cfadd4f..1fadec8cbf96 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6267,9 +6267,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	int max_irr;
-	bool max_irr_updated;
+	bool got_posted_interrupt;
 
-	if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm))
+	if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
 		return -EIO;
 
 	if (pi_test_on(&vmx->pi_desc)) {
@@ -6279,22 +6279,33 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 		 * But on x86 this is just a compiler barrier anyway.
 		 */
 		smp_mb__after_atomic();
-		max_irr_updated =
+		got_posted_interrupt =
 			kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
-
-		/*
-		 * If we are running L2 and L1 has a new pending interrupt
-		 * which can be injected, this may cause a vmexit or it may
-		 * be injected into L2. Either way, this interrupt will be
-		 * processed via KVM_REQ_EVENT, not RVI, because we do not use
-		 * virtual interrupt delivery to inject L1 interrupts into L2.
-		 */
-		if (is_guest_mode(vcpu) && max_irr_updated)
-			kvm_make_request(KVM_REQ_EVENT, vcpu);
 	} else {
 		max_irr = kvm_lapic_find_highest_irr(vcpu);
+		got_posted_interrupt = false;
 	}
-	vmx_hwapic_irr_update(vcpu, max_irr);
+
+	/*
+	 * Newly recognized interrupts are injected via either virtual interrupt
+	 * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
+	 * disabled in two cases:
+	 *
+	 * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
+	 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
+	 * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
+	 * into L2, but KVM doesn't use virtual interrupt delivery to inject
+	 * interrupts into L2, and so KVM_REQ_EVENT is again needed.
+	 *
+	 * 2) If APICv is disabled for this vCPU, assigned devices may still
+	 * attempt to post interrupts. The posted interrupt vector will cause
+	 * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
+	 */
+	if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
+		vmx_set_rvi(max_irr);
+	else if (got_posted_interrupt)
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	return max_irr;
 }
--
cgit v1.2.3
From 37c4dbf337c5c2cdb24365ffae6ed70ac1e74d7a Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Mon, 22 Nov 2021 19:43:10 -0500
Subject: KVM: x86: check PIR even for vCPUs with disabled APICv

The IRTE for an assigned device can trigger a POSTED_INTR_VECTOR even
if APICv is disabled on the vCPU that receives it. In that case, the
interrupt will just cause a vmexit and leave the ON bit set together
with the PIR bit corresponding to the interrupt.

Right now, the interrupt would not be delivered until APICv is
re-enabled. However, fixing this is just a matter of always doing the
PIR->IRR synchronization, even if the vCPU has temporarily disabled
APICv.

This is not a problem for performance, or if anything it is an
improvement. First, in the common case where vcpu->arch.apicv_active
is true, one fewer check has to be performed. Second, static_call_cond
will elide the function call if APICv is not present or disabled.
Finally, in the case for AMD hardware we can remove the
sync_pir_to_irr callback: it is only needed for
apic_has_interrupt_for_ppr, and that function already has a fallback
for !APICv.

Cc: stable@vger.kernel.org
Co-developed-by: Sean Christopherson
Signed-off-by: Sean Christopherson
Reviewed-by: Maxim Levitsky
Reviewed-by: David Matlack
Message-Id: <20211123004311.2954158-4-pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/lapic.c   |  2 +-
 arch/x86/kvm/svm/svm.c |  1 -
 arch/x86/kvm/x86.c     | 18 +++++++++---------
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 759952dd1222..f206fc35deff 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -707,7 +707,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
 static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
 {
 	int highest_irr;
-	if (apic->vcpu->arch.apicv_active)
+	if (kvm_x86_ops.sync_pir_to_irr)
 		highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
 	else
 		highest_irr = apic_find_highest_irr(apic);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5630c241d5f6..d0f68d11ec70 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4651,7 +4651,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.load_eoi_exitmap = svm_load_eoi_exitmap,
 	.hwapic_irr_update = svm_hwapic_irr_update,
 	.hwapic_isr_update = svm_hwapic_isr_update,
-	.sync_pir_to_irr = kvm_lapic_find_highest_irr,
 	.apicv_post_state_restore = avic_post_state_restore,
 
 	.set_tss_addr = svm_set_tss_addr,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 817898eab7c3..0ee1a039b490 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4472,8 +4472,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
-	if (vcpu->arch.apicv_active)
-		static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+	static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
 
 	return kvm_apic_get_state(vcpu, s);
 }
@@ -9571,8 +9570,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 	if (irqchip_split(vcpu->kvm))
 		kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
 	else {
-		if (vcpu->arch.apicv_active)
-			static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+		static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
 		if (ioapic_in_kernel(vcpu->kvm))
 			kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
 	}
@@ -9842,10 +9840,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 		/*
 		 * This handles the case where a posted interrupt was
-		 * notified with kvm_vcpu_kick.
+		 * notified with kvm_vcpu_kick. Assigned devices can
+		 * use the POSTED_INTR_VECTOR even if APICv is disabled,
+		 * so do it even if APICv is disabled on this vCPU.
 		 */
-		if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
-			static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+		if (kvm_lapic_enabled(vcpu))
+			static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
 
 		if (kvm_vcpu_exit_request(vcpu)) {
 			vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -9889,8 +9889,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
 			break;
 
-		if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
-			static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+		if (kvm_lapic_enabled(vcpu))
+			static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
 
 		if (unlikely(kvm_vcpu_exit_request(vcpu))) {
 			exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
--
cgit v1.2.3
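static_call_cond() boils down to "invoke the op only if the backend installed one", which is why removing SVM's sync_pir_to_irr pointer makes the LAPIC code fall back to scanning the IRR itself. A plain function-pointer sketch of the same dispatch, with hypothetical names and values:

#include <stdio.h>
#include <stddef.h>

struct vcpu { int pir_irr; int apic_irr; };

struct ops {
        int (*sync_pir_to_irr)(struct vcpu *v); /* may be NULL (SVM case) */
};

static int vmx_sync_pir_to_irr(struct vcpu *v) { return v->pir_irr; }

static int highest_irr(const struct ops *ops, struct vcpu *v)
{
        if (ops->sync_pir_to_irr)       /* the "cond" in static_call_cond */
                return ops->sync_pir_to_irr(v);
        return v->apic_irr;             /* fallback: scan the IRR directly */
}

int main(void)
{
        struct vcpu v = { .pir_irr = 0x31, .apic_irr = 0x21 };
        struct ops vmx = { .sync_pir_to_irr = vmx_sync_pir_to_irr };
        struct ops svm = { .sync_pir_to_irr = NULL };

        printf("vmx: %#x, svm: %#x\n",
               highest_irr(&vmx, &v), highest_irr(&svm, &v));
        return 0;
}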
From 53b7ca1a359389276c76fbc9e1009d8626a17e40 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Mon, 22 Nov 2021 19:43:11 -0500
Subject: KVM: x86: Use a stable condition around all VT-d PI paths

Currently, checks for whether VT-d PI can be used refer to the current
status of the feature in the current vCPU; or they more or less pick
vCPU 0 in case a specific vCPU is not available. However, these checks
do not attempt to synchronize with changes to the IRTE. In particular,
there is no path that updates the IRTE when APICv is re-activated on
vCPU 0; and there is no path to wakeup a CPU that has APICv disabled,
if the wakeup occurs because of an IRTE that points to a posted
interrupt.

To fix this, always go through the VT-d PI path as long as there are
assigned devices and APICv is available on both the host and the VM
side. Since the relevant condition was copied over three times, take
the hint and factor it into a separate function.

Suggested-by: Sean Christopherson
Cc: stable@vger.kernel.org
Reviewed-by: Sean Christopherson
Reviewed-by: Maxim Levitsky
Reviewed-by: David Matlack
Message-Id: <20211123004311.2954158-5-pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/posted_intr.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 5f81ef092bd4..1c94783b5a54 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -5,6 +5,7 @@
 #include
 
 #include "lapic.h"
+#include "irq.h"
 #include "posted_intr.h"
 #include "trace.h"
 #include "vmx.h"
@@ -77,13 +78,18 @@ after_clear_sn:
 		pi_set_on(pi_desc);
 }
 
+static bool vmx_can_use_vtd_pi(struct kvm *kvm)
+{
+	return irqchip_in_kernel(kvm) && enable_apicv &&
+		kvm_arch_has_assigned_device(kvm) &&
+		irq_remapping_cap(IRQ_POSTING_CAP);
+}
+
 void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
 {
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 
-	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-	    !irq_remapping_cap(IRQ_POSTING_CAP) ||
-	    !kvm_vcpu_apicv_active(vcpu))
+	if (!vmx_can_use_vtd_pi(vcpu->kvm))
 		return;
 
 	/* Set SN when the vCPU is preempted */
@@ -141,9 +147,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
 	struct pi_desc old, new;
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 
-	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-	    !irq_remapping_cap(IRQ_POSTING_CAP) ||
-	    !kvm_vcpu_apicv_active(vcpu))
+	if (!vmx_can_use_vtd_pi(vcpu->kvm))
 		return 0;
 
 	WARN_ON(irqs_disabled());
@@ -270,9 +274,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
 	struct vcpu_data vcpu_info;
 	int idx, ret = 0;
 
-	if (!kvm_arch_has_assigned_device(kvm) ||
-	    !irq_remapping_cap(IRQ_POSTING_CAP) ||
-	    !kvm_vcpu_apicv_active(kvm->vcpus[0]))
+	if (!vmx_can_use_vtd_pi(kvm))
 		return 0;
 
 	idx = srcu_read_lock(&kvm->irq_srcu);
--
cgit v1.2.3
*/ - if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + if (kvm_lapic_enabled(vcpu)) + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); if (kvm_vcpu_exit_request(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; @@ -9889,8 +9889,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; - if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + if (kvm_lapic_enabled(vcpu)) + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); if (unlikely(kvm_vcpu_exit_request(vcpu))) { exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; -- cgit v1.2.3 From 53b7ca1a359389276c76fbc9e1009d8626a17e40 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:43:11 -0500 Subject: KVM: x86: Use a stable condition around all VT-d PI paths Currently, checks for whether VT-d PI can be used refer to the current status of the feature in the current vCPU; or they more or less pick vCPU 0 in case a specific vCPU is not available. However, these checks do not attempt to synchronize with changes to the IRTE. In particular, there is no path that updates the IRTE when APICv is re-activated on vCPU 0; and there is no path to wake up a CPU that has APICv disabled, if the wakeup occurs because of an IRTE that points to a posted interrupt. To fix this, always go through the VT-d PI path as long as there are assigned devices and APICv is available on both the host and the VM side. Since the relevant condition was copied over three times, take the hint and factor it into a separate function. Suggested-by: Sean Christopherson Cc: stable@vger.kernel.org Reviewed-by: Sean Christopherson Reviewed-by: Maxim Levitsky Reviewed-by: David Matlack Message-Id: <20211123004311.2954158-5-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 5f81ef092bd4..1c94783b5a54 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -5,6 +5,7 @@ #include #include "lapic.h" +#include "irq.h" #include "posted_intr.h" #include "trace.h" #include "vmx.h" @@ -77,13 +78,18 @@ after_clear_sn: pi_set_on(pi_desc); } +static bool vmx_can_use_vtd_pi(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm) && enable_apicv && + kvm_arch_has_assigned_device(kvm) && + irq_remapping_cap(IRQ_POSTING_CAP); +} + void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return; /* Set SN when the vCPU is preempted */ @@ -141,9 +147,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu) struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return 0; WARN_ON(irqs_disabled()); @@ -270,9 +274,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, struct vcpu_data vcpu_info; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) + if (!vmx_can_use_vtd_pi(kvm)) return 0; idx = srcu_read_lock(&kvm->irq_srcu); -- cgit
v1.2.3 From 4674164f0ac5fd553c38b2b8c49fe13297fed38b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:28 -0500 Subject: KVM: SEV: do not use list_replace_init on an empty list list_replace_init cannot be used if the source is an empty list, because "new->next->prev = new" will overwrite "old->prev":

                             new                       old
                    prev = new, next = new    prev = old, next = old
    new->next = old->next
                    prev = new, next = old    prev = old, next = old
    new->next->prev = new
                    prev = new, next = old    prev = new, next = old
    new->prev = old->prev
                    prev = new, next = old    prev = new, next = old
    new->prev->next = new
                    prev = new, next = new    prev = new, next = old

Step 2 clobbers old->prev before step 3 reads it back, and old ends up with old->prev still pointing at new rather than at itself. The desired outcome instead would be to leave both old and new the same as they were (two empty circular lists). Use list_cut_before, which already has the necessary check and is documented to discard the previous contents of the list that will hold the result. Fixes: b56639318bb2 ("KVM: SEV: Add support for SEV intra host migration") Reviewed-by: Sean Christopherson Message-Id: <20211123005036.2954379-5-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 21ac0a5de4e0..75955beb3770 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1613,8 +1613,7 @@ static void sev_migrate_from(struct kvm_sev_info *dst, src->handle = 0; src->pages_locked = 0; - INIT_LIST_HEAD(&dst->regions_list); - list_replace_init(&src->regions_list, &dst->regions_list); + list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); } static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) -- cgit v1.2.3 From 501b580c02339a83917cf3b44c445f2419b15dcb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:29 -0500 Subject: KVM: SEV: cleanup locking for KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM Encapsulate the handling of the migration_in_progress flag for both VMs in two functions sev_lock_two_vms and sev_unlock_two_vms. It does not matter if KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM locks the destination struct kvm a bit later, and this change 1) keeps the cleanup chain of labels smaller and 2) makes it possible for KVM_CAP_VM_COPY_ENC_CONTEXT_FROM to reuse the logic. Cc: Peter Gonda Cc: Sean Christopherson Message-Id: <20211123005036.2954379-6-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 53 +++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 75955beb3770..8902b018fc18 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1543,28 +1543,40 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id) return false; } -static int sev_lock_for_migration(struct kvm *kvm) +static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; + struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + + if (dst_kvm == src_kvm) + return -EINVAL; /* - * Bail if this VM is already involved in a migration to avoid deadlock - * between two VMs trying to migrate to/from each other. + * Bail if these VMs are already involved in a migration to avoid + * deadlock between two VMs trying to migrate to/from each other.
*/ - if (atomic_cmpxchg_acquire(&sev->migration_in_progress, 0, 1)) + if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1)) return -EBUSY; - mutex_lock(&kvm->lock); + if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1)) { + atomic_set_release(&dst_sev->migration_in_progress, 0); + return -EBUSY; + } + mutex_lock(&dst_kvm->lock); + mutex_lock(&src_kvm->lock); return 0; } -static void sev_unlock_after_migration(struct kvm *kvm) +static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; + struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; - mutex_unlock(&kvm->lock); - atomic_set_release(&sev->migration_in_progress, 0); + mutex_unlock(&dst_kvm->lock); + mutex_unlock(&src_kvm->lock); + atomic_set_release(&dst_sev->migration_in_progress, 0); + atomic_set_release(&src_sev->migration_in_progress, 0); } @@ -1665,15 +1677,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) bool charged = false; int ret; - ret = sev_lock_for_migration(kvm); - if (ret) - return ret; - - if (sev_guest(kvm)) { - ret = -EINVAL; - goto out_unlock; - } - source_kvm_file = fget(source_fd); if (!file_is_kvm(source_kvm_file)) { ret = -EBADF; @@ -1681,13 +1684,13 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) } source_kvm = source_kvm_file->private_data; - ret = sev_lock_for_migration(source_kvm); + ret = sev_lock_two_vms(kvm, source_kvm); if (ret) goto out_fput; - if (!sev_guest(source_kvm)) { + if (sev_guest(kvm) || !sev_guest(source_kvm)) { ret = -EINVAL; - goto out_source; + goto out_unlock; } src_sev = &to_kvm_svm(source_kvm)->sev_info; @@ -1727,13 +1730,11 @@ out_dst_cgroup: sev_misc_cg_uncharge(cg_cleanup_sev); put_misc_cg(cg_cleanup_sev->misc_cg); cg_cleanup_sev->misc_cg = NULL; -out_source: - sev_unlock_after_migration(source_kvm); +out_unlock: + sev_unlock_two_vms(kvm, source_kvm); out_fput: if (source_kvm_file) fput(source_kvm_file); -out_unlock: - sev_unlock_after_migration(kvm); return ret; } -- cgit v1.2.3 From 2b347a387811cb4aa7bcdb96e9203c5019a6fb41 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:30 -0500 Subject: KVM: SEV: initialize regions_list of a mirror VM This was broken before the introduction of KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM, but technically harmless because the region list was unused for a mirror VM. However, it is untidy and it now causes a NULL pointer access when attempting to move the encryption context of a mirror VM. Fixes: 54526d1fd593 ("KVM: x86: Support KVM VMs sharing SEV context") Message-Id: <20211123005036.2954379-7-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 8902b018fc18..8daabc3dc079 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2007,6 +2007,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) mirror_sev->fd = source_sev.fd; mirror_sev->es_active = source_sev.es_active; mirror_sev->handle = source_sev.handle; + INIT_LIST_HEAD(&mirror_sev->regions_list); /* * Do not copy ap_jump_table. 
Since the mirror does not share the same * KVM contexts as the original, and they may have different -- cgit v1.2.3 From 642525e3bd474dc50b7d0e8ee9c966b97e4be3ac Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:31 -0500 Subject: KVM: SEV: move mirror status to destination of KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM Allow intra-host migration of a mirror VM; the destination VM will be a mirror of the same ASID as the source. Fixes: b56639318bb2 ("KVM: SEV: Add support for SEV intra host migration") Reviewed-by: Sean Christopherson Message-Id: <20211123005036.2954379-8-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 8daabc3dc079..74b6459b5fb2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1619,11 +1619,13 @@ static void sev_migrate_from(struct kvm_sev_info *dst, dst->asid = src->asid; dst->handle = src->handle; dst->pages_locked = src->pages_locked; + dst->enc_context_owner = src->enc_context_owner; src->asid = 0; src->active = false; src->handle = 0; src->pages_locked = 0; + src->enc_context_owner = NULL; list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); } -- cgit v1.2.3 From bf42b02b19e27d6849852a41dd734af4c05e73c6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:33 -0500 Subject: KVM: SEV: Do COPY_ENC_CONTEXT_FROM with both VMs locked Now that we have a facility to lock two VMs with deadlock protection, use it for the creation of mirror VMs as well. One of COPY_ENC_CONTEXT_FROM(dst, src) and COPY_ENC_CONTEXT_FROM(src, dst) would always fail, so the combination is nonsensical and it is okay to return -EBUSY if it is attempted. This sidesteps the question of what happens if a VM is MOVE_ENC_CONTEXT_FROM'd at the same time as it is COPY_ENC_CONTEXT_FROM'd: the locking prevents that from happening. Cc: Peter Gonda Cc: Sean Christopherson Reviewed-by: Sean Christopherson Message-Id: <20211123005036.2954379-10-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 66 ++++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 42 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 74b6459b5fb2..025d9731b66c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1955,77 +1955,59 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) { struct file *source_kvm_file; struct kvm *source_kvm; - struct kvm_sev_info source_sev, *mirror_sev; + struct kvm_sev_info *source_sev, *mirror_sev; int ret; source_kvm_file = fget(source_fd); if (!file_is_kvm(source_kvm_file)) { ret = -EBADF; - goto e_source_put; + goto e_source_fput; } source_kvm = source_kvm_file->private_data; - mutex_lock(&source_kvm->lock); - - if (!sev_guest(source_kvm)) { - ret = -EINVAL; - goto e_source_unlock; - } + ret = sev_lock_two_vms(kvm, source_kvm); + if (ret) + goto e_source_fput; - /* Mirrors of mirrors should work, but let's not get silly */ - if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) { + /* + * Mirrors of mirrors should work, but let's not get silly. Also + * disallow out-of-band SEV/SEV-ES init if the target is already an + * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being + * created after SEV/SEV-ES initialization, e.g. to init intercepts. 
+ */ + if (sev_guest(kvm) || !sev_guest(source_kvm) || + is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) { ret = -EINVAL; - goto e_source_unlock; + goto e_unlock; } - memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info, - sizeof(source_sev)); - /* * The mirror kvm holds an enc_context_owner ref so its asid can't * disappear until we're done with it */ + source_sev = &to_kvm_svm(source_kvm)->sev_info; kvm_get_kvm(source_kvm); - fput(source_kvm_file); - mutex_unlock(&source_kvm->lock); - mutex_lock(&kvm->lock); - - /* - * Disallow out-of-band SEV/SEV-ES init if the target is already an - * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being - * created after SEV/SEV-ES initialization, e.g. to init intercepts. - */ - if (sev_guest(kvm) || kvm->created_vcpus) { - ret = -EINVAL; - goto e_mirror_unlock; - } - /* Set enc_context_owner and copy its encryption context over */ mirror_sev = &to_kvm_svm(kvm)->sev_info; mirror_sev->enc_context_owner = source_kvm; mirror_sev->active = true; - mirror_sev->asid = source_sev.asid; - mirror_sev->fd = source_sev.fd; - mirror_sev->es_active = source_sev.es_active; - mirror_sev->handle = source_sev.handle; + mirror_sev->asid = source_sev->asid; + mirror_sev->fd = source_sev->fd; + mirror_sev->es_active = source_sev->es_active; + mirror_sev->handle = source_sev->handle; INIT_LIST_HEAD(&mirror_sev->regions_list); + ret = 0; + /* * Do not copy ap_jump_table. Since the mirror does not share the same * KVM contexts as the original, and they may have different * memory-views. */ - mutex_unlock(&kvm->lock); - return 0; - -e_mirror_unlock: - mutex_unlock(&kvm->lock); - kvm_put_kvm(source_kvm); - return ret; -e_source_unlock: - mutex_unlock(&source_kvm->lock); -e_source_put: +e_unlock: + sev_unlock_two_vms(kvm, source_kvm); +e_source_fput: if (source_kvm_file) fput(source_kvm_file); return ret; -- cgit v1.2.3 From 17d44a96f000fe1040d4ba1c34e458c63be6b7ce Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:34 -0500 Subject: KVM: SEV: Prohibit migration of a VM that has mirrors VMs that mirror an encryption context rely on the owner to keep the ASID allocated. Performing a KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM would cause a dangling ASID:

1. copy context from A to B (gets ref to A)
2. move context from A to L (moves ASID from A to L)
3. close L (releases ASID from L, B still references it)

The right way to do the handoff instead is to create a fresh mirror VM on the destination first:

1. copy context from A to B (gets ref to A)
[later] 2. close B (releases ref to A)
3. move context from A to L (moves ASID from A to L)
4. copy context from L to M

So, catch the situation by adding a count of how many VMs are mirroring this one's encryption context.
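The counting scheme can be sketched in plain userspace C (not kernel code; all types and names below are invented): migration refuses to proceed while any mirror still pins the owner's ASID, matching the sequences above:

#include <errno.h>
#include <stdio.h>

struct vm_stub {
	unsigned long num_mirrored_vms;	/* mirrors relying on our ASID */
	int asid;
};

static int move_enc_context(struct vm_stub *dst, struct vm_stub *src)
{
	/* Mirrors rely on src to keep the ASID allocated: bail out. */
	if (src->num_mirrored_vms)
		return -EBUSY;
	dst->asid = src->asid;
	src->asid = 0;
	return 0;
}

static void copy_enc_context(struct vm_stub *mirror, struct vm_stub *owner)
{
	mirror->asid = owner->asid;
	owner->num_mirrored_vms++;
}

static void close_mirror(struct vm_stub *mirror, struct vm_stub *owner)
{
	mirror->asid = 0;
	owner->num_mirrored_vms--;
}

int main(void)
{
	struct vm_stub a = { .asid = 1 }, b = { 0 }, l = { 0 };

	copy_enc_context(&b, &a);	/* step 1: A -> B */
	printf("move with mirror: %d\n", move_enc_context(&l, &a));	/* -EBUSY */
	close_mirror(&b, &a);		/* step 2: close B */
	printf("move after close: %d\n", move_enc_context(&l, &a));	/* 0 */
	return 0;
}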
Fixes: 0b020f5af092 ("KVM: SEV: Add support for SEV-ES intra host migration") Message-Id: <20211123005036.2954379-11-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 22 ++++++++++++- arch/x86/kvm/svm/svm.h | 1 + .../selftests/kvm/x86_64/sev_migrate_tests.c | 37 ++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 025d9731b66c..89a716290fac 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1696,6 +1696,16 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) } src_sev = &to_kvm_svm(source_kvm)->sev_info; + + /* + * VMs mirroring src's encryption context rely on it to keep the + * ASID allocated, but below we are clearing src_sev->asid. + */ + if (src_sev->num_mirrored_vms) { + ret = -EBUSY; + goto out_unlock; + } + dst_sev->misc_cg = get_current_misc_cg(); cg_cleanup_sev = dst_sev; if (dst_sev->misc_cg != src_sev->misc_cg) { @@ -1987,6 +1997,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) */ source_sev = &to_kvm_svm(source_kvm)->sev_info; kvm_get_kvm(source_kvm); + source_sev->num_mirrored_vms++; /* Set enc_context_owner and copy its encryption context over */ mirror_sev = &to_kvm_svm(kvm)->sev_info; @@ -2019,12 +2030,21 @@ void sev_vm_destroy(struct kvm *kvm) struct list_head *head = &sev->regions_list; struct list_head *pos, *q; + WARN_ON(sev->num_mirrored_vms); + if (!sev_guest(kvm)) return; /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */ if (is_mirroring_enc_context(kvm)) { - kvm_put_kvm(sev->enc_context_owner); + struct kvm *owner_kvm = sev->enc_context_owner; + struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info; + + mutex_lock(&owner_kvm->lock); + if (!WARN_ON(!owner_sev->num_mirrored_vms)) + owner_sev->num_mirrored_vms--; + mutex_unlock(&owner_kvm->lock); + kvm_put_kvm(owner_kvm); return; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 5faad3dc10e2..1c7306c370fa 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -79,6 +79,7 @@ struct kvm_sev_info { struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ + unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ atomic_t migration_in_progress; }; diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index d265cea5de85..29b18d565cf4 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -294,6 +294,41 @@ static void test_sev_mirror_parameters(void) kvm_vm_free(vm_no_vcpu); } +static void test_sev_move_copy(void) +{ + struct kvm_vm *dst_vm, *sev_vm, *mirror_vm, *dst_mirror_vm; + int ret; + + sev_vm = sev_vm_create(/* es= */ false); + dst_vm = aux_vm_create(true); + mirror_vm = aux_vm_create(false); + dst_mirror_vm = aux_vm_create(false); + + sev_mirror_create(mirror_vm->fd, sev_vm->fd); + ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd); + TEST_ASSERT(ret == -1 && errno == EBUSY, + "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret, + errno); + + /* The mirror itself can be migrated. 
*/ + sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd); + ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd); + TEST_ASSERT(ret == -1 && errno == EBUSY, + "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret, + errno); + + /* + * mirror_vm is not a mirror anymore, dst_mirror_vm is. Thus, + * the owner can be copied as soon as dst_mirror_vm is gone. + */ + kvm_vm_free(dst_mirror_vm); + sev_migrate_from(dst_vm->fd, sev_vm->fd); + + kvm_vm_free(mirror_vm); + kvm_vm_free(dst_vm); + kvm_vm_free(sev_vm); +} + int main(int argc, char *argv[]) { if (kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) { @@ -301,6 +336,8 @@ int main(int argc, char *argv[]) test_sev_migrate_from(/* es= */ true); test_sev_migrate_locking(); test_sev_migrate_parameters(); + if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) + test_sev_move_copy(); } if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { test_sev_mirror(/* es= */ false); -- cgit v1.2.3 From 10a37929efeb4c51a0069afdd537c4fa3831f6e5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:35 -0500 Subject: KVM: SEV: do not take kvm->lock when destroying Taking the lock is useless since there are no other references, and there are already accesses (e.g. to sev->enc_context_owner) that do not take it. So get rid of it. Reviewed-by: Sean Christopherson Message-Id: <20211123005036.2954379-12-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 89a716290fac..bbbf980c7e40 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2048,8 +2048,6 @@ void sev_vm_destroy(struct kvm *kvm) return; } - mutex_lock(&kvm->lock); - /* * Ensure that all guest tagged cache entries are flushed before * releasing the pages back to the system for use. CLFLUSH will @@ -2069,8 +2067,6 @@ void sev_vm_destroy(struct kvm *kvm) } } - mutex_unlock(&kvm->lock); - sev_unbind_asid(kvm, sev->handle); sev_asid_free(sev); } -- cgit v1.2.3 From c9d61dcb0bc26a761dc84a87bd8a0d3b3c432f10 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:36 -0500 Subject: KVM: SEV: accept signals in sev_lock_two_vms Generally, kvm->lock is not taken for a long time, but sev_lock_two_vms is different: it takes vCPU locks inside, so userspace can hold it back just by calling a vCPU ioctl. Play it safe and use mutex_lock_killable. 
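For a feel of the acquire/unwind ordering outside the kernel, here is a compilable userspace sketch; pthreads has no killable mutex, so pthread_mutex_trylock stands in for the lock call that can fail, and all names are invented:

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct vm_stub {
	pthread_mutex_t lock;
	atomic_int migration_in_progress;
};

static int lock_two_vms(struct vm_stub *dst, struct vm_stub *src)
{
	int expected = 0;
	int r = -EBUSY;

	if (!atomic_compare_exchange_strong(&dst->migration_in_progress, &expected, 1))
		return -EBUSY;
	expected = 0;
	if (!atomic_compare_exchange_strong(&src->migration_in_progress, &expected, 1))
		goto release_dst;

	r = -EINTR;
	if (pthread_mutex_trylock(&dst->lock))	/* stand-in for mutex_lock_killable() */
		goto release_src;
	if (pthread_mutex_trylock(&src->lock))
		goto unlock_dst;
	return 0;

unlock_dst:
	pthread_mutex_unlock(&dst->lock);
release_src:
	atomic_store(&src->migration_in_progress, 0);
release_dst:
	atomic_store(&dst->migration_in_progress, 0);
	return r;
}

int main(void)
{
	struct vm_stub a = { PTHREAD_MUTEX_INITIALIZER }, b = { PTHREAD_MUTEX_INITIALIZER };

	/* 0 on success; a real caller unlocks both and clears the flags when done. */
	printf("lock_two_vms: %d\n", lock_two_vms(&a, &b));
	return 0;
}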
Message-Id: <20211123005036.2954379-13-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index bbbf980c7e40..59727a966f90 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1547,6 +1547,7 @@ static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + int r = -EBUSY; if (dst_kvm == src_kvm) return -EINVAL; @@ -1558,14 +1559,23 @@ static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1)) return -EBUSY; - if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1)) { - atomic_set_release(&dst_sev->migration_in_progress, 0); - return -EBUSY; - } + if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1)) + goto release_dst; - mutex_lock(&dst_kvm->lock); - mutex_lock(&src_kvm->lock); + r = -EINTR; + if (mutex_lock_killable(&dst_kvm->lock)) + goto release_src; + if (mutex_lock_killable(&src_kvm->lock)) + goto unlock_dst; return 0; + +unlock_dst: + mutex_unlock(&dst_kvm->lock); +release_src: + atomic_set_release(&src_sev->migration_in_progress, 0); +release_dst: + atomic_set_release(&dst_sev->migration_in_progress, 0); + return r; } static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) -- cgit v1.2.3 From e90e51d5f01d2baae5dcce280866bbb96816e978 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 30 Nov 2021 07:36:41 -0500 Subject: KVM: VMX: clear vmx_x86_ops.sync_pir_to_irr if APICv is disabled There is nothing to synchronize if APICv is disabled, since neither other vCPUs nor assigned devices can set PIR.ON. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1fadec8cbf96..f90448809690 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7777,10 +7777,10 @@ static __init int hardware_setup(void) ple_window_shrink = 0; } - if (!cpu_has_vmx_apicv()) { + if (!cpu_has_vmx_apicv()) enable_apicv = 0; + if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; - } if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true; -- cgit v1.2.3 From 7cfc5c653b07782e7059527df8dc1e3143a7591e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 30 Nov 2021 03:46:07 -0500 Subject: KVM: fix avic_set_running for preemptable kernels avic_set_running() passes the current CPU to avic_vcpu_load(), albeit via vcpu->cpu rather than smp_processor_id(). If the thread is migrated while avic_set_running runs, the call to avic_vcpu_load() can use a stale value for the processor id. Avoid this by blocking preemption over the entire execution of avic_set_running(). 
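The get_cpu()/put_cpu() idiom can be sketched as standalone C; the two macros below are userspace stubs (in the kernel they disable and re-enable preemption), and nothing here is taken from the patch:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/*
 * Stubs: in the kernel, get_cpu() disables preemption and put_cpu()
 * re-enables it, so the returned id cannot go stale in between.
 */
#define get_cpu()	sched_getcpu()
#define put_cpu()	((void)0)

static void avic_set_running_stub(int is_run)
{
	/* Read the CPU id only inside the get_cpu()/put_cpu() window. */
	int cpu = get_cpu();

	if (is_run)
		printf("load on cpu %d\n", cpu);
	else
		printf("put from cpu %d\n", cpu);
	put_cpu();
}

int main(void)
{
	avic_set_running_stub(1);
	avic_set_running_stub(0);
	return 0;
}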
Reported-by: Sean Christopherson Fixes: 8221c1370056 ("svm: Manage vcpu load/unload when enable AVIC") Cc: stable@vger.kernel.org Reviewed-by: Maxim Levitsky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index affc0ea98d30..9d6066eb7c10 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -989,16 +989,18 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) { struct vcpu_svm *svm = to_svm(vcpu); + int cpu = get_cpu(); + WARN_ON(cpu != vcpu->cpu); svm->avic_is_running = is_run; - if (!kvm_vcpu_apicv_active(vcpu)) - return; - - if (is_run) - avic_vcpu_load(vcpu, vcpu->cpu); - else - avic_vcpu_put(vcpu); + if (kvm_vcpu_apicv_active(vcpu)) { + if (is_run) + avic_vcpu_load(vcpu, cpu); + else + avic_vcpu_put(vcpu); + } + put_cpu(); } void svm_vcpu_blocking(struct kvm_vcpu *vcpu) -- cgit v1.2.3