From 3579dc742f76207a4854a87a8e3ce44434d7e308 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 28 Aug 2023 15:46:49 +0100 Subject: KVM: arm64: Properly return allocated EL2 VA from hyp_alloc_private_va_range() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Marek reports that his RPi4 spits out a warning at boot time, right at the point where the GICv2 virtual CPU interface gets mapped. Upon investigation, it seems that we never return the allocated VA and use whatever was on the stack at this point. Yes, this is good stuff, and Marek was pretty lucky that he ended-up with a VA that intersected with something that was already mapped. On my setup, this random value is plausible enough for the mapping to take place. Who knows what happens... Fixes: f156a7d13fc3 ("KVM: arm64: Remove size-order align in the nVHE hyp private VA range") Reported-by: Marek Szyprowski Tested-by: Marek Szyprowski Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Vincent Donnefort Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/79b0ad6e-0c2a-f777-d504-e40e8123d81d@samsung.com Link: https://lore.kernel.org/r/20230828153121.4179627-1-maz@kernel.org --- arch/arm64/kvm/mmu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 587a104f66c3..482280fe22d7 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -652,6 +652,9 @@ int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) mutex_unlock(&kvm_hyp_pgd_mutex); + if (!ret) + *haddr = base; + return ret; } -- cgit v1.2.3 From 373beef00f7d781a000b12c31fb17a5a9c25969c Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 11 Sep 2023 15:52:57 +0100 Subject: KVM: arm64: nvhe: Ignore SVE hint in SMCCC function ID When SVE is enabled, the host may set bit 16 in SMCCC function IDs, a hint that indicates an unused SVE state. At the moment NVHE doesn't account for this bit when inspecting the function ID, and rejects most calls. Clear the hint bit before comparing function IDs. About version compatibility: the host's PSCI driver initially probes the firmware for a SMCCC version number. If the firmware implements a protocol recent enough (1.3), subsequent SMCCC calls have the hint bit set. Since the hint bit was reserved in earlier versions of the protocol, clearing it is fine regardless of the version in use. When a new hint is added to the protocol in the future, it will be added to ARM_SMCCC_CALL_HINTS and NVHE will handle it straight away. This patch only clears known hints and leaves reserved bits as is, because future SMCCC versions could use reserved bits as modifiers for the function ID, rather than hints. Fixes: cfa7ff959a78 ("arm64: smccc: Support SMCCC v1.3 SVE register saving hint") Reported-by: Ben Horgan Signed-off-by: Jean-Philippe Brucker Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230911145254.934414-4-jean-philippe@linaro.org --- arch/arm64/include/asm/kvm_hyp.h | 2 +- arch/arm64/kvm/hyp/include/nvhe/ffa.h | 2 +- arch/arm64/kvm/hyp/nvhe/ffa.c | 3 +-- arch/arm64/kvm/hyp/nvhe/hyp-init.S | 1 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 8 ++++++-- arch/arm64/kvm/hyp/nvhe/psci-relay.c | 3 +-- include/linux/arm-smccc.h | 2 ++ 7 files changed, 13 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index b7238c72a04c..66efd67ea7e8 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -118,7 +118,7 @@ void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu); u64 __guest_enter(struct kvm_vcpu *vcpu); -bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt); +bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt, u32 func_id); #ifdef __KVM_NVHE_HYPERVISOR__ void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h index 1becb10ecd80..d9fd5e6c7d3c 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/ffa.h +++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h @@ -12,6 +12,6 @@ #define FFA_MAX_FUNC_NUM 0x7F int hyp_ffa_init(void *pages); -bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt); +bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id); #endif /* __KVM_HYP_FFA_H */ diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index ab4f5d160c58..6e4dba9eadef 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -634,9 +634,8 @@ out_handled: return true; } -bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) +bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) { - DECLARE_REG(u64, func_id, host_ctxt, 0); struct arm_smccc_res res; /* diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 90fade1b032e..1cc06e6797bd 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -57,6 +57,7 @@ __do_hyp_init: cmp x0, #HVC_STUB_HCALL_NR b.lo __kvm_handle_stub_hvc + bic x0, x0, #ARM_SMCCC_CALL_HINTS mov x3, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) cmp x0, x3 b.eq 1f diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 857d9bc04fd4..2385fd03ed87 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -368,6 +368,7 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) if (static_branch_unlikely(&kvm_protected_mode_initialized)) hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize; + id &= ~ARM_SMCCC_CALL_HINTS; id -= KVM_HOST_SMCCC_ID(0); if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall))) @@ -392,11 +393,14 @@ static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt) static void handle_host_smc(struct kvm_cpu_context *host_ctxt) { + DECLARE_REG(u64, func_id, host_ctxt, 0); bool handled; - handled = kvm_host_psci_handler(host_ctxt); + func_id &= ~ARM_SMCCC_CALL_HINTS; + + handled = kvm_host_psci_handler(host_ctxt, func_id); if (!handled) - handled = kvm_host_ffa_handler(host_ctxt); + handled = kvm_host_ffa_handler(host_ctxt, func_id); if (!handled) default_host_smc_handler(host_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 24543d2a3490..d57bcb6ab94d 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -273,9 +273,8 @@ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ } } -bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt) +bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) { - DECLARE_REG(u64, func_id, host_ctxt, 0); unsigned long ret; switch (kvm_host_psci_config.version) { diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 7c67c17321d4..083f85653716 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -67,6 +67,8 @@ #define ARM_SMCCC_VERSION_1_3 0x10003 #define ARM_SMCCC_1_3_SVE_HINT 0x10000 +#define ARM_SMCCC_CALL_HINTS ARM_SMCCC_1_3_SVE_HINT + #define ARM_SMCCC_VERSION_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ -- cgit v1.2.3 From ef4d48368587f27cb1f690691518889d8fb3510b Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 18 Sep 2023 11:48:29 +0530 Subject: RISC-V: KVM: Fix KVM_GET_REG_LIST API for ISA_EXT registers The ISA_EXT registers to enabled/disable ISA extensions for VCPU are always available when underlying host has the corresponding ISA extension. The copy_isa_ext_reg_indices() called by the KVM_GET_REG_LIST API does not align with this expectation so let's fix it. Fixes: 031f9efafc08 ("KVM: riscv: Add KVM_GET_REG_LIST API support") Signed-off-by: Anup Patel Reviewed-by: Atish Patra Reviewed-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_onereg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index 1b7e9fa265cb..e7e833ced91b 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -842,7 +842,7 @@ static int copy_isa_ext_reg_indices(const struct kvm_vcpu *vcpu, u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_ISA_EXT | i; isa_ext = kvm_isa_ext_arr[i]; - if (!__riscv_isa_extension_available(vcpu->arch.isa, isa_ext)) + if (!__riscv_isa_extension_available(NULL, isa_ext)) continue; if (uindices) { -- cgit v1.2.3 From 17f71a2a340f1dcd397a66110005722177d5927c Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 18 Sep 2023 11:58:29 +0530 Subject: RISC-V: KVM: Fix riscv_vcpu_get_isa_ext_single() for missing extensions The riscv_vcpu_get_isa_ext_single() should fail with -ENOENT error when corresponding ISA extension is not available on the host. Fixes: e98b1085be79 ("RISC-V: KVM: Factor-out ONE_REG related code to its own source file") Signed-off-by: Anup Patel Reviewed-by: Atish Patra Reviewed-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_onereg.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index e7e833ced91b..b7e0e03c69b1 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -460,8 +460,11 @@ static int riscv_vcpu_get_isa_ext_single(struct kvm_vcpu *vcpu, reg_num >= ARRAY_SIZE(kvm_isa_ext_arr)) return -ENOENT; - *reg_val = 0; host_isa_ext = kvm_isa_ext_arr[reg_num]; + if (!__riscv_isa_extension_available(NULL, host_isa_ext)) + return -ENOENT; + + *reg_val = 0; if (__riscv_isa_extension_available(vcpu->arch.isa, host_isa_ext)) *reg_val = 1; /* Mark the given extension as available */ -- cgit v1.2.3 From 50107e8b2a8a59d8cec7e8454e27c1f8e365acdb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Sep 2023 17:39:14 -0700 Subject: KVM: x86/mmu: Open code leaf invalidation from mmu_notifier The mmu_notifier path is a bit of a special snowflake, e.g. it zaps only a single address space (because it's per-slot), and can't always yield. Because of this, it calls kvm_tdp_mmu_zap_leafs() in ways that no one else does. Iterate manually over the leafs in response to an mmu_notifier invalidation, instead of invoking kvm_tdp_mmu_zap_leafs(). Drop the @can_yield param from kvm_tdp_mmu_zap_leafs() as its sole remaining caller unconditionally passes "true". Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20230916003916.2545000-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 13 +++++++++---- arch/x86/kvm/mmu/tdp_mmu.h | 4 ++-- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e1d011c67cc6..59f5e40b8f55 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6260,7 +6260,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (tdp_mmu_enabled) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start, - gfn_end, true, flush); + gfn_end, flush); } if (flush) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6c63f2d1675f..9c081591652b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -878,12 +878,12 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, * more SPTEs were zapped since the MMU lock was last acquired. */ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, - bool can_yield, bool flush) + bool flush) { struct kvm_mmu_page *root; for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) - flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush); + flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush); return flush; } @@ -1146,8 +1146,13 @@ retry: bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush) { - return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start, - range->end, range->may_block, flush); + struct kvm_mmu_page *root; + + for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id) + flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end, + range->may_block, flush); + + return flush; } typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 0a63b1afabd3..eb4fa345d3a4 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -20,8 +20,8 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared); -bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush); +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, + bool flush); bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp); void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); -- cgit v1.2.3 From 441a5dfcd96854cbcb625709e2694a9c60adfaab Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Sep 2023 05:44:56 -0400 Subject: KVM: x86/mmu: Do not filter address spaces in for_each_tdp_mmu_root_yield_safe() All callers except the MMU notifier want to process all address spaces. Remove the address space ID argument of for_each_tdp_mmu_root_yield_safe() and switch the MMU notifier to use __for_each_tdp_mmu_root_yield_safe(). Extracted out of a patch by Sean Christopherson Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 8 ++------ arch/x86/kvm/mmu/tdp_mmu.c | 22 +++++++++++----------- arch/x86/kvm/mmu/tdp_mmu.h | 3 +-- 3 files changed, 14 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 59f5e40b8f55..54f94f644b42 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6246,7 +6246,6 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { bool flush; - int i; if (WARN_ON_ONCE(gfn_end <= gfn_start)) return; @@ -6257,11 +6256,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); - if (tdp_mmu_enabled) { - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start, - gfn_end, flush); - } + if (tdp_mmu_enabled) + flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush); if (flush) kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 9c081591652b..aa90901d2871 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -211,8 +211,12 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ - __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false) +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ + for (_root = tdp_mmu_next_root(_kvm, NULL, false, false); \ + _root; \ + _root = tdp_mmu_next_root(_kvm, _root, false, false)) \ + if (!kvm_lockdep_assert_mmu_lock_held(_kvm, false)) { \ + } else /* * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write, @@ -877,12 +881,11 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or * more SPTEs were zapped since the MMU lock was last acquired. */ -bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, - bool flush) +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush) { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) + for_each_tdp_mmu_root_yield_safe(kvm, root) flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush); return flush; @@ -891,7 +894,6 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, void kvm_tdp_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *root; - int i; /* * Zap all roots, including invalid roots, as all SPTEs must be dropped @@ -905,10 +907,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) * is being destroyed or the userspace VMM has exited. In both cases, * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. */ - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - for_each_tdp_mmu_root_yield_safe(kvm, root, i) - tdp_mmu_zap_root(kvm, root, false); - } + for_each_tdp_mmu_root_yield_safe(kvm, root) + tdp_mmu_zap_root(kvm, root, false); } /* @@ -1148,7 +1148,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id) + __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false) flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end, range->may_block, flush); diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index eb4fa345d3a4..bc088953f929 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -20,8 +20,7 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared); -bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, - bool flush); +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush); bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp); void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); -- cgit v1.2.3 From 0df9dab891ff0d9b646d82e4fe038229e4c02451 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Sep 2023 17:39:15 -0700 Subject: KVM: x86/mmu: Stop zapping invalidated TDP MMU roots asynchronously Stop zapping invalidate TDP MMU roots via work queue now that KVM preserves TDP MMU roots until they are explicitly invalidated. Zapping roots asynchronously was effectively a workaround to avoid stalling a vCPU for an extended during if a vCPU unloaded a root, which at the time happened whenever the guest toggled CR0.WP (a frequent operation for some guest kernels). While a clever hack, zapping roots via an unbound worker had subtle, unintended consequences on host scheduling, especially when zapping multiple roots, e.g. as part of a memslot. Because the work of zapping a root is no longer bound to the task that initiated the zap, things like the CPU affinity and priority of the original task get lost. Losing the affinity and priority can be especially problematic if unbound workqueues aren't affined to a small number of CPUs, as zapping multiple roots can cause KVM to heavily utilize the majority of CPUs in the system, *beyond* the CPUs KVM is already using to run vCPUs. When deleting a memslot via KVM_SET_USER_MEMORY_REGION, the async root zap can result in KVM occupying all logical CPUs for ~8ms, and result in high priority tasks not being scheduled in in a timely manner. In v5.15, which doesn't preserve unloaded roots, the issues were even more noticeable as KVM would zap roots more frequently and could occupy all CPUs for 50ms+. Consuming all CPUs for an extended duration can lead to significant jitter throughout the system, e.g. on ChromeOS with virtio-gpu, deleting memslots is a semi-frequent operation as memslots are deleted and recreated with different host virtual addresses to react to host GPU drivers allocating and freeing GPU blobs. On ChromeOS, the jitter manifests as audio blips during games due to the audio server's tasks not getting scheduled in promptly, despite the tasks having a high realtime priority. Deleting memslots isn't exactly a fast path and should be avoided when possible, and ChromeOS is working towards utilizing MAP_FIXED to avoid the memslot shenanigans, but KVM is squarely in the wrong. Not to mention that removing the async zapping eliminates a non-trivial amount of complexity. Note, one of the subtle behaviors hidden behind the async zapping is that KVM would zap invalidated roots only once (ignoring partial zaps from things like mmu_notifier events). Preserve this behavior by adding a flag to identify roots that are scheduled to be zapped versus roots that have already been zapped but not yet freed. Add a comment calling out why kvm_tdp_mmu_invalidate_all_roots() can encounter invalid roots, as it's not at all obvious why zapping invalidated roots shouldn't simply zap all invalid roots. Reported-by: Pattara Teerapong Cc: David Stevens Cc: Yiwei Zhang Cc: Paul Hsia Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20230916003916.2545000-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +- arch/x86/kvm/mmu/mmu.c | 13 +--- arch/x86/kvm/mmu/mmu_internal.h | 15 +++-- arch/x86/kvm/mmu/tdp_mmu.c | 133 +++++++++++++++++----------------------- arch/x86/kvm/mmu/tdp_mmu.h | 2 +- arch/x86/kvm/x86.c | 5 +- 6 files changed, 68 insertions(+), 103 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1a4def36d5bb..17715cb8731d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1419,7 +1419,6 @@ struct kvm_arch { * the thread holds the MMU lock in write mode. */ spinlock_t tdp_mmu_pages_lock; - struct workqueue_struct *tdp_mmu_zap_wq; #endif /* CONFIG_X86_64 */ /* @@ -1835,7 +1834,7 @@ void kvm_mmu_vendor_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); -int kvm_mmu_init_vm(struct kvm *kvm); +void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 54f94f644b42..f7901cb4d2fa 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6167,20 +6167,15 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } -int kvm_mmu_init_vm(struct kvm *kvm) +void kvm_mmu_init_vm(struct kvm *kvm) { - int r; - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); - if (tdp_mmu_enabled) { - r = kvm_mmu_init_tdp_mmu(kvm); - if (r < 0) - return r; - } + if (tdp_mmu_enabled) + kvm_mmu_init_tdp_mmu(kvm); kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; @@ -6189,8 +6184,6 @@ int kvm_mmu_init_vm(struct kvm *kvm) kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; - - return 0; } static void mmu_free_vm_memory_caches(struct kvm *kvm) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index b102014e2c60..decc1f153669 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -58,7 +58,12 @@ struct kvm_mmu_page { bool tdp_mmu_page; bool unsync; - u8 mmu_valid_gen; + union { + u8 mmu_valid_gen; + + /* Only accessed under slots_lock. */ + bool tdp_mmu_scheduled_root_to_zap; + }; /* * The shadow page can't be replaced by an equivalent huge page @@ -100,13 +105,7 @@ struct kvm_mmu_page { struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ tdp_ptep_t ptep; }; - union { - DECLARE_BITMAP(unsync_child_bitmap, 512); - struct { - struct work_struct tdp_mmu_async_work; - void *tdp_mmu_async_data; - }; - }; + DECLARE_BITMAP(unsync_child_bitmap, 512); /* * Tracks shadow pages that, if zapped, would allow KVM to create an NX diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index aa90901d2871..6cd4dd631a2f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -12,18 +12,10 @@ #include /* Initializes the TDP MMU for the VM, if enabled. */ -int kvm_mmu_init_tdp_mmu(struct kvm *kvm) +void kvm_mmu_init_tdp_mmu(struct kvm *kvm) { - struct workqueue_struct *wq; - - wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); - if (!wq) - return -ENOMEM; - INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); - kvm->arch.tdp_mmu_zap_wq = wq; - return 1; } /* Arbitrarily returns true so that this may be used in if statements. */ @@ -46,20 +38,15 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) * ultimately frees all roots. */ kvm_tdp_mmu_invalidate_all_roots(kvm); - - /* - * Destroying a workqueue also first flushes the workqueue, i.e. no - * need to invoke kvm_tdp_mmu_zap_invalidated_roots(). - */ - destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); + kvm_tdp_mmu_zap_invalidated_roots(kvm); WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* * Ensure that all the outstanding RCU callbacks to free shadow pages - * can run before the VM is torn down. Work items on tdp_mmu_zap_wq - * can call kvm_tdp_mmu_put_root and create new callbacks. + * can run before the VM is torn down. Putting the last reference to + * zapped roots will create new callbacks. */ rcu_barrier(); } @@ -86,46 +73,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) tdp_mmu_free_sp(sp); } -static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, - bool shared); - -static void tdp_mmu_zap_root_work(struct work_struct *work) -{ - struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page, - tdp_mmu_async_work); - struct kvm *kvm = root->tdp_mmu_async_data; - - read_lock(&kvm->mmu_lock); - - /* - * A TLB flush is not necessary as KVM performs a local TLB flush when - * allocating a new root (see kvm_mmu_load()), and when migrating vCPU - * to a different pCPU. Note, the local TLB flush on reuse also - * invalidates any paging-structure-cache entries, i.e. TLB entries for - * intermediate paging structures, that may be zapped, as such entries - * are associated with the ASID on both VMX and SVM. - */ - tdp_mmu_zap_root(kvm, root, true); - - /* - * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for - * avoiding an infinite loop. By design, the root is reachable while - * it's being asynchronously zapped, thus a different task can put its - * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an - * asynchronously zapped root is unavoidable. - */ - kvm_tdp_mmu_put_root(kvm, root, true); - - read_unlock(&kvm->mmu_lock); -} - -static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root) -{ - root->tdp_mmu_async_data = kvm; - INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work); - queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work); -} - void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared) { @@ -211,11 +158,11 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ - for (_root = tdp_mmu_next_root(_kvm, NULL, false, false); \ +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \ + for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false); \ _root; \ - _root = tdp_mmu_next_root(_kvm, _root, false, false)) \ - if (!kvm_lockdep_assert_mmu_lock_held(_kvm, false)) { \ + _root = tdp_mmu_next_root(_kvm, _root, _shared, false)) \ + if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \ } else /* @@ -296,7 +243,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) * by a memslot update or by the destruction of the VM. Initialize the * refcount to two; one reference for the vCPU, and one reference for * the TDP MMU itself, which is held until the root is invalidated and - * is ultimately put by tdp_mmu_zap_root_work(). + * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots(). */ refcount_set(&root->tdp_mmu_root_count, 2); @@ -885,7 +832,7 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush) { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root) + for_each_tdp_mmu_root_yield_safe(kvm, root, false) flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush); return flush; @@ -907,7 +854,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) * is being destroyed or the userspace VMM has exited. In both cases, * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. */ - for_each_tdp_mmu_root_yield_safe(kvm, root) + for_each_tdp_mmu_root_yield_safe(kvm, root, false) tdp_mmu_zap_root(kvm, root, false); } @@ -917,18 +864,47 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) */ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) { - flush_workqueue(kvm->arch.tdp_mmu_zap_wq); + struct kvm_mmu_page *root; + + read_lock(&kvm->mmu_lock); + + for_each_tdp_mmu_root_yield_safe(kvm, root, true) { + if (!root->tdp_mmu_scheduled_root_to_zap) + continue; + + root->tdp_mmu_scheduled_root_to_zap = false; + KVM_BUG_ON(!root->role.invalid, kvm); + + /* + * A TLB flush is not necessary as KVM performs a local TLB + * flush when allocating a new root (see kvm_mmu_load()), and + * when migrating a vCPU to a different pCPU. Note, the local + * TLB flush on reuse also invalidates paging-structure-cache + * entries, i.e. TLB entries for intermediate paging structures, + * that may be zapped, as such entries are associated with the + * ASID on both VMX and SVM. + */ + tdp_mmu_zap_root(kvm, root, true); + + /* + * The referenced needs to be put *after* zapping the root, as + * the root must be reachable by mmu_notifiers while it's being + * zapped + */ + kvm_tdp_mmu_put_root(kvm, root, true); + } + + read_unlock(&kvm->mmu_lock); } /* * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that * is about to be zapped, e.g. in response to a memslots update. The actual - * zapping is performed asynchronously. Using a separate workqueue makes it - * easy to ensure that the destruction is performed before the "fast zap" - * completes, without keeping a separate list of invalidated roots; the list is - * effectively the list of work items in the workqueue. + * zapping is done separately so that it happens with mmu_lock with read, + * whereas invalidating roots must be done with mmu_lock held for write (unless + * the VM is being destroyed). * - * Note, the asynchronous worker is gifted the TDP MMU's reference. + * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference. * See kvm_tdp_mmu_get_vcpu_root_hpa(). */ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) @@ -953,19 +929,20 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) /* * As above, mmu_lock isn't held when destroying the VM! There can't * be other references to @kvm, i.e. nothing else can invalidate roots - * or be consuming roots, but walking the list of roots does need to be - * guarded against roots being deleted by the asynchronous zap worker. + * or get/put references to roots. */ - rcu_read_lock(); - - list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { + list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { + /* + * Note, invalid roots can outlive a memslot update! Invalid + * roots must be *zapped* before the memslot update completes, + * but a different task can acquire a reference and keep the + * root alive after its been zapped. + */ if (!root->role.invalid) { + root->tdp_mmu_scheduled_root_to_zap = true; root->role.invalid = true; - tdp_mmu_schedule_zap_root(kvm, root); } } - - rcu_read_unlock(); } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index bc088953f929..733a3aef3a96 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -7,7 +7,7 @@ #include "spte.h" -int kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6c9c81e82e65..9f18b06bbda6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12308,9 +12308,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto out; - ret = kvm_mmu_init_vm(kvm); - if (ret) - goto out_page_track; + kvm_mmu_init_vm(kvm); ret = static_call(kvm_x86_vm_init)(kvm); if (ret) @@ -12355,7 +12353,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) out_uninit_mmu: kvm_mmu_uninit_vm(kvm); -out_page_track: kvm_page_track_cleanup(kvm); out: return ret; -- cgit v1.2.3 From e8d93d5d93f85949e7299be289c6e7e1154b2f78 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 Sep 2023 17:06:34 -0400 Subject: KVM: SVM: INTERCEPT_RDTSCP is never intercepted anyway svm_recalc_instruction_intercepts() is always called at least once before the vCPU is started, so the setting or clearing of the RDTSCP intercept can be dropped from the TSC_AUX virtualization support. Extracted from a patch by Tom Lendacky. Cc: stable@vger.kernel.org Fixes: 296d5a17e793 ("KVM: SEV-ES: Use V_TSC_AUX if available instead of RDTSC/MSR_TSC_AUX intercepts") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index b9a0a939d59f..fa1fb81323b5 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3027,11 +3027,8 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) { + guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1); - if (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP)) - svm_clr_intercept(svm, INTERCEPT_RDTSCP); - } } void sev_init_vmcb(struct vcpu_svm *svm) -- cgit v1.2.3 From e0096d01c4fcb8c96c05643cfc2c20ab78eae4da Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 15 Sep 2023 15:54:30 -0500 Subject: KVM: SVM: Fix TSC_AUX virtualization setup The checks for virtualizing TSC_AUX occur during the vCPU reset processing path. However, at the time of initial vCPU reset processing, when the vCPU is first created, not all of the guest CPUID information has been set. In this case the RDTSCP and RDPID feature support for the guest is not in place and so TSC_AUX virtualization is not established. This continues for each vCPU created for the guest. On the first boot of an AP, vCPU reset processing is executed as a result of an APIC INIT event, this time with all of the guest CPUID information set, resulting in TSC_AUX virtualization being enabled, but only for the APs. The BSP always sees a TSC_AUX value of 0 which probably went unnoticed because, at least for Linux, the BSP TSC_AUX value is 0. Move the TSC_AUX virtualization enablement out of the init_vmcb() path and into the vcpu_after_set_cpuid() path to allow for proper initialization of the support after the guest CPUID information has been set. With the TSC_AUX virtualization support now in the vcpu_set_after_cpuid() path, the intercepts must be either cleared or set based on the guest CPUID input. Fixes: 296d5a17e793 ("KVM: SEV-ES: Use V_TSC_AUX if available instead of RDTSC/MSR_TSC_AUX intercepts") Signed-off-by: Tom Lendacky Message-Id: <4137fbcb9008951ab5f0befa74a0399d2cce809a.1694811272.git.thomas.lendacky@amd.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 31 ++++++++++++++++++++++++++----- arch/x86/kvm/svm/svm.c | 9 ++------- arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 29 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index fa1fb81323b5..4900c078045a 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2962,6 +2962,32 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) count, in); } +static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { + bool v_tsc_aux = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || + guest_cpuid_has(vcpu, X86_FEATURE_RDPID); + + set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux); + } +} + +void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_cpuid_entry2 *best; + + /* For sev guests, the memory encryption bit is not reserved in CR3. */ + best = kvm_find_cpuid_entry(vcpu, 0x8000001F); + if (best) + vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); + + if (sev_es_guest(svm->vcpu.kvm)) + sev_es_vcpu_after_set_cpuid(svm); +} + static void sev_es_init_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -3024,11 +3050,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); - - if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && - (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) - set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1); } void sev_init_vmcb(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f283eb47f6ac..aef1ddf0b705 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4284,7 +4284,6 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_cpuid_entry2 *best; /* * SVM doesn't provide a way to disable just XSAVES in the guest, KVM @@ -4328,12 +4327,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); - /* For sev guests, the memory encryption bit is not reserved in CR3. */ - if (sev_guest(vcpu->kvm)) { - best = kvm_find_cpuid_entry(vcpu, 0x8000001F); - if (best) - vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); - } + if (sev_guest(vcpu->kvm)) + sev_vcpu_after_set_cpuid(svm); init_vmcb_after_set_cpuid(vcpu); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f41253958357..be67ab7fdd10 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -684,6 +684,7 @@ void __init sev_hardware_setup(void); void sev_hardware_unsetup(void); int sev_cpu_init(struct svm_cpu_data *sd); void sev_init_vmcb(struct vcpu_svm *svm); +void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); void sev_free_vcpu(struct kvm_vcpu *vcpu); int sev_handle_vmgexit(struct kvm_vcpu *vcpu); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); -- cgit v1.2.3 From 916e3e5f26abc165437950daff370c0693572ef4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 15 Sep 2023 15:54:32 -0500 Subject: KVM: SVM: Do not use user return MSR support for virtualized TSC_AUX When the TSC_AUX MSR is virtualized, the TSC_AUX value is swap type "B" within the VMSA. This means that the guest value is loaded on VMRUN and the host value is restored from the host save area on #VMEXIT. Since the value is restored on #VMEXIT, the KVM user return MSR support for TSC_AUX can be replaced by populating the host save area with the current host value of TSC_AUX. And, since TSC_AUX is not changed by Linux post-boot, the host save area can be set once in svm_hardware_enable(). This eliminates the two WRMSR instructions associated with the user return MSR support. Signed-off-by: Tom Lendacky Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index aef1ddf0b705..9507df93f410 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -683,6 +683,21 @@ static int svm_hardware_enable(void) amd_pmu_enable_virt(); + /* + * If TSC_AUX virtualization is supported, TSC_AUX becomes a swap type + * "B" field (see sev_es_prepare_switch_to_guest()) for SEV-ES guests. + * Since Linux does not change the value of TSC_AUX once set, prime the + * TSC_AUX field now to avoid a RDMSR on every vCPU run. + */ + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { + struct sev_es_save_area *hostsa; + u32 msr_hi; + + hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); + + rdmsr(MSR_TSC_AUX, hostsa->tsc_aux, msr_hi); + } + return 0; } @@ -1532,7 +1547,14 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) if (tsc_scaling) __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); - if (likely(tsc_aux_uret_slot >= 0)) + /* + * TSC_AUX is always virtualized for SEV-ES guests when the feature is + * available. The user return MSR support is not required in this case + * because TSC_AUX is restored on #VMEXIT from the host save area + * (which has been initialized in svm_hardware_enable()). + */ + if (likely(tsc_aux_uret_slot >= 0) && + (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); svm->guest_state_loaded = true; @@ -3086,6 +3108,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; break; case MSR_TSC_AUX: + /* + * TSC_AUX is always virtualized for SEV-ES guests when the + * feature is available. The user return MSR support is not + * required in this case because TSC_AUX is restored on #VMEXIT + * from the host save area (which has been initialized in + * svm_hardware_enable()). + */ + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) + break; + /* * TSC_AUX is usually changed only during boot and never read * directly. Intercept TSC_AUX instead of exposing it to the -- cgit v1.2.3