Diffstat (limited to 'arch/x86/kvm/mmu/mmu.c')
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 91
1 file changed, 72 insertions(+), 19 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 2d6cdeab1f8a..992e651540e8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -47,18 +47,18 @@
 #include <linux/kern_levels.h>
 #include <linux/kstrtox.h>
 #include <linux/kthread.h>
+#include <linux/wordpart.h>
 
 #include <asm/page.h>
 #include <asm/memtype.h>
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
 #include <asm/set_memory.h>
+#include <asm/spec-ctrl.h>
 #include <asm/vmx.h>
 
 #include "trace.h"
 
-extern bool itlb_multihit_kvm_mitigation;
-
 static bool nx_hugepage_mitigation_hard_disabled;
 
 int __read_mostly nx_huge_pages = -1;
@@ -263,7 +263,7 @@ static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
 static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
 						  struct kvm_mmu *mmu)
 {
-	if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
+	if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
 		return kvm_read_cr3(vcpu);
 
 	return mmu->get_guest_pgd(vcpu);
@@ -3110,7 +3110,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
 	/*
 	 * Read each entry once.  As above, a non-leaf entry can be promoted to
	 * a huge page _during_ this walk.  Re-reading the entry could send the
-	 * walk into the weeds, e.g. p*d_large() returns false (sees the old
+	 * walk into the weeds, e.g. p*d_leaf() returns false (sees the old
 	 * value) and then p*d_offset() walks into the target huge page instead
 	 * of the old page table (sees the new value).
 	 */
@@ -3126,7 +3126,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
 	if (pud_none(pud) || !pud_present(pud))
 		goto out;
 
-	if (pud_large(pud)) {
+	if (pud_leaf(pud)) {
 		level = PG_LEVEL_1G;
 		goto out;
 	}
@@ -3135,7 +3135,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
 	if (pmd_none(pmd) || !pmd_present(pmd))
 		goto out;
 
-	if (pmd_large(pmd))
+	if (pmd_leaf(pmd))
 		level = PG_LEVEL_2M;
 
 out:
@@ -3575,10 +3575,14 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 	if (WARN_ON_ONCE(!sp))
 		return;
 
-	if (is_tdp_mmu_page(sp))
+	if (is_tdp_mmu_page(sp)) {
+		lockdep_assert_held_read(&kvm->mmu_lock);
 		kvm_tdp_mmu_put_root(kvm, sp);
-	else if (!--sp->root_count && sp->role.invalid)
-		kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+	} else {
+		lockdep_assert_held_write(&kvm->mmu_lock);
+		if (!--sp->root_count && sp->role.invalid)
+			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+	}
 
 	*root_hpa = INVALID_PAGE;
 }
@@ -3587,6 +3591,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
 			ulong roots_to_free)
 {
+	bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct;
 	int i;
 	LIST_HEAD(invalid_list);
 	bool free_active_root;
@@ -3609,7 +3614,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
 		return;
 	}
 
-	write_lock(&kvm->mmu_lock);
+	if (is_tdp_mmu)
+		read_lock(&kvm->mmu_lock);
+	else
+		write_lock(&kvm->mmu_lock);
 
 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 		if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
@@ -3635,8 +3643,13 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
 		mmu->root.pgd = 0;
 	}
 
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-	write_unlock(&kvm->mmu_lock);
+	if (is_tdp_mmu) {
+		read_unlock(&kvm->mmu_lock);
+		WARN_ON_ONCE(!list_empty(&invalid_list));
+	} else {
+		kvm_mmu_commit_zap_page(kvm, &invalid_list);
+		write_unlock(&kvm->mmu_lock);
+	}
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
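The kvm_mmu_free_roots() hunks above switch TDP MMU root freeing from mmu_lock held for write to mmu_lock held for read; the lockdep assertions added to mmu_free_root_page() document the new rule. The reason this is safe is that a TDP MMU root's lifetime is governed by its reference count, not by the lock, so readers can drop references concurrently, whereas zapping shadow pages mutates shared lists and still needs the lock for write. A minimal user-space sketch of that idea, with invented names (toy_root, toy_put_root, toy_mmu_lock) and pthread/C11 primitives standing in for the kernel's rwlock and refcount helpers:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_root {
	atomic_int refcount;	/* analogue of the TDP MMU root refcount */
};

static pthread_rwlock_t toy_mmu_lock = PTHREAD_RWLOCK_INITIALIZER;

/*
 * Analogue of kvm_tdp_mmu_put_root(): safe with the lock held only for
 * read, because nothing shared is mutated except the atomic refcount.
 */
static void toy_put_root(struct toy_root *root)
{
	pthread_rwlock_rdlock(&toy_mmu_lock);
	if (atomic_fetch_sub(&root->refcount, 1) == 1)
		free(root);	/* last reference dropped */
	pthread_rwlock_unlock(&toy_mmu_lock);
}

int main(void)
{
	struct toy_root *root = malloc(sizeof(*root));

	atomic_init(&root->refcount, 2);
	toy_put_root(root);	/* 2 -> 1, root stays alive */
	toy_put_root(root);	/* 1 -> 0, root is freed */
	printf("both references dropped\n");
	return 0;
}

This also explains the WARN_ON_ONCE(!list_empty(&invalid_list)) on the read-lock path: nothing taken under the read lock may have queued shadow pages for zapping.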
@@ -3693,15 +3706,15 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 	unsigned i;
 	int r;
 
+	if (tdp_mmu_enabled)
+		return kvm_tdp_mmu_alloc_root(vcpu);
+
 	write_lock(&vcpu->kvm->mmu_lock);
 	r = make_mmu_pages_available(vcpu);
 	if (r < 0)
 		goto out_unlock;
 
-	if (tdp_mmu_enabled) {
-		root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
-		mmu->root.hpa = root;
-	} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+	if (shadow_root_level >= PT64_ROOT_4LEVEL) {
 		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
 		mmu->root.hpa = root;
 	} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
@@ -4405,6 +4418,31 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	smp_rmb();
 
+	/*
+	 * Check for a relevant mmu_notifier invalidation event before getting
+	 * the pfn from the primary MMU, and before acquiring mmu_lock.
+	 *
+	 * For mmu_lock, if there is an in-progress invalidation and the kernel
+	 * allows preemption, the invalidation task may drop mmu_lock and yield
+	 * in response to mmu_lock being contended, which is *very* counter-
+	 * productive as this vCPU can't actually make forward progress until
+	 * the invalidation completes.
+	 *
+	 * Retrying now can also avoid unnecessary lock contention in the primary
+	 * MMU, as the primary MMU doesn't necessarily hold a single lock for
+	 * the duration of the invalidation, i.e. faulting in a conflicting pfn
+	 * can cause the invalidation to take longer by holding locks that are
+	 * needed to complete the invalidation.
+	 *
+	 * Do the pre-check even for non-preemptible kernels, i.e. even if KVM
+	 * will never yield mmu_lock in response to contention, as this vCPU is
+	 * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
+	 * to detect retry guarantees the worst case latency for the vCPU.
+	 */
+	if (fault->slot &&
+	    mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+		return RET_PF_RETRY;
+
 	ret = __kvm_faultin_pfn(vcpu, fault);
 	if (ret != RET_PF_CONTINUE)
 		return ret;
@@ -4415,6 +4453,18 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 	if (unlikely(!fault->slot))
 		return kvm_handle_noslot_fault(vcpu, fault, access);
 
+	/*
+	 * Check again for a relevant mmu_notifier invalidation event purely to
+	 * avoid contending mmu_lock.  Most invalidations will be detected by
+	 * the previous check, but checking is extremely cheap relative to the
+	 * overall cost of failing to detect the invalidation until after
+	 * mmu_lock is acquired.
+	 */
+	if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
+		kvm_release_pfn_clean(fault->pfn);
+		return RET_PF_RETRY;
+	}
+
 	return RET_PF_CONTINUE;
 }
@@ -4442,6 +4492,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
 	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
 		return true;
 
+	/*
+	 * Check for a relevant mmu_notifier invalidation event one last time
+	 * now that mmu_lock is held, as the "unsafe" checks performed without
+	 * holding mmu_lock can get false negatives.
+	 */
 	return fault->slot &&
 	       mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
 }
@@ -6997,9 +7052,7 @@ int kvm_mmu_vendor_module_init(void)
 
 	kvm_mmu_reset_all_pte_masks();
 
-	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
-						sizeof(struct pte_list_desc),
-						0, SLAB_ACCOUNT, NULL);
+	pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT);
 	if (!pte_list_desc_cache)
 		goto out;
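The kvm_faultin_pfn() hunks snapshot mmu_invalidate_seq (ordered by smp_rmb()) and consult mmu_invalidate_retry_gfn_unsafe() twice before mmu_lock is ever taken, once before and once after the pfn lookup; is_page_fault_stale() then performs the authoritative check under the lock, since the lockless checks can get false negatives. A condensed user-space sketch of that pattern, with invented names (toy_seq, toy_retry_unsafe, toy_fault) and a plain mutex standing in for mmu_lock:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_long toy_seq;	/* bumped by each "invalidation" */
static pthread_mutex_t toy_lock = PTHREAD_MUTEX_INITIALIZER;

/* Rough analogue of mmu_invalidate_retry_gfn_unsafe(): lockless, cheap. */
static bool toy_retry_unsafe(long snapshot)
{
	return atomic_load(&toy_seq) != snapshot;
}

/* Returns 0 on success, -1 for "retry the fault" (RET_PF_RETRY analogue). */
static int toy_fault(void)
{
	long snapshot = atomic_load(&toy_seq);

	/* ... expensive pfn lookup in the primary MMU would go here ... */

	/* Pre-check: bail before contending the lock if already stale. */
	if (toy_retry_unsafe(snapshot))
		return -1;

	pthread_mutex_lock(&toy_lock);
	/* Authoritative re-check now that the lock is held. */
	if (toy_retry_unsafe(snapshot)) {
		pthread_mutex_unlock(&toy_lock);
		return -1;
	}
	/* ... install the mapping ... */
	pthread_mutex_unlock(&toy_lock);
	return 0;
}

int main(void)
{
	long stale = atomic_load(&toy_seq);

	atomic_fetch_add(&toy_seq, 1);	/* an invalidation races in */
	printf("stale snapshot must retry: %d\n", toy_retry_unsafe(stale));
	printf("fresh fault succeeds: %d\n", toy_fault());
	return 0;
}

The early, unsafe checks only shed faults that are guaranteed to retry anyway; correctness still rests entirely on the final check made while the lock is held.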