From 762b33eb90c9cc9227bf035cf2cf6f1458afecdb Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 23 May 2023 11:29:47 +0800 Subject: KVM: x86/mmu: Assert on @mmu in the __kvm_mmu_invalidate_addr() Add assertion to track that "mmu == vcpu->arch.mmu" is always true in the context of __kvm_mmu_invalidate_addr(). for_each_shadow_entry_using_root() and kvm_sync_spte() operate on vcpu->arch.mmu, but the only reason that doesn't cause explosions is because handle_invept() frees roots instead of doing a manual invalidation. As of now, there are no major roadblocks to switching INVEPT emulation over to use kvm_mmu_invalidate_addr(). Suggested-by: Sean Christopherson Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20230523032947.60041-1-likexu@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kvm/mmu/mmu.c') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c8961f45e3b1..258f12235874 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5797,6 +5797,14 @@ static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu vcpu_clear_mmio_info(vcpu, addr); + /* + * Walking and synchronizing SPTEs both assume they are operating in + * the context of the current MMU, and would need to be reworked if + * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT. + */ + if (WARN_ON_ONCE(mmu != vcpu->arch.mmu)) + return; + if (!VALID_PAGE(root_hpa)) return; -- cgit v1.2.3 From 0a8a5f2c8c266e9d94fb45f76a26cff135d0051c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Jun 2023 18:15:17 -0700 Subject: KVM: x86: Use standard mmu_notifier invalidate hooks for APIC access page Now that KVM honors past and in-progress mmu_notifier invalidations when reloading the APIC-access page, use KVM's "standard" invalidation hooks to trigger a reload and delete the one-off usage of invalidate_range(). Aside from eliminating one-off code in KVM, dropping KVM's use of invalidate_range() will allow common mmu_notifier to redefine the API to be more strictly focused on invalidating secondary TLBs that share the primary MMU's page tables. Suggested-by: Jason Gunthorpe Cc: Alistair Popple Cc: Robin Murphy Reviewed-by: Alistair Popple Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20230602011518.787006-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 3 +++ arch/x86/kvm/x86.c | 14 -------------- include/linux/kvm_host.h | 3 --- virt/kvm/kvm_main.c | 18 ------------------ 4 files changed, 3 insertions(+), 35 deletions(-) (limited to 'arch/x86/kvm/mmu/mmu.c') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c8961f45e3b1..01a11ce68e57 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1600,6 +1600,9 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); + if (range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) + kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + return flush; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c0778ca39650..f962b7e3487e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10435,20 +10435,6 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); } -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end) -{ - unsigned long apic_address; - - /* - * The physical address of apic access page is stored in the VMCS. - * Update it when it becomes invalid. - */ - apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (start <= apic_address && apic_address < end) - kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); -} - void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0e571e973bc2..cb66f4100be7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2237,9 +2237,6 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, } #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end); - void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 479802a892d4..f3c7c3c90161 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -154,11 +154,6 @@ static unsigned long long kvm_active_vms; static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask); -__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end) -{ -} - __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { } @@ -521,18 +516,6 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) return container_of(mn, struct kvm, mmu_notifier); } -static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - struct kvm *kvm = mmu_notifier_to_kvm(mn); - int idx; - - idx = srcu_read_lock(&kvm->srcu); - kvm_arch_mmu_notifier_invalidate_range(kvm, start, end); - srcu_read_unlock(&kvm->srcu, idx); -} - typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start, @@ -892,7 +875,6 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { - .invalidate_range = kvm_mmu_notifier_invalidate_range, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, -- cgit v1.2.3 From 0a3869e14d4a5e1016aad6bc6c5b70f82bc0dbbe Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Jun 2023 18:15:18 -0700 Subject: KVM: x86/mmu: Trigger APIC-access page reload iff vendor code cares Request an APIC-access page reload when the backing page is migrated (or unmapped) if and only if vendor code actually plugs the backing pfn into structures that reside outside of KVM's MMU. This avoids kicking all vCPUs in the (hopefully infrequent) scenario where the backing page is migrated/invalidated. Unlike VMX's APICv, SVM's AVIC doesn't plug the backing pfn directly into the VMCB and so doesn't need a hook to invalidate an out-of-MMU "mapping". Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20230602011518.787006-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/mmu/mmu.c') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 01a11ce68e57..beb507d82adf 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1600,7 +1600,8 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); - if (range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) + if (kvm_x86_ops.set_apic_access_page_addr && + range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); return flush; -- cgit v1.2.3 From 0b210faf337314e4bc88e796218bc70c72a51209 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Jun 2023 17:58:59 -0700 Subject: KVM: x86/mmu: Add "never" option to allow sticky disabling of nx_huge_pages Add a "never" option to the nx_huge_pages module param to allow userspace to do a one-way hard disabling of the mitigation, and don't create the per-VM recovery threads when the mitigation is hard disabled. Letting userspace pinky swear that userspace doesn't want to enable NX mitigation (without reloading KVM) allows certain use cases to avoid the latency problems associated with spawning a kthread for each VM. E.g. in FaaS use cases, the guest kernel is trusted and the host may create 100+ VMs per logical CPU, which can result in 100ms+ latencies when a burst of VMs is created. Reported-by: Li RongQing Closes: https://lore.kernel.org/all/1679555884-32544-1-git-send-email-lirongqing@baidu.com Cc: Yong He Cc: Robert Hoo Cc: Kai Huang Reviewed-by: Robert Hoo Acked-by: Kai Huang Tested-by: Luiz Capitulino Reviewed-by: Li RongQing Link: https://lore.kernel.org/r/20230602005859.784190-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm/mmu/mmu.c') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 258f12235874..a2d0d06a8768 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -58,6 +58,8 @@ extern bool itlb_multihit_kvm_mitigation; +static bool nx_hugepage_mitigation_hard_disabled; + int __read_mostly nx_huge_pages = -1; static uint __read_mostly nx_huge_pages_recovery_period_ms; #ifdef CONFIG_PREEMPT_RT @@ -67,12 +69,13 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; static uint __read_mostly nx_huge_pages_recovery_ratio = 60; #endif +static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp); static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, - .get = param_get_bool, + .get = get_nx_huge_pages, }; static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = { @@ -6852,6 +6855,14 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) +{ + if (nx_hugepage_mitigation_hard_disabled) + return sprintf(buffer, "never\n"); + + return param_get_bool(buffer, kp); +} + static bool get_nx_auto_mode(void) { /* Return true when CPU has the bug, and mitigations are ON */ @@ -6868,15 +6879,29 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) bool old_val = nx_huge_pages; bool new_val; + if (nx_hugepage_mitigation_hard_disabled) + return -EPERM; + /* In "auto" mode deploy workaround only if CPU has the bug. */ - if (sysfs_streq(val, "off")) + if (sysfs_streq(val, "off")) { new_val = 0; - else if (sysfs_streq(val, "force")) + } else if (sysfs_streq(val, "force")) { new_val = 1; - else if (sysfs_streq(val, "auto")) + } else if (sysfs_streq(val, "auto")) { new_val = get_nx_auto_mode(); - else if (kstrtobool(val, &new_val) < 0) + } else if (sysfs_streq(val, "never")) { + new_val = 0; + + mutex_lock(&kvm_lock); + if (!list_empty(&vm_list)) { + mutex_unlock(&kvm_lock); + return -EBUSY; + } + nx_hugepage_mitigation_hard_disabled = true; + mutex_unlock(&kvm_lock); + } else if (kstrtobool(val, &new_val) < 0) { return -EINVAL; + } __set_nx_huge_pages(new_val); @@ -7014,6 +7039,9 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel uint old_period, new_period; int err; + if (nx_hugepage_mitigation_hard_disabled) + return -EPERM; + was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); err = param_set_uint(val, kp); @@ -7169,6 +7197,9 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) { int err; + if (nx_hugepage_mitigation_hard_disabled) + return 0; + err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, "kvm-nx-lpage-recovery", &kvm->arch.nx_huge_page_recovery_thread); -- cgit v1.2.3