From 8569992d64b8f750e34b7858eac5d7daaf0f80fd Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:21:45 -0700 Subject: KVM: Use gfn instead of hva for mmu_notifier_retry Currently in mmu_notifier invalidate path, hva range is recorded and then checked against by mmu_invalidate_retry_hva() in the page fault handling path. However, for the soon-to-be-introduced private memory, a page fault may not have a hva associated, checking gfn(gpa) makes more sense. For existing hva based shared memory, gfn is expected to also work. The only downside is when aliasing multiple gfns to a single hva, the current algorithm of checking multiple ranges could result in a much larger range being rejected. Such aliasing should be uncommon, so the impact is expected small. Suggested-by: Sean Christopherson Cc: Xu Yilun Signed-off-by: Chao Peng Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba [sean: convert vmx_set_apic_access_page_addr() to gfn-based API] Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Xu Yilun Message-Id: <20231027182217.3615211-4-seanjc@google.com> Reviewed-by: Kai Huang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 ++++++---- arch/x86/kvm/vmx/vmx.c | 11 +++++------ 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b0f01d605617..b2d916f786ca 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3056,7 +3056,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) * * There are several ways to safely use this helper: * - * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before + * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before * consuming it. In this case, mmu_lock doesn't need to be held during the * lookup, but it does need to be held while checking the MMU notifier. * @@ -4366,7 +4366,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, return true; return fault->slot && - mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva); + mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -6260,7 +6260,9 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) write_lock(&kvm->mmu_lock); - kvm_mmu_invalidate_begin(kvm, 0, -1ul); + kvm_mmu_invalidate_begin(kvm); + + kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end); flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); @@ -6270,7 +6272,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (flush) kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start); - kvm_mmu_invalidate_end(kvm, 0, -1ul); + kvm_mmu_invalidate_end(kvm); write_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index be20a60047b1..40e3780d73ae 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6757,10 +6757,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) return; /* - * Grab the memslot so that the hva lookup for the mmu_notifier retry - * is guaranteed to use the same memslot as the pfn lookup, i.e. rely - * on the pfn lookup's validation of the memslot to ensure a valid hva - * is used for the retry check. + * Explicitly grab the memslot using KVM's internal slot ID to ensure + * KVM doesn't unintentionally grab a userspace memslot. 
It _should_ + * be impossible for userspace to create a memslot for the APIC when + * APICv is enabled, but paranoia won't hurt in this case. */ slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) @@ -6785,8 +6785,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) return; read_lock(&vcpu->kvm->mmu_lock); - if (mmu_invalidate_retry_hva(kvm, mmu_seq, - gfn_to_hva_memslot(slot, gfn))) { + if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) { kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); read_unlock(&vcpu->kvm->mmu_lock); goto out; -- cgit v1.2.3 From 1853d7502a19b34b2cfe4ea0698bee73323fec3a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:47 -0700 Subject: KVM: PPC: Drop dead code related to KVM_ARCH_WANT_MMU_NOTIFIER Assert that both KVM_ARCH_WANT_MMU_NOTIFIER and CONFIG_MMU_NOTIFIER are defined when KVM is enabled, and return '1' unconditionally for the CONFIG_KVM_BOOK3S_HV_POSSIBLE=n path. All flavors of PPC support for KVM select MMU_NOTIFIER, and KVM_ARCH_WANT_MMU_NOTIFIER is unconditionally defined by arch/powerpc/include/asm/kvm_host.h. Effectively dropping use of KVM_ARCH_WANT_MMU_NOTIFIER will simplify a future cleanup to turn KVM_ARCH_WANT_MMU_NOTIFIER into a Kconfig, i.e. will allow combining all of the #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) checks into a single #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER without having to worry about PPC's "bare" usage of KVM_ARCH_WANT_MMU_NOTIFIER. Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Message-Id: <20231027182217.3615211-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/powerpc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 7197c8256668..b0a512ede764 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -632,12 +632,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif case KVM_CAP_SYNC_MMU: +#if !defined(CONFIG_MMU_NOTIFIER) || !defined(KVM_ARCH_WANT_MMU_NOTIFIER) + BUILD_BUG(); +#endif #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE r = hv_enabled; -#elif defined(KVM_ARCH_WANT_MMU_NOTIFIER) - r = 1; #else - r = 0; + r = 1; #endif break; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -- cgit v1.2.3 From 4a2e993faad3880962540ab8e3b68f22e48b18e8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:48 -0700 Subject: KVM: PPC: Return '1' unconditionally for KVM_CAP_SYNC_MMU Advertise that KVM's MMU is synchronized with the primary MMU for all flavors of PPC KVM support, i.e. advertise that the MMU is synchronized when CONFIG_KVM_BOOK3S_HV_POSSIBLE=y but the VM is not using hypervisor mode (a.k.a. PR VMs). PR VMs, via kvm_unmap_gfn_range_pr(), do the right thing for mmu_notifier invalidation events, and more tellingly, KVM returns '1' for KVM_CAP_SYNC_MMU when CONFIG_KVM_BOOK3S_HV_POSSIBLE=n and CONFIG_KVM_BOOK3S_PR_POSSIBLE=y, i.e. KVM already advertises a synchronized MMU for PR VMs, just not when CONFIG_KVM_BOOK3S_HV_POSSIBLE=y. 
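For reference, a minimal userspace sketch (illustrative only, not part of this series; error handling omitted) of how a VMM queries the capability whose value changes here:

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  int main(void)
  {
          int kvm_fd = open("/dev/kvm", O_RDWR);
          int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);

          /* After this change, PPC reports '1' for both HV and PR VMs. */
          int r = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SYNC_MMU);

          printf("KVM_CAP_SYNC_MMU = %d\n", r);
          return 0;
  }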
Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/powerpc.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index b0a512ede764..8d3ec483bc2b 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -635,11 +635,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #if !defined(CONFIG_MMU_NOTIFIER) || !defined(KVM_ARCH_WANT_MMU_NOTIFIER) BUILD_BUG(); #endif -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - r = hv_enabled; -#else r = 1; -#endif break; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_HTAB_FD: -- cgit v1.2.3 From f128cf8cfbecccf95e891ae90d9c917df5117c7a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:49 -0700 Subject: KVM: Convert KVM_ARCH_WANT_MMU_NOTIFIER to CONFIG_KVM_GENERIC_MMU_NOTIFIER Convert KVM_ARCH_WANT_MMU_NOTIFIER into a Kconfig and select it where appropriate to effectively maintain existing behavior. Using a proper Kconfig will simplify building more functionality on top of KVM's mmu_notifier infrastructure. Add a forward declaration of kvm_gfn_range to kvm_types.h so that including arch/powerpc/include/asm/kvm_ppc.h's with CONFIG_KVM=n doesn't generate warnings due to kvm_gfn_range being undeclared. PPC defines hooks for PR vs. HV without guarding them via #ifdeffery, e.g. bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); Alternatively, PPC could forward declare kvm_gfn_range, but there's no good reason not to define it in common KVM. 
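To illustrate the forward-declaration point, a tiny standalone C sketch (illustrative only, not kernel code): a function-pointer prototype that only passes the struct by pointer compiles against an incomplete type, which is why the forward declaration in kvm_types.h is sufficient.

  #include <stdbool.h>

  struct kvm;             /* forward declarations only; no definitions needed */
  struct kvm_gfn_range;

  struct example_ops {
          /* Only pointers are passed, so the full definition of
           * struct kvm_gfn_range is not required at this point. */
          bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
  };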
Acked-by: Anup Patel Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/include/asm/kvm_host.h | 2 -- arch/arm64/kvm/Kconfig | 2 +- arch/loongarch/include/asm/kvm_host.h | 1 - arch/loongarch/kvm/Kconfig | 2 +- arch/mips/include/asm/kvm_host.h | 2 -- arch/mips/kvm/Kconfig | 2 +- arch/powerpc/include/asm/kvm_host.h | 2 -- arch/powerpc/kvm/Kconfig | 8 ++++---- arch/powerpc/kvm/powerpc.c | 4 +--- arch/riscv/include/asm/kvm_host.h | 2 -- arch/riscv/kvm/Kconfig | 2 +- arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/kvm/Kconfig | 2 +- include/linux/kvm_host.h | 6 +++--- include/linux/kvm_types.h | 1 + virt/kvm/Kconfig | 4 ++++ virt/kvm/kvm_main.c | 10 +++++----- 17 files changed, 23 insertions(+), 31 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 5653d3553e3e..9029fe09f3f6 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -954,8 +954,6 @@ int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); -#define KVM_ARCH_WANT_MMU_NOTIFIER - void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 83c1e09be42e..1a777715199f 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -22,7 +22,7 @@ menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM select KVM_GENERIC_HARDWARE_ENABLING - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index 11328700d4fa..b108301c2e5a 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -183,7 +183,6 @@ void kvm_flush_tlb_all(void); void kvm_flush_tlb_gpa(struct kvm_vcpu *vcpu, unsigned long gpa); int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv, bool write); -#define KVM_ARCH_WANT_MMU_NOTIFIER void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index fda425babfb2..f22bae89b07d 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -26,9 +26,9 @@ config KVM select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING + select KVM_GENERIC_MMU_NOTIFIER select KVM_MMIO select KVM_XFER_TO_GUEST_WORK - select MMU_NOTIFIER select PREEMPT_NOTIFIERS help Support hosting virtualized guest machines using diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 54a85f1d4f2c..179f320cc231 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -810,8 +810,6 @@ int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn); pgd_t *kvm_pgd_alloc(void); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); -#define KVM_ARCH_WANT_MMU_NOTIFIER - /* Emulation */ enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); int kvm_get_badinstr(u32 *opc, struct kvm_vcpu 
*vcpu, u32 *out); diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index a8cdba75f98d..c04987d2ed2e 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -25,7 +25,7 @@ config KVM select HAVE_KVM_EVENTFD select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER select INTERVAL_TREE select KVM_GENERIC_HARDWARE_ENABLING help diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 14ee0dece853..4b5c3f2acf78 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -62,8 +62,6 @@ #include -#define KVM_ARCH_WANT_MMU_NOTIFIER - #define HPTEG_CACHE_NUM (1 << 15) #define HPTEG_HASH_BITS_PTE 13 #define HPTEG_HASH_BITS_PTE_LONG 12 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 902611954200..b33358ee6424 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -42,7 +42,7 @@ config KVM_BOOK3S_64_HANDLER config KVM_BOOK3S_PR_POSSIBLE bool select KVM_MMIO - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER config KVM_BOOK3S_HV_POSSIBLE bool @@ -85,7 +85,7 @@ config KVM_BOOK3S_64_HV tristate "KVM for POWER7 and later using hypervisor mode in host" depends on KVM_BOOK3S_64 && PPC_POWERNV select KVM_BOOK3S_HV_POSSIBLE - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER select CMA help Support running unmodified book3s_64 guest kernels in @@ -194,7 +194,7 @@ config KVM_E500V2 depends on !CONTEXT_TRACKING_USER select KVM select KVM_MMIO - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER help Support running unmodified E500 guest kernels in virtual machines on E500v2 host processors. @@ -211,7 +211,7 @@ config KVM_E500MC select KVM select KVM_MMIO select KVM_BOOKE_HV - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER help Support running unmodified E500MC/E5500/E6500 guest kernels in virtual machines on E500MC/E5500/E6500 host processors. diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 8d3ec483bc2b..aac75c98a956 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -632,9 +632,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif case KVM_CAP_SYNC_MMU: -#if !defined(CONFIG_MMU_NOTIFIER) || !defined(KVM_ARCH_WANT_MMU_NOTIFIER) - BUILD_BUG(); -#endif + BUILD_BUG_ON(!IS_ENABLED(CONFIG_KVM_GENERIC_MMU_NOTIFIER)); r = 1; break; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 0eefd9c991ae..6964dd235e97 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -267,8 +267,6 @@ struct kvm_vcpu_arch { static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} -#define KVM_ARCH_WANT_MMU_NOTIFIER - #define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12 void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid, diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index dfc237d7875b..ae2e05f050ec 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -30,7 +30,7 @@ config KVM select KVM_GENERIC_HARDWARE_ENABLING select KVM_MMIO select KVM_XFER_TO_GUEST_WORK - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER select PREEMPT_NOTIFIERS help Support hosting virtualized guest machines. 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d7036982332e..6f559fb75e6d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2141,8 +2141,6 @@ enum { # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) #endif -#define KVM_ARCH_WANT_MMU_NOTIFIER - int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_cpu_has_extint(struct kvm_vcpu *v); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 950c12868d30..e61383674c75 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -24,7 +24,7 @@ config KVM depends on HIGH_RES_TIMERS depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE select HAVE_KVM_IRQFD diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 11d091688346..5faba69403ac 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -253,7 +253,7 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif -#ifdef KVM_ARCH_WANT_MMU_NOTIFIER +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER union kvm_mmu_notifier_arg { pte_t pte; }; @@ -783,7 +783,7 @@ struct kvm { struct hlist_head irq_ack_notifier_list; #endif -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER struct mmu_notifier mmu_notifier; unsigned long mmu_invalidate_seq; long mmu_invalidate_in_progress; @@ -1946,7 +1946,7 @@ extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) { if (unlikely(kvm->mmu_invalidate_in_progress)) diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 6f4737d5046a..9d1f7835d8c1 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -6,6 +6,7 @@ struct kvm; struct kvm_async_pf; struct kvm_device_ops; +struct kvm_gfn_range; struct kvm_interrupt; struct kvm_irq_routing_table; struct kvm_memory_slot; diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 484d0873061c..ecae2914c97e 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -92,3 +92,7 @@ config HAVE_KVM_PM_NOTIFIER config KVM_GENERIC_HARDWARE_ENABLING bool + +config KVM_GENERIC_MMU_NOTIFIER + select MMU_NOTIFIER + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5422ce20dcba..dc81279ea385 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -535,7 +535,7 @@ void kvm_destroy_vcpus(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_destroy_vcpus); -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) { return container_of(mn, struct kvm, mmu_notifier); @@ -962,14 +962,14 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) return mmu_notifier_register(&kvm->mmu_notifier, current->mm); } -#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ +#else /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */ static int kvm_init_mmu_notifier(struct kvm *kvm) { return 0; } -#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ +#endif /* 
CONFIG_KVM_GENERIC_MMU_NOTIFIER */ #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER static int kvm_pm_notifier_call(struct notifier_block *bl, @@ -1289,7 +1289,7 @@ out_err: out_err_no_debugfs: kvm_coalesced_mmio_free(kvm); out_no_coalesced_mmio: -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER if (kvm->mmu_notifier.ops) mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); #endif @@ -1349,7 +1349,7 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm->buses[i] = NULL; } kvm_coalesced_mmio_free(kvm); -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); /* * At this point, pending calls to invalidate_range_start() -- cgit v1.2.3 From bb58b90b1a8f753b582055adaf448214a8e22c31 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:50 -0700 Subject: KVM: Introduce KVM_SET_USER_MEMORY_REGION2 Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional information can be supplied without setting userspace up to fail. The padding in the new kvm_userspace_memory_region2 structure will be used to pass a file descriptor in addition to the userspace_addr, i.e. allow userspace to point at a file descriptor and map memory into a guest that is NOT mapped into host userspace. Alternatively, KVM could simply add "struct kvm_userspace_memory_region2" without a new ioctl(), but as Paolo pointed out, adding a new ioctl() makes detection of bad flags a bit more robust, e.g. if the new fd field is guarded only by a flag and not a new ioctl(), then a userspace bug (setting a "bad" flag) would generate out-of-bounds access instead of an -EINVAL error. Cc: Jarkko Sakkinen Reviewed-by: Paolo Bonzini Reviewed-by: Xiaoyao Li Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-9-seanjc@google.com> Acked-by: Kai Huang Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 22 ++++++++++++++++ arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 4 +-- include/uapi/linux/kvm.h | 13 ++++++++++ virt/kvm/kvm_main.c | 57 ++++++++++++++++++++++++++++++++++++------ 5 files changed, 87 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 7025b3751027..9edd9e436bab 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6192,6 +6192,28 @@ to know what fields can be changed for the system register described by ``op0, op1, crn, crm, op2``. KVM rejects ID register values that describe a superset of the features supported by the system. +4.140 KVM_SET_USER_MEMORY_REGION2 +--------------------------------- + +:Capability: KVM_CAP_USER_MEMORY2 +:Architectures: all +:Type: vm ioctl +:Parameters: struct kvm_userspace_memory_region2 (in) +:Returns: 0 on success, -1 on error + +:: + + struct kvm_userspace_memory_region2 { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ + __u64 pad[16]; + }; + +See KVM_SET_USER_MEMORY_REGION. + 5. 
The kvm_run structure ======================== diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2c924075f6f1..7b389f27dffc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12576,7 +12576,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, } for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - struct kvm_userspace_memory_region m; + struct kvm_userspace_memory_region2 m; m.slot = id | (i << 16); m.flags = 0; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5faba69403ac..4e741ff27af3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1146,9 +1146,9 @@ enum kvm_mr_change { }; int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region2 *mem); int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 211b86de35ac..308cc70bd6ab 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -95,6 +95,16 @@ struct kvm_userspace_memory_region { __u64 userspace_addr; /* start of the userspace allocated memory */ }; +/* for KVM_SET_USER_MEMORY_REGION2 */ +struct kvm_userspace_memory_region2 { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 userspace_addr; + __u64 pad[16]; +}; + /* * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for * userspace, other bits are reserved for kvm internal use which are defined @@ -1201,6 +1211,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 #define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230 +#define KVM_CAP_USER_MEMORY2 231 #ifdef KVM_CAP_IRQ_ROUTING @@ -1483,6 +1494,8 @@ struct kvm_vfio_spapr_tce { struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) +#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \ + struct kvm_userspace_memory_region2) /* enable ucontrol for s390 */ struct kvm_s390_ucas_mapping { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index dc81279ea385..756b94ecd511 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1580,7 +1580,15 @@ static void kvm_replace_memslot(struct kvm *kvm, } } -static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) +/* + * Flags that do not access any of the extra space of struct + * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS + * only allows these. + */ +#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \ + (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY) + +static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; @@ -1982,7 +1990,7 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, * Must be called holding kvm->slots_lock for write. 
*/ int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region2 *mem) { struct kvm_memory_slot *old, *new; struct kvm_memslots *slots; @@ -2086,7 +2094,7 @@ int __kvm_set_memory_region(struct kvm *kvm, EXPORT_SYMBOL_GPL(__kvm_set_memory_region); int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region2 *mem) { int r; @@ -2098,7 +2106,7 @@ int kvm_set_memory_region(struct kvm *kvm, EXPORT_SYMBOL_GPL(kvm_set_memory_region); static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region2 *mem) { if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; @@ -4568,6 +4576,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) { switch (arg) { case KVM_CAP_USER_MEMORY: + case KVM_CAP_USER_MEMORY2: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: case KVM_CAP_INTERNAL_ERROR_DATA: @@ -4823,6 +4832,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) return fd; } +#define SANITY_CHECK_MEM_REGION_FIELD(field) \ +do { \ + BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \ + offsetof(struct kvm_userspace_memory_region2, field)); \ + BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \ + sizeof_field(struct kvm_userspace_memory_region2, field)); \ +} while (0) + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4845,15 +4862,39 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); break; } + case KVM_SET_USER_MEMORY_REGION2: case KVM_SET_USER_MEMORY_REGION: { - struct kvm_userspace_memory_region kvm_userspace_mem; + struct kvm_userspace_memory_region2 mem; + unsigned long size; + + if (ioctl == KVM_SET_USER_MEMORY_REGION) { + /* + * Fields beyond struct kvm_userspace_memory_region shouldn't be + * accessed, but avoid leaking kernel memory in case of a bug. + */ + memset(&mem, 0, sizeof(mem)); + size = sizeof(struct kvm_userspace_memory_region); + } else { + size = sizeof(struct kvm_userspace_memory_region2); + } + + /* Ensure the common parts of the two structs are identical. */ + SANITY_CHECK_MEM_REGION_FIELD(slot); + SANITY_CHECK_MEM_REGION_FIELD(flags); + SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr); + SANITY_CHECK_MEM_REGION_FIELD(memory_size); + SANITY_CHECK_MEM_REGION_FIELD(userspace_addr); r = -EFAULT; - if (copy_from_user(&kvm_userspace_mem, argp, - sizeof(kvm_userspace_mem))) + if (copy_from_user(&mem, argp, size)) + goto out; + + r = -EINVAL; + if (ioctl == KVM_SET_USER_MEMORY_REGION && + (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS)) goto out; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); + r = kvm_vm_ioctl_set_memory_region(kvm, &mem); break; } case KVM_GET_DIRTY_LOG: { -- cgit v1.2.3 From 16f95f3b95caded251a0440051e44a2fbe9e5f55 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:21:51 -0700 Subject: KVM: Add KVM_EXIT_MEMORY_FAULT exit to report faults to userspace Add a new KVM exit type to allow userspace to handle memory faults that KVM cannot resolve, but that userspace *may* be able to handle (without terminating the guest). KVM will initially use KVM_EXIT_MEMORY_FAULT to report implicit conversions between private and shared memory. 
With guest private memory, there will be two kinds of memory conversions: - explicit conversion: happens when the guest explicitly calls into KVM to map a range (as private or shared) - implicit conversion: happens when the guest attempts to access a gfn that is configured in the "wrong" state (private vs. shared) On x86 (first architecture to support guest private memory), explicit conversions will be reported via KVM_EXIT_HYPERCALL+KVM_HC_MAP_GPA_RANGE, but reporting KVM_EXIT_HYPERCALL for implicit conversions is undesirable as there is (obviously) no hypercall, and there is no guarantee that the guest actually intends to convert between private and shared, i.e. what KVM thinks is an implicit conversion "request" could actually be the result of a guest code bug. KVM_EXIT_MEMORY_FAULT will be used to report memory faults that appear to be implicit conversions. Note! To allow for future possibilities where KVM reports KVM_EXIT_MEMORY_FAULT and fills run->memory_fault on _any_ unresolved fault, KVM returns "-EFAULT" (-1 with errno == EFAULT from userspace's perspective), not '0'! Due to historical baggage within KVM, exiting to userspace with '0' from deep callstacks, e.g. in emulation paths, is infeasible as doing so would require a near-complete overhaul of KVM, whereas KVM already propagates -errno return codes to userspace even when the -errno originated in a low level helper. Report the gpa+size instead of a single gfn even though the initial usage is expected to always report single pages. It's entirely possible, likely even, that KVM will someday support sub-page granularity faults, e.g. Intel's sub-page protection feature allows for additional protections at 128-byte granularity. Link: https://lore.kernel.org/all/20230908222905.1321305-5-amoorthy@google.com Link: https://lore.kernel.org/all/ZQ3AmLO2SYv3DszH@google.com Cc: Anish Moorthy Cc: David Matlack Suggested-by: Sean Christopherson Co-developed-by: Yu Zhang Signed-off-by: Yu Zhang Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Message-Id: <20231027182217.3615211-10-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 41 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 1 + include/linux/kvm_host.h | 11 +++++++++++ include/uapi/linux/kvm.h | 8 ++++++++ 4 files changed, 61 insertions(+) (limited to 'arch') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9edd9e436bab..27d945d5b4e4 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6846,6 +6846,26 @@ array field represents return values. The userspace should update the return values of SBI call before resuming the VCPU. For more details on RISC-V SBI spec refer, https://github.com/riscv/riscv-sbi-doc. +:: + + /* KVM_EXIT_MEMORY_FAULT */ + struct { + __u64 flags; + __u64 gpa; + __u64 size; + } memory_fault; + +KVM_EXIT_MEMORY_FAULT indicates the vCPU has encountered a memory fault that +could not be resolved by KVM. The 'gpa' and 'size' (in bytes) describe the +guest physical address range [gpa, gpa + size) of the fault. The 'flags' field +describes properties of the faulting access that are likely pertinent. +Currently, no flags are defined. + +Note! KVM_EXIT_MEMORY_FAULT is unique among all KVM exit reasons in that it +accompanies a return code of '-1', not '0'!
errno will always be set to EFAULT +or EHWPOISON when KVM exits with KVM_EXIT_MEMORY_FAULT, userspace should assume +kvm_run.exit_reason is stale/undefined for all other error numbers. + :: /* KVM_EXIT_NOTIFY */ @@ -7880,6 +7900,27 @@ This capability is aimed to mitigate the threat that malicious VMs can cause CPU stuck (due to event windows don't open up) and make the CPU unavailable to host or other VMs. +7.34 KVM_CAP_MEMORY_FAULT_INFO +------------------------------ + +:Architectures: x86 +:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP. + +The presence of this capability indicates that KVM_RUN will fill +kvm_run.memory_fault if KVM cannot resolve a guest page fault VM-Exit, e.g. if +there is a valid memslot but no backing VMA for the corresponding host virtual +address. + +The information in kvm_run.memory_fault is valid if and only if KVM_RUN returns +an error with errno=EFAULT or errno=EHWPOISON *and* kvm_run.exit_reason is set +to KVM_EXIT_MEMORY_FAULT. + +Note: Userspaces which attempt to resolve memory faults so that they can retry +KVM_RUN are encouraged to guard against repeatedly receiving the same +error/annotated fault. + +See KVM_EXIT_MEMORY_FAULT for more information. + 8. Other capabilities. ====================== diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7b389f27dffc..8f9d8939b63b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4625,6 +4625,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP: case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: case KVM_CAP_IRQFD_RESAMPLE: + case KVM_CAP_MEMORY_FAULT_INFO: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4e741ff27af3..96aa930536b1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2327,4 +2327,15 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) /* Max number of entries allowed for each kvm dirty ring */ #define KVM_DIRTY_RING_MAX_ENTRIES 65536 +static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, + gpa_t gpa, gpa_t size) +{ + vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; + vcpu->run->memory_fault.gpa = gpa; + vcpu->run->memory_fault.size = size; + + /* Flags are not (yet) defined or communicated to userspace. */ + vcpu->run->memory_fault.flags = 0; +} + #endif diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 308cc70bd6ab..59010a685007 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -275,6 +275,7 @@ struct kvm_xen_exit { #define KVM_EXIT_RISCV_CSR 36 #define KVM_EXIT_NOTIFY 37 #define KVM_EXIT_LOONGARCH_IOCSR 38 +#define KVM_EXIT_MEMORY_FAULT 39 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -528,6 +529,12 @@ struct kvm_run { #define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) __u32 flags; } notify; + /* KVM_EXIT_MEMORY_FAULT */ + struct { + __u64 flags; + __u64 gpa; + __u64 size; + } memory_fault; /* Fix the size of the union. 
*/ char padding[256]; }; @@ -1212,6 +1219,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 #define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230 #define KVM_CAP_USER_MEMORY2 231 +#define KVM_CAP_MEMORY_FAULT_INFO 232 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From ee605e31563348f44d2ff0b6b74d33df69dd535c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:00 -0700 Subject: KVM: x86: "Reset" vcpu->run->exit_reason early in KVM_RUN Initialize run->exit_reason to KVM_EXIT_UNKNOWN early in KVM_RUN to reduce the probability of exiting to userspace with a stale run->exit_reason that *appears* to be valid. To support fd-based guest memory (guest memory without a corresponding userspace virtual address), KVM will exit to userspace for various memory related errors, which userspace *may* be able to resolve, instead of using e.g. BUS_MCEERR_AR. And in the more distant future, KVM will also likely utilize the same functionality to let userspace "intercept" and handle memory faults when the userspace mapping is missing, i.e. when fast gup() fails. Because many of KVM's internal APIs related to guest memory use '0' to indicate "success, continue on" and not "exit to userspace", reporting memory faults/errors to userspace will set run->exit_reason and corresponding fields in the run structure in conjunction with a non-zero, negative return code, e.g. -EFAULT or -EHWPOISON. And because KVM already returns -EFAULT in many paths, there's a relatively high probability that KVM could return -EFAULT without setting run->exit_reason, in which case reporting KVM_EXIT_UNKNOWN is much better than reporting whatever exit reason happened to be in the run structure. Note, KVM must wait until after run->immediate_exit is serviced to sanitize run->exit_reason as KVM's ABI is that run->exit_reason is preserved across KVM_RUN when run->immediate_exit is true. Link: https://lore.kernel.org/all/20230908222905.1321305-1-amoorthy@google.com Link: https://lore.kernel.org/all/ZFFbwOXZ5uI%2Fgdaf@google.com Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-19-seanjc@google.com> Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f9d8939b63b..f661acb01c58 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11082,6 +11082,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) { int r; + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->arch.l1tf_flush_l1d = true; for (;;) { -- cgit v1.2.3 From 90b4fe17981e155432c4dbc490606d0c2e9c2199 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:22:01 -0700 Subject: KVM: x86: Disallow hugepages when memory attributes are mixed Disallow creating hugepages with mixed memory attributes, e.g. shared versus private, as mapping a hugepage in this case would allow the guest to access memory with the wrong attributes, e.g. overlaying private memory with a shared hugepage. Track whether or not attributes are mixed via the existing disallow_lpage field, but use the most significant bit in 'disallow_lpage' to indicate a hugepage has mixed attributes instead of using the normal refcounting. Whether or not attributes are mixed is binary; either they are or they aren't.
Attempting to squeeze that info into the refcount is unnecessarily complex as it would require knowing the previous state of the mixed count when updating attributes. Using a flag means KVM just needs to ensure the current status is reflected in the memslots. Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-20-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 + arch/x86/kvm/mmu/mmu.c | 154 +++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 4 ++ 3 files changed, 159 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6f559fb75e6d..fa0d42202405 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1848,6 +1848,9 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); +void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, + struct kvm_memory_slot *slot); + void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b2d916f786ca..f5c6b0643645 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -795,16 +795,26 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } +/* + * The most significant bit in disallow_lpage tracks whether or not memory + * attributes are mixed, i.e. not identical for all gfns at the current level. + * The lower order bits are used to refcount other cases where a hugepage is + * disallowed, e.g. if KVM has shadow a page table at the gfn. 
+ */ +#define KVM_LPAGE_MIXED_FLAG BIT(31) + static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, gfn_t gfn, int count) { struct kvm_lpage_info *linfo; - int i; + int old, i; for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); + + old = linfo->disallow_lpage; linfo->disallow_lpage += count; - WARN_ON_ONCE(linfo->disallow_lpage < 0); + WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG); } } @@ -7176,3 +7186,143 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm) if (kvm->arch.nx_huge_page_recovery_thread) kthread_stop(kvm->arch.nx_huge_page_recovery_thread); } + +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; +} + +static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, int level, unsigned long attrs) +{ + const unsigned long start = gfn; + const unsigned long end = start + KVM_PAGES_PER_HPAGE(level); + + if (level == PG_LEVEL_2M) + return kvm_range_has_memory_attributes(kvm, start, end, attrs); + + for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) { + if (hugepage_test_mixed(slot, gfn, level - 1) || + attrs != kvm_get_memory_attributes(kvm, gfn)) + return false; + } + return true; +} + +bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range) +{ + unsigned long attrs = range->arg.attributes; + struct kvm_memory_slot *slot = range->slot; + int level; + + lockdep_assert_held_write(&kvm->mmu_lock); + lockdep_assert_held(&kvm->slots_lock); + + /* + * Calculate which ranges can be mapped with hugepages even if the slot + * can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over + * a range that has PRIVATE GFNs, and conversely converting a range to + * SHARED may now allow hugepages. + */ + if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) + return false; + + /* + * The sequence matters here: upper levels consume the result of lower + * level's scanning. + */ + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + gfn_t gfn = gfn_round_for_level(range->start, level); + + /* Process the head page if it straddles the range. */ + if (gfn != range->start || gfn + nr_pages > range->end) { + /* + * Skip mixed tracking if the aligned gfn isn't covered + * by the memslot, KVM can't use a hugepage due to the + * misaligned address regardless of memory attributes. + */ + if (gfn >= slot->base_gfn) { + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + gfn += nr_pages; + } + + /* + * Pages entirely covered by the range are guaranteed to have + * only the attributes which were just set. + */ + for ( ; gfn + nr_pages <= range->end; gfn += nr_pages) + hugepage_clear_mixed(slot, gfn, level); + + /* + * Process the last tail page if it straddles the range and is + * contained by the memslot. 
Like the head page, KVM can't + * create a hugepage if the slot size is misaligned. + */ + if (gfn < range->end && + (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) { + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + } + return false; +} + +void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + int level; + + if (!kvm_arch_has_private_mem(kvm)) + return; + + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + /* + * Don't bother tracking mixed attributes for pages that can't + * be huge due to alignment, i.e. process only pages that are + * entirely contained by the memslot. + */ + gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level); + gfn_t start = gfn_round_for_level(slot->base_gfn, level); + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + gfn_t gfn; + + if (start < slot->base_gfn) + start += nr_pages; + + /* + * Unlike setting attributes, every potential hugepage needs to + * be manually checked as the attributes may already be mixed. + */ + for (gfn = start; gfn < end; gfn += nr_pages) { + unsigned long attrs = kvm_get_memory_attributes(kvm, gfn); + + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + } +} +#endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f661acb01c58..e1aad0c81f6f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12728,6 +12728,10 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, } } +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + kvm_mmu_init_memslot_memory_attributes(kvm, slot); +#endif + if (kvm_page_track_create_memslot(kvm, slot, npages)) goto out_free; -- cgit v1.2.3 From 8dd2eee9d526c30fccfe75da7ec5365c6476e510 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:22:02 -0700 Subject: KVM: x86/mmu: Handle page fault for private memory Add support for resolving page faults on guest private memory for VMs that differentiate between "shared" and "private" memory. For such VMs, KVM_MEM_GUEST_MEMFD memslots can include both fd-based private memory and hva-based shared memory, and KVM needs to map in the "correct" variant, i.e. KVM needs to map the gfn shared/private as appropriate based on the current state of the gfn's KVM_MEMORY_ATTRIBUTE_PRIVATE flag. For AMD's SEV-SNP and Intel's TDX, the guest effectively gets to request shared vs. private via a bit in the guest page tables, i.e. what the guest wants may conflict with the current memory attributes. To support such "implicit" conversion requests, exit to user with KVM_EXIT_MEMORY_FAULT to forward the request to userspace. Add a new flag for memory faults, KVM_MEMORY_EXIT_FLAG_PRIVATE, to communicate whether the guest wants to map memory as shared vs. private. Like KVM_MEMORY_ATTRIBUTE_PRIVATE, use bit 3 for flagging private memory so that KVM can use bits 0-2 for capturing RWX behavior if/when userspace needs such information, e.g. a likely user of KVM_EXIT_MEMORY_FAULT is to exit on missing mappings when handling guest page fault VM-Exits. In that case, userspace will want to know RWX information in order to correctly/precisely resolve the fault. Note, private memory *must* be backed by guest_memfd, i.e. shared mappings always come from the host userspace page tables, and private mappings always come from a guest_memfd instance. 
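To sketch the intended userspace flow (illustrative only, not part of this patch): on an implicit conversion exit, a VMM would typically flip the range's attributes and re-enter the guest. The KVM_SET_MEMORY_ATTRIBUTES ioctl and struct kvm_memory_attributes are introduced elsewhere in this series and are assumed here; error handling is omitted.

  #include <errno.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Illustrative sketch: convert the faulting range to the state the
   * guest accessed it as, then let the caller re-run the vCPU. */
  static int handle_implicit_conversion(int vm_fd, struct kvm_run *run)
  {
          struct kvm_memory_attributes attrs = {
                  .address    = run->memory_fault.gpa,
                  .size       = run->memory_fault.size,
                  .attributes = (run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE) ?
                                KVM_MEMORY_ATTRIBUTE_PRIVATE : 0,
          };

          return ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
  }

  /* In the vCPU run loop: note the -1/EFAULT return, not '0'. */
  static int run_vcpu_once(int vm_fd, int vcpu_fd, struct kvm_run *run)
  {
          if (ioctl(vcpu_fd, KVM_RUN, 0) < 0 && errno == EFAULT &&
              run->exit_reason == KVM_EXIT_MEMORY_FAULT)
                  return handle_implicit_conversion(vm_fd, run);

          return 0;
  }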
Co-developed-by: Yu Zhang Signed-off-by: Yu Zhang Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-21-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 8 +++- arch/x86/kvm/mmu/mmu.c | 101 ++++++++++++++++++++++++++++++++++++++-- arch/x86/kvm/mmu/mmu_internal.h | 1 + include/linux/kvm_host.h | 8 +++- include/uapi/linux/kvm.h | 1 + 5 files changed, 110 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 1e61faf02b2a..726c87c35d57 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6952,6 +6952,7 @@ spec refer, https://github.com/riscv/riscv-sbi-doc. /* KVM_EXIT_MEMORY_FAULT */ struct { + #define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) __u64 flags; __u64 gpa; __u64 size; @@ -6960,8 +6961,11 @@ spec refer, https://github.com/riscv/riscv-sbi-doc. KVM_EXIT_MEMORY_FAULT indicates the vCPU has encountered a memory fault that could not be resolved by KVM. The 'gpa' and 'size' (in bytes) describe the guest physical address range [gpa, gpa + size) of the fault. The 'flags' field -describes properties of the faulting access that are likely pertinent. -Currently, no flags are defined. +describes properties of the faulting access that are likely pertinent: + + - KVM_MEMORY_EXIT_FLAG_PRIVATE - When set, indicates the memory fault occurred + on a private memory access. When clear, indicates the fault occurred on a + shared access. Note! KVM_EXIT_MEMORY_FAULT is unique among all KVM exit reasons in that it accompanies a return code of '-1', not '0'! errno will always be set to EFAULT diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f5c6b0643645..754a5aaebee5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3147,9 +3147,9 @@ out: return level; } -int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn, - int max_level) +static int __kvm_mmu_max_mapping_level(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t gfn, int max_level, bool is_private) { struct kvm_lpage_info *linfo; int host_level; @@ -3161,6 +3161,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, break; } + if (is_private) + return max_level; + if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; @@ -3168,6 +3171,16 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, return min(host_level, max_level); } +int kvm_mmu_max_mapping_level(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn, + int max_level) +{ + bool is_private = kvm_slot_can_be_private(slot) && + kvm_mem_is_private(kvm, gfn); + + return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private); +} + void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; @@ -3188,8 +3201,9 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. 
*/ - fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, - fault->gfn, fault->max_level); + fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot, + fault->gfn, fault->max_level, + fault->is_private); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; @@ -4269,6 +4283,55 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL); } +static inline u8 kvm_max_level_for_order(int order) +{ + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); + + KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) && + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) && + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K)); + + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) + return PG_LEVEL_1G; + + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) + return PG_LEVEL_2M; + + return PG_LEVEL_4K; +} + +static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, + PAGE_SIZE, fault->write, fault->exec, + fault->is_private); +} + +static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + int max_order, r; + + if (!kvm_slot_can_be_private(fault->slot)) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return -EFAULT; + } + + r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn, + &max_order); + if (r) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return r; + } + + fault->max_level = min(kvm_max_level_for_order(max_order), + fault->max_level); + fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); + + return RET_PF_CONTINUE; +} + static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; @@ -4301,6 +4364,14 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return RET_PF_EMULATE; } + if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return -EFAULT; + } + + if (fault->is_private) + return kvm_faultin_pfn_private(vcpu, fault); + async = false; fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, fault->write, &fault->map_writable, @@ -7188,6 +7259,26 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm) } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range) +{ + /* + * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only + * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM + * can simply ignore such slots. But if userspace is making memory + * PRIVATE, then KVM must prevent the guest from accessing the memory + * as shared. And if userspace is making memory SHARED and this point + * is reached, then at least one page within the range was previously + * PRIVATE, i.e. the slot's possible hugepage ranges are changing. + * Zapping SPTEs in this case ensures KVM will reassess whether or not + * a hugepage can be used for affected ranges. 
+ */ + if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) + return false; + + return kvm_unmap_gfn_range(kvm, range); +} + static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level) { diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index decc1f153669..86c7cb692786 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -201,6 +201,7 @@ struct kvm_page_fault { /* Derived from mmu and global state. */ const bool is_tdp; + const bool is_private; const bool nx_huge_page_workaround_enabled; /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a6de526c0426..67dfd4d79529 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2357,14 +2357,18 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) #define KVM_DIRTY_RING_MAX_ENTRIES 65536 static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, - gpa_t gpa, gpa_t size) + gpa_t gpa, gpa_t size, + bool is_write, bool is_exec, + bool is_private) { vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; vcpu->run->memory_fault.gpa = gpa; vcpu->run->memory_fault.size = size; - /* Flags are not (yet) defined or communicated to userspace. */ + /* RWX flags are not (yet) defined or communicated to userspace. */ vcpu->run->memory_fault.flags = 0; + if (is_private) + vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2802d10aa88c..8eb10f560c69 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -535,6 +535,7 @@ struct kvm_run { } notify; /* KVM_EXIT_MEMORY_FAULT */ struct { +#define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) __u64 flags; __u64 gpa; __u64 size; -- cgit v1.2.3 From 2333afa17af0f4b6651214ee17cfd5ae5f47787a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:03 -0700 Subject: KVM: Drop superfluous __KVM_VCPU_MULTIPLE_ADDRESS_SPACE macro Drop __KVM_VCPU_MULTIPLE_ADDRESS_SPACE and instead check the value of KVM_ADDRESS_SPACE_NUM. No functional change intended. Reviewed-by: Paolo Bonzini Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-22-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - include/linux/kvm_host.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fa0d42202405..061eec231299 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2136,7 +2136,6 @@ enum { #define HF_SMM_MASK (1 << 1) #define HF_SMM_INSIDE_NMI_MASK (1 << 2) -# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE # define KVM_ADDRESS_SPACE_NUM 2 # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 
1 : 0) # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 67dfd4d79529..db423ea9e3a4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -690,7 +690,7 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm); #define KVM_MEM_SLOTS_NUM SHRT_MAX #define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS) -#ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE +#if KVM_ADDRESS_SPACE_NUM == 1 static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) { return 0; -- cgit v1.2.3 From eed52e434bc33603ddb0af62b6c4ef818948489d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:04 -0700 Subject: KVM: Allow arch code to track number of memslot address spaces per VM Let x86 track the number of address spaces on a per-VM basis so that KVM can disallow SMM memslots for confidential VMs. Confidentials VMs are fundamentally incompatible with emulating SMM, which as the name suggests requires being able to read and write guest memory and register state. Disallowing SMM will simplify support for guest private memory, as KVM will not need to worry about tracking memory attributes for multiple address spaces (SMM is the only "non-default" address space across all architectures). Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-23-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/x86/include/asm/kvm_host.h | 8 +++++++- arch/x86/kvm/debugfs.c | 2 +- arch/x86/kvm/mmu/mmu.c | 6 +++--- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 17 +++++++++++------ virt/kvm/dirty_ring.c | 2 +- virt/kvm/kvm_main.c | 26 ++++++++++++++------------ 8 files changed, 39 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 130bafdb1430..9b0eaa17275a 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -6084,7 +6084,7 @@ static int kvmhv_svm_off(struct kvm *kvm) } srcu_idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { struct kvm_memory_slot *memslot; struct kvm_memslots *slots = __kvm_memslots(kvm, i); int bkt; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 061eec231299..75ab0da06e64 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2136,9 +2136,15 @@ enum { #define HF_SMM_MASK (1 << 1) #define HF_SMM_INSIDE_NMI_MASK (1 << 2) -# define KVM_ADDRESS_SPACE_NUM 2 +# define KVM_MAX_NR_ADDRESS_SPACES 2 # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 
1 : 0) # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) + +static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm) +{ + return KVM_MAX_NR_ADDRESS_SPACES; +} + #else # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) #endif diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index ee8c4c3496ed..42026b3f3ff3 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -111,7 +111,7 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) mutex_lock(&kvm->slots_lock); write_lock(&kvm->mmu_lock); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { int bkt; slots = __kvm_memslots(kvm, i); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 754a5aaebee5..4de7670d5976 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3763,7 +3763,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm) kvm_page_track_write_tracking_enabled(kvm)) goto out_success; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(slot, bkt, slots) { /* @@ -6309,7 +6309,7 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e if (!kvm_memslots_have_rmaps(kvm)) return flush; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { @@ -6806,7 +6806,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) * modifier prior to checking for a wrap of the MMIO generation so * that a wrap in any address space is detected. */ - gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1); + gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1); /* * The very rare case: if the MMIO generation number has wrapped, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e1aad0c81f6f..f521c97f5c64 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12577,7 +12577,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, hva = slot->userspace_addr; } - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { struct kvm_userspace_memory_region2 m; m.slot = id | (i << 16); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index db423ea9e3a4..3ebc6912c54a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -80,8 +80,8 @@ /* Two fragments for cross MMIO pages. 
*/ #define KVM_MAX_MMIO_FRAGMENTS 2 -#ifndef KVM_ADDRESS_SPACE_NUM -#define KVM_ADDRESS_SPACE_NUM 1 +#ifndef KVM_MAX_NR_ADDRESS_SPACES +#define KVM_MAX_NR_ADDRESS_SPACES 1 #endif /* @@ -690,7 +690,12 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm); #define KVM_MEM_SLOTS_NUM SHRT_MAX #define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS) -#if KVM_ADDRESS_SPACE_NUM == 1 +#if KVM_MAX_NR_ADDRESS_SPACES == 1 +static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm) +{ + return KVM_MAX_NR_ADDRESS_SPACES; +} + static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) { return 0; @@ -745,9 +750,9 @@ struct kvm { struct mm_struct *mm; /* userspace tied to this vm */ unsigned long nr_memslot_pages; /* The two memslot sets - active and inactive (per address space) */ - struct kvm_memslots __memslots[KVM_ADDRESS_SPACE_NUM][2]; + struct kvm_memslots __memslots[KVM_MAX_NR_ADDRESS_SPACES][2]; /* The current active memslot set for each address space */ - struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; + struct kvm_memslots __rcu *memslots[KVM_MAX_NR_ADDRESS_SPACES]; struct xarray vcpu_array; /* * Protected by slots_lock, but can be read outside if an @@ -1017,7 +1022,7 @@ void kvm_put_kvm_no_destroy(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { - as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM); + as_id = array_index_nospec(as_id, KVM_MAX_NR_ADDRESS_SPACES); return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu, lockdep_is_held(&kvm->slots_lock) || !refcount_read(&kvm->users_count)); diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index c1cd7dfe4a90..86d267db87bb 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -58,7 +58,7 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) as_id = slot >> 16; id = (u16)slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return; memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8f46d757a2c5..8758cb799e18 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -615,7 +615,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { struct interval_tree_node *node; slots = __kvm_memslots(kvm, i); @@ -1241,7 +1241,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) goto out_err_no_irq_srcu; refcount_set(&kvm->users_count, 1); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { for (j = 0; j < 2; j++) { slots = &kvm->__memslots[i][j]; @@ -1391,7 +1391,7 @@ static void kvm_destroy_vm(struct kvm *kvm) #endif kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { kvm_free_memslots(kvm, &kvm->__memslots[i][0]); kvm_free_memslots(kvm, &kvm->__memslots[i][1]); } @@ -1682,7 +1682,7 @@ static void kvm_swap_active_memslots(struct kvm *kvm, int as_id) * space 0 will use generations 0, 2, 4, ... while address space 1 will * use generations 1, 3, 5, ... 
*/ - gen += KVM_ADDRESS_SPACE_NUM; + gen += kvm_arch_nr_memslot_as_ids(kvm); kvm_arch_memslots_updated(kvm, gen); @@ -2052,7 +2052,7 @@ int __kvm_set_memory_region(struct kvm *kvm, (mem->guest_memfd_offset & (PAGE_SIZE - 1) || mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset)) return -EINVAL; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM) return -EINVAL; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) return -EINVAL; @@ -2188,7 +2188,7 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, as_id = log->slot >> 16; id = (u16)log->slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; slots = __kvm_memslots(kvm, as_id); @@ -2250,7 +2250,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) as_id = log->slot >> 16; id = (u16)log->slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; slots = __kvm_memslots(kvm, as_id); @@ -2362,7 +2362,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm, as_id = log->slot >> 16; id = (u16)log->slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; if (log->first_page & 63) @@ -2493,7 +2493,7 @@ static __always_inline void kvm_handle_gfn_range(struct kvm *kvm, gfn_range.arg = range->arg; gfn_range.may_block = range->may_block; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) { @@ -4848,9 +4848,11 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IRQ_ROUTING: return KVM_MAX_IRQ_ROUTES; #endif -#if KVM_ADDRESS_SPACE_NUM > 1 +#if KVM_MAX_NR_ADDRESS_SPACES > 1 case KVM_CAP_MULTI_ADDRESS_SPACE: - return KVM_ADDRESS_SPACE_NUM; + if (kvm) + return kvm_arch_nr_memslot_as_ids(kvm); + return KVM_MAX_NR_ADDRESS_SPACES; #endif case KVM_CAP_NR_MEMSLOTS: return KVM_USER_MEM_SLOTS; @@ -4958,7 +4960,7 @@ bool kvm_are_all_memslots_empty(struct kvm *kvm) lockdep_assert_held(&kvm->slots_lock); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { if (!kvm_memslots_empty(__kvm_memslots(kvm, i))) return false; } -- cgit v1.2.3 From 89ea60c2c7b5838bf192c50062d5720cd6ab8662 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:05 -0700 Subject: KVM: x86: Add support for "protected VMs" that can utilize private memory Add a new x86 VM type, KVM_X86_SW_PROTECTED_VM, to serve as a development and testing vehicle for Confidential (CoCo) VMs, and potentially to even become a "real" product in the distant future, e.g. a la pKVM. The private memory support in KVM x86 is aimed at AMD's SEV-SNP and Intel's TDX, but those technologies are extremely complex (understatement), difficult to debug, don't support running as nested guests, and require hardware that's isn't universally accessible. I.e. relying SEV-SNP or TDX for maintaining guest private memory isn't a realistic option. 
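For illustration only, and not part of this patch: a userspace VMM could probe the new capability and create a software-protected VM roughly as sketched below. Only KVM_CAP_VM_TYPES, KVM_X86_SW_PROTECTED_VM and the use of the KVM_CREATE_VM argument as the VM type come from this series; the header assumption and error handling are the example's own.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Illustrative only: assumes a <linux/kvm.h> new enough to carry
 * KVM_CAP_VM_TYPES and KVM_X86_SW_PROTECTED_VM from this series.
 */
int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int types, vm;

	if (kvm < 0) {
		perror("/dev/kvm");
		return 1;
	}

	/* KVM_CHECK_EXTENSION returns the bitmap of supported VM types. */
	types = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_VM_TYPES);
	if (types < 0 || !(types & (1 << KVM_X86_SW_PROTECTED_VM))) {
		fprintf(stderr, "software-protected VMs not supported\n");
		return 1;
	}

	/* The KVM_CREATE_VM argument is the machine/VM type. */
	vm = ioctl(kvm, KVM_CREATE_VM, KVM_X86_SW_PROTECTED_VM);
	if (vm < 0) {
		perror("KVM_CREATE_VM");
		return 1;
	}
	printf("created KVM_X86_SW_PROTECTED_VM, fd %d\n", vm);
	return 0;
}

Note that kvm_is_vm_type_supported() in the diff below also requires tdp_enabled, so the probe can legitimately fail on hosts running with shadow paging.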
At the very least, KVM_X86_SW_PROTECTED_VM will enable a variety of selftests for guest_memfd and private memory support without requiring unique hardware. Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Message-Id: <20231027182217.3615211-24-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 32 ++++++++++++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 15 +++++++++------ arch/x86/include/uapi/asm/kvm.h | 3 +++ arch/x86/kvm/Kconfig | 12 ++++++++++++ arch/x86/kvm/mmu/mmu_internal.h | 1 + arch/x86/kvm/x86.c | 16 +++++++++++++++- include/uapi/linux/kvm.h | 1 + virt/kvm/Kconfig | 5 +++++ 8 files changed, 78 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 726c87c35d57..926241e23aeb 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -147,10 +147,29 @@ described as 'basic' will be available. The new VM has no virtual cpus and no memory. You probably want to use 0 as machine type. +X86: +^^^^ + +Supported X86 VM types can be queried via KVM_CAP_VM_TYPES. + +S390: +^^^^^ + In order to create user controlled virtual machines on S390, check KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as privileged user (CAP_SYS_ADMIN). +MIPS: +^^^^^ + +To use hardware assisted virtualization on MIPS (VZ ASE) rather than +the default trap & emulate implementation (which changes the virtual +memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the +flag KVM_VM_MIPS_VZ. + +ARM64: +^^^^^^ + On arm64, the physical address size for a VM (IPA Size limit) is limited to 40bits by default. The limit can be configured if the host supports the extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use @@ -8765,6 +8784,19 @@ block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a 64-bit bitmap (each bit describing a block size). The default value is 0, to disable the eager page splitting. +8.41 KVM_CAP_VM_TYPES +--------------------- + +:Capability: KVM_CAP_MEMORY_ATTRIBUTES +:Architectures: x86 +:Type: system ioctl + +This capability returns a bitmap of support VM types. The 1-setting of bit @n +means the VM type with value @n is supported. Possible values of @n are:: + + #define KVM_X86_DEFAULT_VM 0 + #define KVM_X86_SW_PROTECTED_VM 1 + 9. Known KVM API problems ========================= diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 75ab0da06e64..a565a2e70f30 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1255,6 +1255,7 @@ enum kvm_apicv_inhibit { }; struct kvm_arch { + unsigned long vm_type; unsigned long n_used_mmu_pages; unsigned long n_requested_mmu_pages; unsigned long n_max_mmu_pages; @@ -2089,6 +2090,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); +#ifdef CONFIG_KVM_PRIVATE_MEM +#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM) +#else +#define kvm_arch_has_private_mem(kvm) false +#endif + static inline u16 kvm_read_ldt(void) { u16 ldt; @@ -2137,14 +2144,10 @@ enum { #define HF_SMM_INSIDE_NMI_MASK (1 << 2) # define KVM_MAX_NR_ADDRESS_SPACES 2 +/* SMM is currently unsupported for guests with private memory. */ +# define kvm_arch_nr_memslot_as_ids(kvm) (kvm_arch_has_private_mem(kvm) ? 
1 : 2) # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) - -static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm) -{ - return KVM_MAX_NR_ADDRESS_SPACES; -} - #else # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) #endif diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 1a6a1f987949..a448d0964fc0 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -562,4 +562,7 @@ struct kvm_pmu_event_filter { /* x86-specific KVM_EXIT_HYPERCALL flags. */ #define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) +#define KVM_X86_DEFAULT_VM 0 +#define KVM_X86_SW_PROTECTED_VM 1 + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index e61383674c75..c1716e83d176 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -77,6 +77,18 @@ config KVM_WERROR If in doubt, say "N". +config KVM_SW_PROTECTED_VM + bool "Enable support for KVM software-protected VMs" + depends on EXPERT + depends on X86_64 + select KVM_GENERIC_PRIVATE_MEM + help + Enable support for KVM software-protected VMs. Currently "protected" + means the VM can be backed with memory provided by + KVM_CREATE_GUEST_MEMFD. + + If unsure, say "N". + config KVM_INTEL tristate "KVM for Intel (and compatible) processors support" depends on KVM && IA32_FEAT_CTL diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 86c7cb692786..b66a7d47e0e4 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -297,6 +297,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, .goal_level = PG_LEVEL_4K, + .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT), }; int r; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f521c97f5c64..6d0772b47041 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4548,6 +4548,13 @@ static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, return 0; } +static bool kvm_is_vm_type_supported(unsigned long type) +{ + return type == KVM_X86_DEFAULT_VM || + (type == KVM_X86_SW_PROTECTED_VM && + IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_enabled); +} + int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r = 0; @@ -4739,6 +4746,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X86_NOTIFY_VMEXIT: r = kvm_caps.has_notify_vmexit; break; + case KVM_CAP_VM_TYPES: + r = BIT(KVM_X86_DEFAULT_VM); + if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM)) + r |= BIT(KVM_X86_SW_PROTECTED_VM); + break; default: break; } @@ -12436,9 +12448,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) int ret; unsigned long flags; - if (type) + if (!kvm_is_vm_type_supported(type)) return -EINVAL; + kvm->arch.vm_type = type; + ret = kvm_page_track_init(kvm); if (ret) goto out; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 8eb10f560c69..e9cb2df67a1d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1227,6 +1227,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_MEMORY_FAULT_INFO 232 #define KVM_CAP_MEMORY_ATTRIBUTES 233 #define KVM_CAP_GUEST_MEMFD 234 +#define KVM_CAP_VM_TYPES 235 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 08afef022db9..2c964586aa14 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ 
-104,3 +104,8 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES config KVM_PRIVATE_MEM select XARRAY_MULTI bool + +config KVM_GENERIC_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES + select KVM_PRIVATE_MEM + bool -- cgit v1.2.3 From d4fbbb26da520e00d87c8187dc3de9eacee66c1c Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 27 Nov 2023 11:17:31 +0000 Subject: KVM: arm64: Add new (V)TCR_EL2 field definitions for FEAT_LPA2 As per Arm ARM (0487I.a), (V)TCR_EL2.DS fields control whether 52 bit input and output addresses are supported on 4K and 16K page size configurations when FEAT_LPA2 is known to have been implemented. This adds these field definitions which will be used by KVM when FEAT_LPA2 is enabled. Acked-by: Catalin Marinas Reviewed-by: Oliver Upton Signed-off-by: Ryan Roberts Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231127111737.1897081-7-ryan.roberts@arm.com --- arch/arm64/include/asm/kvm_arm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index b85f46a73e21..312cbc300831 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -108,6 +108,7 @@ #define HCRX_HOST_FLAGS (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En) /* TCR_EL2 Registers bits */ +#define TCR_EL2_DS (1UL << 32) #define TCR_EL2_RES1 ((1U << 31) | (1 << 23)) #define TCR_EL2_TBI (1 << 20) #define TCR_EL2_PS_SHIFT 16 @@ -122,6 +123,7 @@ TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) /* VTCR_EL2 Registers bits */ +#define VTCR_EL2_DS TCR_EL2_DS #define VTCR_EL2_RES1 (1U << 31) #define VTCR_EL2_HD (1 << 22) #define VTCR_EL2_HA (1 << 21) -- cgit v1.2.3 From bd412e2a310cbc43b424198b0065086b0f462625 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 27 Nov 2023 11:17:32 +0000 Subject: KVM: arm64: Use LPA2 page-tables for stage2 and hyp stage1 Implement a simple policy whereby if the HW supports FEAT_LPA2 for the page size we are using, always use LPA2-style page-tables for stage 2 and hyp stage 1 (assuming an nvhe hyp), regardless of the VMM-requested IPA size or HW-implemented PA size. When in use we can now support up to 52-bit IPA and PA sizes. We use the previously created cpu feature to track whether LPA2 is supported for deciding whether to use the LPA2 or classic pte format. Note that FEAT_LPA2 brings support for bigger block mappings (512GB with 4KB, 64GB with 16KB). We explicitly don't enable these in the library because stage2_apply_range() works on batch sizes of the largest used block mapping, and increasing the size of the batch would lead to soft lockups. See commit 5994bc9e05c2 ("KVM: arm64: Limit stage2_apply_range() batch size to largest block"). With the addition of LPA2 support in the hypervisor, the PA size supported by the HW must be capped with a runtime decision, rather than simply using a compile-time decision based on PA_BITS. For example, on a system that advertises 52 bit PA but does not support FEAT_LPA2, A 4KB or 16KB kernel compiled with LPA2 support must still limit the PA size to 48 bits. Therefore, move the insertion of the PS field into TCR_EL2 out of __kvm_hyp_init assembly code and instead do it in cpu_prepare_hyp_mode() where the rest of TCR_EL2 is prepared. This allows us to figure out PS with kvm_get_parange(), which has the appropriate logic to ensure the above requirement. (and the PS field of VTCR_EL2 is already populated this way). 
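Purely for illustration (this code is not part of the patch), the LPA2 output-address encoding introduced by the kvm_pte_to_phys()/kvm_phys_to_pte() changes below can be exercised in isolation. The masks here mirror KVM_PTE_ADDR_MASK_LPA2 and KVM_PTE_ADDR_51_50_LPA2 for 4K pages; GENMASK_ULL() is open-coded so the sketch stands alone.

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: open-coded equivalents of the masks added below. */
#define GENMASK_ULL(h, l)	((~0ULL << (l)) & (~0ULL >> (63 - (h))))
#define PAGE_SHIFT		12			/* assumes 4K pages */
#define PTE_ADDR_MASK_LPA2	GENMASK_ULL(49, PAGE_SHIFT)
#define PTE_ADDR_51_50_LPA2	GENMASK_ULL(9, 8)

static uint64_t phys_to_pte_lpa2(uint64_t pa)
{
	uint64_t pte = pa & PTE_ADDR_MASK_LPA2;

	/* PA[51:50] has no room above bit 49, so it is folded into PTE[9:8]. */
	pte |= ((pa & GENMASK_ULL(51, 50)) >> 50) << 8;
	return pte;
}

static uint64_t pte_to_phys_lpa2(uint64_t pte)
{
	uint64_t pa = pte & PTE_ADDR_MASK_LPA2;

	pa |= ((pte & PTE_ADDR_51_50_LPA2) >> 8) << 50;
	return pa;
}

int main(void)
{
	uint64_t pa = (3ULL << 50) | 0x123456000ULL;	/* a 52-bit PA */
	uint64_t pte = phys_to_pte_lpa2(pa);

	printf("pa   %#llx\npte  %#llx\nback %#llx\n",
	       (unsigned long long)pa, (unsigned long long)pte,
	       (unsigned long long)pte_to_phys_lpa2(pte));
	return 0;
}

This is also why the hyp and stage-2 attribute setup below stops programming the shareability field when LPA2 is enabled: PTE bits [9:8] now carry OA[51:50] rather than SH[1:0], with shareability taken from the (V)TCR controls instead.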
Reviewed-by: Oliver Upton Signed-off-by: Ryan Roberts Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231127111737.1897081-8-ryan.roberts@arm.com --- arch/arm64/include/asm/kvm_pgtable.h | 49 +++++++++++++++++++++++++++--------- arch/arm64/kvm/arm.c | 5 ++++ arch/arm64/kvm/hyp/nvhe/hyp-init.S | 4 --- arch/arm64/kvm/hyp/pgtable.c | 15 ++++++++--- 4 files changed, 54 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 10068500d601..69a2a87ecaf6 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -25,14 +25,24 @@ #define KVM_PGTABLE_MIN_BLOCK_LEVEL 2U #endif -#define kvm_lpa2_is_enabled() false +#define kvm_lpa2_is_enabled() system_supports_lpa2() + +static inline u64 kvm_get_parange_max(void) +{ + if (kvm_lpa2_is_enabled() || + (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && PAGE_SHIFT == 16)) + return ID_AA64MMFR0_EL1_PARANGE_52; + else + return ID_AA64MMFR0_EL1_PARANGE_48; +} static inline u64 kvm_get_parange(u64 mmfr0) { + u64 parange_max = kvm_get_parange_max(); u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); - if (parange > ID_AA64MMFR0_EL1_PARANGE_MAX) - parange = ID_AA64MMFR0_EL1_PARANGE_MAX; + if (parange > parange_max) + parange = parange_max; return parange; } @@ -43,6 +53,8 @@ typedef u64 kvm_pte_t; #define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) #define KVM_PTE_ADDR_51_48 GENMASK(15, 12) +#define KVM_PTE_ADDR_MASK_LPA2 GENMASK(49, PAGE_SHIFT) +#define KVM_PTE_ADDR_51_50_LPA2 GENMASK(9, 8) #define KVM_PHYS_INVALID (-1ULL) @@ -53,21 +65,34 @@ static inline bool kvm_pte_valid(kvm_pte_t pte) static inline u64 kvm_pte_to_phys(kvm_pte_t pte) { - u64 pa = pte & KVM_PTE_ADDR_MASK; - - if (PAGE_SHIFT == 16) - pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; + u64 pa; + + if (kvm_lpa2_is_enabled()) { + pa = pte & KVM_PTE_ADDR_MASK_LPA2; + pa |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, pte) << 50; + } else { + pa = pte & KVM_PTE_ADDR_MASK; + if (PAGE_SHIFT == 16) + pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; + } return pa; } static inline kvm_pte_t kvm_phys_to_pte(u64 pa) { - kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; - - if (PAGE_SHIFT == 16) { - pa &= GENMASK(51, 48); - pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); + kvm_pte_t pte; + + if (kvm_lpa2_is_enabled()) { + pte = pa & KVM_PTE_ADDR_MASK_LPA2; + pa &= GENMASK(51, 50); + pte |= FIELD_PREP(KVM_PTE_ADDR_51_50_LPA2, pa >> 50); + } else { + pte = pa & KVM_PTE_ADDR_MASK; + if (PAGE_SHIFT == 16) { + pa &= GENMASK(51, 48); + pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); + } } return pte; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e5f75f1f1085..c4bbc224549b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1837,6 +1837,7 @@ static int kvm_init_vector_slots(void) static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); + u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); unsigned long tcr; /* @@ -1859,6 +1860,10 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) } tcr &= ~TCR_T0SZ_MASK; tcr |= TCR_T0SZ(hyp_va_bits); + tcr &= ~TCR_EL2_PS_MASK; + tcr |= FIELD_PREP(TCR_EL2_PS_MASK, kvm_get_parange(mmfr0)); + if (kvm_lpa2_is_enabled()) + tcr |= TCR_EL2_DS; params->tcr_el2 = tcr; params->pgd_pa = kvm_mmu_get_httbr(); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 
1cc06e6797bd..f62a7d360285 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -122,11 +122,7 @@ alternative_if ARM64_HAS_CNP alternative_else_nop_endif msr ttbr0_el2, x2 - /* - * Set the PS bits in TCR_EL2. - */ ldr x0, [x0, #NVHE_INIT_TCR_EL2] - tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2 msr tcr_el2, x0 isb diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 1966fdee740e..ce9a58cb02fd 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -79,7 +79,10 @@ static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) static bool kvm_phys_is_valid(u64 phys) { - return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX)); + u64 parange_max = kvm_get_parange_max(); + u8 shift = id_aa64mmfr0_parange_to_phys_shift(parange_max); + + return phys < BIT(shift); } static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) @@ -408,7 +411,8 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) } attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); - attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); + if (!kvm_lpa2_is_enabled()) + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; @@ -654,6 +658,9 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) vtcr |= VTCR_EL2_HA; #endif /* CONFIG_ARM64_HW_AFDBM */ + if (kvm_lpa2_is_enabled()) + vtcr |= VTCR_EL2_DS; + /* Set the vmid bits */ vtcr |= (get_vmid_bits(mmfr1) == 16) ? VTCR_EL2_VS_16BIT : @@ -711,7 +718,9 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p if (prot & KVM_PGTABLE_PROT_W) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; - attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); + if (!kvm_lpa2_is_enabled()) + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); + attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; -- cgit v1.2.3 From 419edf48d79f6fb2cc3fa090131864e95b321d41 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 27 Nov 2023 11:17:33 +0000 Subject: KVM: arm64: Convert translation level parameter to s8 With the introduction of FEAT_LPA2, the Arm ARM adds a new level of translation, level -1, so levels can now be in the range [-1;3]. 3 is always the last level and the first level is determined based on the number of VA bits in use. Convert level variables to use a signed type in preparation for supporting this new level -1. Since the last level is always anchored at 3, and the first level varies to suit the number of VA/IPA bits, take the opportunity to replace KVM_PGTABLE_MAX_LEVELS with the 2 macros KVM_PGTABLE_FIRST_LEVEL and KVM_PGTABLE_LAST_LEVEL. This removes the assumption from the code that levels run from 0 to KVM_PGTABLE_MAX_LEVELS - 1, which will soon no longer be true. 
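Purely as an illustration of why a signed type is needed (this sketch is not part of the patch), the hyp stage-1 start level can be computed for a few VA widths. The page-table-levels formula is open-coded here for 4K pages and should be treated as an assumption of the example.

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: the 4K-page levels formula is an assumption here. */
#define PAGE_SHIFT		12
#define KVM_PGTABLE_LAST_LEVEL	3
#define HW_PGTABLE_LEVELS(va)	(((va) - 4) / (PAGE_SHIFT - 3))

int main(void)
{
	int va_bits[] = { 39, 48, 52 };

	for (unsigned int i = 0; i < sizeof(va_bits) / sizeof(va_bits[0]); i++) {
		int levels = HW_PGTABLE_LEVELS(va_bits[i]);
		int8_t start = KVM_PGTABLE_LAST_LEVEL + 1 - levels;

		/* 39 bits -> level 1, 48 -> level 0, 52 -> level -1 (LPA2). */
		printf("va_bits %2d: %d levels, start level %d\n",
		       va_bits[i], levels, start);
	}
	return 0;
}

An unsigned level type would silently wrap once level -1 becomes reachable, hence the switch to s8 throughout the library.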
Reviewed-by: Oliver Upton Signed-off-by: Ryan Roberts Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231127111737.1897081-9-ryan.roberts@arm.com --- arch/arm64/include/asm/kvm_emulate.h | 2 +- arch/arm64/include/asm/kvm_pgtable.h | 31 ++++++++-------- arch/arm64/include/asm/kvm_pkvm.h | 5 +-- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 6 ++-- arch/arm64/kvm/hyp/nvhe/mm.c | 4 +-- arch/arm64/kvm/hyp/nvhe/setup.c | 2 +- arch/arm64/kvm/hyp/pgtable.c | 66 +++++++++++++++++++---------------- arch/arm64/kvm/mmu.c | 16 +++++---- 8 files changed, 71 insertions(+), 61 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 78a550537b67..13fd9dbf2d1d 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -409,7 +409,7 @@ static __always_inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vc return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_TYPE; } -static __always_inline u8 kvm_vcpu_trap_get_fault_level(const struct kvm_vcpu *vcpu) +static __always_inline s8 kvm_vcpu_trap_get_fault_level(const struct kvm_vcpu *vcpu) { return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_LEVEL; } diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 69a2a87ecaf6..3253828e453d 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -11,7 +11,8 @@ #include #include -#define KVM_PGTABLE_MAX_LEVELS 4U +#define KVM_PGTABLE_FIRST_LEVEL 0 +#define KVM_PGTABLE_LAST_LEVEL 3 /* * The largest supported block sizes for KVM (no 52-bit PA support): @@ -20,9 +21,9 @@ * - 64K (level 2): 512MB */ #ifdef CONFIG_ARM64_4K_PAGES -#define KVM_PGTABLE_MIN_BLOCK_LEVEL 1U +#define KVM_PGTABLE_MIN_BLOCK_LEVEL 1 #else -#define KVM_PGTABLE_MIN_BLOCK_LEVEL 2U +#define KVM_PGTABLE_MIN_BLOCK_LEVEL 2 #endif #define kvm_lpa2_is_enabled() system_supports_lpa2() @@ -103,28 +104,28 @@ static inline kvm_pfn_t kvm_pte_to_pfn(kvm_pte_t pte) return __phys_to_pfn(kvm_pte_to_phys(pte)); } -static inline u64 kvm_granule_shift(u32 level) +static inline u64 kvm_granule_shift(s8 level) { - /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ + /* Assumes KVM_PGTABLE_LAST_LEVEL is 3 */ return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); } -static inline u64 kvm_granule_size(u32 level) +static inline u64 kvm_granule_size(s8 level) { return BIT(kvm_granule_shift(level)); } -static inline bool kvm_level_supports_block_mapping(u32 level) +static inline bool kvm_level_supports_block_mapping(s8 level) { return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL; } static inline u32 kvm_supported_block_sizes(void) { - u32 level = KVM_PGTABLE_MIN_BLOCK_LEVEL; + s8 level = KVM_PGTABLE_MIN_BLOCK_LEVEL; u32 r = 0; - for (; level < KVM_PGTABLE_MAX_LEVELS; level++) + for (; level <= KVM_PGTABLE_LAST_LEVEL; level++) r |= BIT(kvm_granule_shift(level)); return r; @@ -169,7 +170,7 @@ struct kvm_pgtable_mm_ops { void* (*zalloc_page)(void *arg); void* (*zalloc_pages_exact)(size_t size); void (*free_pages_exact)(void *addr, size_t size); - void (*free_unlinked_table)(void *addr, u32 level); + void (*free_unlinked_table)(void *addr, s8 level); void (*get_page)(void *addr); void (*put_page)(void *addr); int (*page_count)(void *addr); @@ -265,7 +266,7 @@ struct kvm_pgtable_visit_ctx { u64 start; u64 addr; u64 end; - u32 level; + s8 level; enum kvm_pgtable_walk_flags flags; }; @@ -368,7 +369,7 @@ static inline bool kvm_pgtable_walk_lock_held(void) */ struct kvm_pgtable { u32 ia_bits; - u32 start_level; + s8 start_level; 
kvm_pteref_t pgd; struct kvm_pgtable_mm_ops *mm_ops; @@ -502,7 +503,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); * The page-table is assumed to be unreachable by any hardware walkers prior to * freeing and therefore no TLB invalidation is performed. */ -void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level); +void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level); /** * kvm_pgtable_stage2_create_unlinked() - Create an unlinked stage-2 paging structure. @@ -526,7 +527,7 @@ void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *p * an ERR_PTR(error) on failure. */ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, - u64 phys, u32 level, + u64 phys, s8 level, enum kvm_pgtable_prot prot, void *mc, bool force_pte); @@ -752,7 +753,7 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, - kvm_pte_t *ptep, u32 *level); + kvm_pte_t *ptep, s8 *level); /** * kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index e46250a02017..ad9cfb5c1ff4 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -56,10 +56,11 @@ static inline unsigned long hyp_vm_table_pages(void) static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) { - unsigned long total = 0, i; + unsigned long total = 0; + int i; /* Provision the worst case scenario */ - for (i = 0; i < KVM_PGTABLE_MAX_LEVELS; i++) { + for (i = KVM_PGTABLE_FIRST_LEVEL; i <= KVM_PGTABLE_LAST_LEVEL; i++) { nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE); total += nr_pages; } diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 8d0a5834e883..861c76021a25 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -91,7 +91,7 @@ static void host_s2_put_page(void *addr) hyp_put_page(&host_s2_pool, addr); } -static void host_s2_free_unlinked_table(void *addr, u32 level) +static void host_s2_free_unlinked_table(void *addr, s8 level) { kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level); } @@ -443,7 +443,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) { struct kvm_mem_range cur; kvm_pte_t pte; - u32 level; + s8 level; int ret; hyp_assert_lock_held(&host_mmu.lock); @@ -462,7 +462,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) cur.start = ALIGN_DOWN(addr, granule); cur.end = cur.start + granule; level++; - } while ((level < KVM_PGTABLE_MAX_LEVELS) && + } while ((level <= KVM_PGTABLE_LAST_LEVEL) && !(kvm_level_supports_block_mapping(level) && range_included(&cur, range))); diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 65a7a186d7b2..b01a3d1078a8 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -260,7 +260,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03 */ dsb(ishst); - __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1)); + __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), KVM_PGTABLE_LAST_LEVEL); dsb(ish); isb(); } @@ -275,7 +275,7 @@ static int __create_fixmap_slot_cb(const 
struct kvm_pgtable_visit_ctx *ctx, { struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg); - if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_MAX_LEVELS - 1) + if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_LAST_LEVEL) return -EINVAL; slot->addr = ctx->addr; diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 0d5e0a89ddce..bc58d1b515af 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -181,7 +181,7 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, if (!kvm_pte_valid(ctx->old)) return 0; - if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1)) + if (ctx->level != KVM_PGTABLE_LAST_LEVEL) return -EINVAL; phys = kvm_pte_to_phys(ctx->old); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index ce9a58cb02fd..744bded18e99 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -101,7 +101,7 @@ static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, return IS_ALIGNED(ctx->addr, granule); } -static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) +static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, s8 level) { u64 shift = kvm_granule_shift(level); u64 mask = BIT(PAGE_SHIFT - 3) - 1; @@ -117,7 +117,7 @@ static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) return (addr & mask) >> shift; } -static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) +static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level) { struct kvm_pgtable pgt = { .ia_bits = ia_bits, @@ -127,9 +127,9 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) return kvm_pgd_page_idx(&pgt, -1ULL) + 1; } -static bool kvm_pte_table(kvm_pte_t pte, u32 level) +static bool kvm_pte_table(kvm_pte_t pte, s8 level) { - if (level == KVM_PGTABLE_MAX_LEVELS - 1) + if (level == KVM_PGTABLE_LAST_LEVEL) return false; if (!kvm_pte_valid(pte)) @@ -157,11 +157,11 @@ static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops return pte; } -static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level) +static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level) { kvm_pte_t pte = kvm_phys_to_pte(pa); - u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE : - KVM_PTE_TYPE_BLOCK; + u64 type = (level == KVM_PGTABLE_LAST_LEVEL) ? 
KVM_PTE_TYPE_PAGE : + KVM_PTE_TYPE_BLOCK; pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI); pte |= FIELD_PREP(KVM_PTE_TYPE, type); @@ -206,11 +206,11 @@ static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker, } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, - struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level); + struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level); static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, struct kvm_pgtable_mm_ops *mm_ops, - kvm_pteref_t pteref, u32 level) + kvm_pteref_t pteref, s8 level) { enum kvm_pgtable_walk_flags flags = data->walker->flags; kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref); @@ -275,12 +275,13 @@ out: } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, - struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level) + struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level) { u32 idx; int ret = 0; - if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS)) + if (WARN_ON_ONCE(level < KVM_PGTABLE_FIRST_LEVEL || + level > KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { @@ -343,7 +344,7 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct leaf_walk_data { kvm_pte_t pte; - u32 level; + s8 level; }; static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, @@ -358,7 +359,7 @@ static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, } int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, - kvm_pte_t *ptep, u32 *level) + kvm_pte_t *ptep, s8 *level) { struct leaf_walk_data data; struct kvm_pgtable_walker walker = { @@ -471,7 +472,7 @@ static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx, if (hyp_map_walker_try_leaf(ctx, data)) return 0; - if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)) + if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL); @@ -567,14 +568,19 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, struct kvm_pgtable_mm_ops *mm_ops) { - u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits); + s8 start_level = KVM_PGTABLE_LAST_LEVEL + 1 - + ARM64_HW_PGTABLE_LEVELS(va_bits); + + if (start_level < KVM_PGTABLE_FIRST_LEVEL || + start_level > KVM_PGTABLE_LAST_LEVEL) + return -EINVAL; pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL); if (!pgt->pgd) return -ENOMEM; pgt->ia_bits = va_bits; - pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; + pgt->start_level = start_level; pgt->mm_ops = mm_ops; pgt->mmu = NULL; pgt->force_pte_cb = NULL; @@ -628,7 +634,7 @@ struct stage2_map_data { u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) { u64 vtcr = VTCR_EL2_FLAGS; - u8 lvls; + s8 lvls; vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT; vtcr |= VTCR_EL2_T0SZ(phys_shift); @@ -911,7 +917,7 @@ static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, { u64 phys = stage2_map_walker_phys_addr(ctx, data); - if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1))) + if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL) return false; return kvm_block_mapping_supported(ctx, phys); @@ -990,7 +996,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (ret != -E2BIG) return ret; - if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)) + if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) 
return -EINVAL; if (!data->memcache) @@ -1160,7 +1166,7 @@ struct stage2_attr_data { kvm_pte_t attr_set; kvm_pte_t attr_clr; kvm_pte_t pte; - u32 level; + s8 level; }; static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, @@ -1203,7 +1209,7 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, u64 size, kvm_pte_t attr_set, kvm_pte_t attr_clr, kvm_pte_t *orig_pte, - u32 *level, enum kvm_pgtable_walk_flags flags) + s8 *level, enum kvm_pgtable_walk_flags flags) { int ret; kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; @@ -1305,7 +1311,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot) { int ret; - u32 level; + s8 level; kvm_pte_t set = 0, clr = 0; if (prot & KVM_PTE_LEAF_ATTR_HI_SW) @@ -1358,7 +1364,7 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) } kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, - u64 phys, u32 level, + u64 phys, s8 level, enum kvm_pgtable_prot prot, void *mc, bool force_pte) { @@ -1416,7 +1422,7 @@ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, * fully populated tree up to the PTE entries. Note that @level is * interpreted as in "level @level entry". */ -static int stage2_block_get_nr_page_tables(u32 level) +static int stage2_block_get_nr_page_tables(s8 level) { switch (level) { case 1: @@ -1427,7 +1433,7 @@ static int stage2_block_get_nr_page_tables(u32 level) return 0; default: WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL || - level >= KVM_PGTABLE_MAX_LEVELS); + level > KVM_PGTABLE_LAST_LEVEL); return -EINVAL; }; } @@ -1440,13 +1446,13 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu; kvm_pte_t pte = ctx->old, new, *childp; enum kvm_pgtable_prot prot; - u32 level = ctx->level; + s8 level = ctx->level; bool force_pte; int nr_pages; u64 phys; /* No huge-pages exist at the last level */ - if (level == KVM_PGTABLE_MAX_LEVELS - 1) + if (level == KVM_PGTABLE_LAST_LEVEL) return 0; /* We only split valid block mappings */ @@ -1523,7 +1529,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, u64 vtcr = mmu->vtcr; u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); - u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz); @@ -1546,7 +1552,7 @@ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) { u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); - u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; } @@ -1582,7 +1588,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) pgt->pgd = NULL; } -void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) +void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) { kvm_pteref_t ptep = (kvm_pteref_t)pgtable; struct kvm_pgtable_walker walker = { diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d87c8fcc4c24..986a2e6fb900 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -223,12 +223,12 @@ static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) { struct page *page = 
container_of(head, struct page, rcu_head); void *pgtable = page_to_virt(page); - u32 level = page_private(page); + s8 level = page_private(page); kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level); } -static void stage2_free_unlinked_table(void *addr, u32 level) +static void stage2_free_unlinked_table(void *addr, s8 level) { struct page *page = virt_to_page(addr); @@ -804,13 +804,13 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) struct kvm_pgtable pgt = { .pgd = (kvm_pteref_t)kvm->mm->pgd, .ia_bits = vabits_actual, - .start_level = (KVM_PGTABLE_MAX_LEVELS - - CONFIG_PGTABLE_LEVELS), + .start_level = (KVM_PGTABLE_LAST_LEVEL - + CONFIG_PGTABLE_LEVELS + 1), .mm_ops = &kvm_user_mm_ops, }; unsigned long flags; kvm_pte_t pte = 0; /* Keep GCC quiet... */ - u32 level = ~0; + s8 level = S8_MAX; int ret; /* @@ -829,7 +829,9 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) * Not seeing an error, but not updating level? Something went * deeply wrong... */ - if (WARN_ON(level >= KVM_PGTABLE_MAX_LEVELS)) + if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) + return -EFAULT; + if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) return -EFAULT; /* Oops, the userspace PTs are gone... Replay the fault */ @@ -1388,7 +1390,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); - unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); + s8 fault_level = kvm_vcpu_trap_get_fault_level(vcpu); long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; -- cgit v1.2.3 From 0abc1b11a032199bb134fd25cd7ee0cdb26b7b03 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 27 Nov 2023 11:17:34 +0000 Subject: KVM: arm64: Support up to 5 levels of translation in kvm_pgtable FEAT_LPA2 increases the maximum levels of translation from 4 to 5 for the 4KB page case, when IA is >48 bits. While we can still use 4 levels for stage2 translation in this case (due to stage2 allowing concatenated page tables for first level lookup), the same kvm_pgtable library is used for the hyp stage1 page tables and stage1 does not support concatenation. Therefore, modify the library to support up to 5 levels. Previous patches already laid the groundwork for this by refactoring code to work in terms of KVM_PGTABLE_FIRST_LEVEL and KVM_PGTABLE_LAST_LEVEL. So we just need to change these macros. The hardware sometimes encodes the new level differently from the others: One such place is when reading the level from the FSC field in the ESR_EL2 register. We never expect to see the lowest level (-1) here since the stage 2 page tables always use concatenated tables for first level lookup and therefore only use 4 levels of lookup. So we get away with just adding a comment to explain why we are not being careful about decoding level -1. For stage2 VTCR_EL2.SL2 is introduced to encode the new start level. However, since we always use concatenated page tables for first level look up at stage2 (and therefore we will never need the new extra level) we never touch this new field. 
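For illustration only, not part of the patch: with level -1 in play, the input-address range covered by a single table entry at each level for 4K pages works out as below. The level-to-shift relation is open-coded and is an assumption of the example; it mirrors what kvm_granule_shift() relies on.

#include <stdio.h>

/* Illustrative only: open-coded level-to-shift relation for 4K pages. */
#define PAGE_SHIFT		12
#define LEVEL_SHIFT(level)	((PAGE_SHIFT - 3) * (4 - (level)) + 3)

int main(void)
{
	for (int level = -1; level <= 3; level++) {
		/* -1: 256TB, 0: 512GB, 1: 1GB, 2: 2MB, 3: 4KB per entry */
		printf("level %2d: one entry covers 2^%d bytes\n",
		       level, LEVEL_SHIFT(level));
	}
	return 0;
}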
Reviewed-by: Oliver Upton Signed-off-by: Ryan Roberts Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231127111737.1897081-10-ryan.roberts@arm.com --- arch/arm64/include/asm/kvm_emulate.h | 10 ++++++++++ arch/arm64/include/asm/kvm_pgtable.h | 2 +- arch/arm64/kvm/hyp/pgtable.c | 9 +++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 13fd9dbf2d1d..d4f1e9cdd554 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -411,6 +411,16 @@ static __always_inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vc static __always_inline s8 kvm_vcpu_trap_get_fault_level(const struct kvm_vcpu *vcpu) { + /* + * Note: With the introduction of FEAT_LPA2 an extra level of + * translation (level -1) is added. This level (obviously) doesn't + * follow the previous convention of encoding the 4 levels in the 2 LSBs + * of the FSC so this function breaks if the fault is for level -1. + * + * However, stage2 tables always use concatenated tables for first level + * lookup and therefore it is guaranteed that the level will be between + * 0 and 3, and this function continues to work. + */ return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_LEVEL; } diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 3253828e453d..cfdf40f734b1 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -11,7 +11,7 @@ #include #include -#define KVM_PGTABLE_FIRST_LEVEL 0 +#define KVM_PGTABLE_FIRST_LEVEL -1 #define KVM_PGTABLE_LAST_LEVEL 3 /* diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 744bded18e99..c651df904fe3 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -645,6 +645,15 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) lvls = stage2_pgtable_levels(phys_shift); if (lvls < 2) lvls = 2; + + /* + * When LPA2 is enabled, the HW supports an extra level of translation + * (for 5 in total) when using 4K pages. It also introduces VTCR_EL2.SL2 + * to as an addition to SL0 to enable encoding this extra start level. + * However, since we always use concatenated pages for the first level + * lookup, we will never need this extra level and therefore do not need + * to touch SL2. + */ vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); #ifdef CONFIG_ARM64_HW_AFDBM -- cgit v1.2.3 From d782ac5b2ceebee5d374e13e990655d1a140d3a6 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 27 Nov 2023 11:17:35 +0000 Subject: KVM: arm64: Allow guests with >48-bit IPA size on FEAT_LPA2 systems With all the page-table infrastructure in place, we can finally increase the maximum permisable IPA size to 52-bits on 4KB and 16KB page systems that have FEAT_LPA2. Reviewed-by: Oliver Upton Signed-off-by: Ryan Roberts Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231127111737.1897081-11-ryan.roberts@arm.com --- arch/arm64/kvm/reset.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 5bb4de162cab..68d1d05672bd 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -280,12 +280,11 @@ int __init kvm_set_ipa_limit(void) parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); /* - * IPA size beyond 48 bits could not be supported - * on either 4K or 16K page size. 
Hence let's cap - * it to 48 bits, in case it's reported as larger - * on the system. + * IPA size beyond 48 bits for 4K and 16K page size is only supported + * when LPA2 is available. So if we have LPA2, enable it, else cap to 48 + * bits, in case it's reported as larger on the system. */ - if (PAGE_SIZE != SZ_64K) + if (!kvm_lpa2_is_enabled() && PAGE_SIZE != SZ_64K) parange = min(parange, (unsigned int)ID_AA64MMFR0_EL1_PARANGE_48); /* -- cgit v1.2.3 From 7b0dd9430cf0c1ae19645d2a6608a5fb57faffe4 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Wed, 13 Sep 2023 20:42:12 +0800 Subject: KVM: x86: Consolidate flags for __linearize() Consolidate @write and @fetch of __linearize() into a set of flags so that additional flags can be added without needing more/new boolean parameters, to precisely identify the access type. No functional change intended. Signed-off-by: Binbin Wu Reviewed-by: Chao Gao Acked-by: Kai Huang Tested-by: Xuelian Guo Link: https://lore.kernel.org/r/20230913124227.12574-2-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 21 +++++++++++---------- arch/x86/kvm/kvm_emulate.h | 4 ++++ 2 files changed, 15 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2673cd5c46cb..87ee1802166a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -687,8 +687,8 @@ static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size) static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, unsigned *max_size, unsigned size, - bool write, bool fetch, - enum x86emul_mode mode, ulong *linear) + enum x86emul_mode mode, ulong *linear, + unsigned int flags) { struct desc_struct desc; bool usable; @@ -717,11 +717,11 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, if (!usable) goto bad; /* code segment in protected mode or read-only data segment */ - if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) - || !(desc.type & 2)) && write) + if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) || !(desc.type & 2)) && + (flags & X86EMUL_F_WRITE)) goto bad; /* unreadable code segment */ - if (!fetch && (desc.type & 8) && !(desc.type & 2)) + if (!(flags & X86EMUL_F_FETCH) && (desc.type & 8) && !(desc.type & 2)) goto bad; lim = desc_limit_scaled(&desc); if (!(desc.type & 8) && (desc.type & 4)) { @@ -757,8 +757,8 @@ static int linearize(struct x86_emulate_ctxt *ctxt, ulong *linear) { unsigned max_size; - return __linearize(ctxt, addr, &max_size, size, write, false, - ctxt->mode, linear); + return __linearize(ctxt, addr, &max_size, size, ctxt->mode, linear, + write ? X86EMUL_F_WRITE : 0); } static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst) @@ -771,7 +771,8 @@ static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst) if (ctxt->op_bytes != sizeof(unsigned long)) addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1); - rc = __linearize(ctxt, addr, &max_size, 1, false, true, ctxt->mode, &linear); + rc = __linearize(ctxt, addr, &max_size, 1, ctxt->mode, &linear, + X86EMUL_F_FETCH); if (rc == X86EMUL_CONTINUE) ctxt->_eip = addr.ea; return rc; @@ -907,8 +908,8 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) * boundary check itself. Instead, we use max_size to check * against op_size. 
*/ - rc = __linearize(ctxt, addr, &max_size, 0, false, true, ctxt->mode, - &linear); + rc = __linearize(ctxt, addr, &max_size, 0, ctxt->mode, &linear, + X86EMUL_F_FETCH); if (unlikely(rc != X86EMUL_CONTINUE)) return rc; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index be7aeb9b8ea3..e24c8ac7b930 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -88,6 +88,10 @@ struct x86_instruction_info { #define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ #define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ +/* x86-specific emulation flags */ +#define X86EMUL_F_WRITE BIT(0) +#define X86EMUL_F_FETCH BIT(1) + struct x86_emulate_ops { void (*vm_bugged)(struct x86_emulate_ctxt *ctxt); /* -- cgit v1.2.3 From 3963c52df42231f72277cd138994ac94f1183d2b Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Wed, 13 Sep 2023 20:42:14 +0800 Subject: KVM: x86: Add an emulation flag for implicit system access Add an emulation flag X86EMUL_F_IMPLICIT to identify implicit system access in instruction emulation. Don't bother wiring up any usage at this point, as Linear Address Space Separation (LASS) will be the first "real" consumer of the flag and LASS support will require dedicated hooks, i.e. there aren't any existing calls where passing X86EMUL_F_IMPLICIT is meaningful. Add the IMPLICIT flag even though there's no imminent usage so that Linear Address Masking (LAM) support can reference the flag to document that addresses for implicit accesses aren't untagged. Signed-off-by: Binbin Wu Tested-by: Xuelian Guo Link: https://lore.kernel.org/r/20230913124227.12574-4-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/kvm_emulate.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index e24c8ac7b930..65fc7ef5ca3d 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -91,6 +91,7 @@ struct x86_instruction_info { /* x86-specific emulation flags */ #define X86EMUL_F_WRITE BIT(0) #define X86EMUL_F_FETCH BIT(1) +#define X86EMUL_F_IMPLICIT BIT(2) struct x86_emulate_ops { void (*vm_bugged)(struct x86_emulate_ctxt *ctxt); -- cgit v1.2.3 From 538ac9a92d669c4ccfc64739a32efab2793cea1d Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Wed, 13 Sep 2023 20:42:15 +0800 Subject: KVM: x86: Add X86EMUL_F_INVLPG and pass it in em_invlpg() Add an emulation flag X86EMUL_F_INVLPG, which is used to identify an instruction that does TLB invalidation without true memory access. Only invlpg & invlpga implemented in emulator belong to this kind. invlpga doesn't need additional information for emulation. Just pass the flag to em_invlpg(). Linear Address Masking (LAM) and Linear Address Space Separation (LASS) don't apply to addresses that are inputs to TLB invalidation. The flag will be consumed to support LAM/LASS virtualization. 
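Illustrative sketch, not part of any patch in this series: the two descriptor permission checks that __linearize() now keys off the flags word, pulled out into a standalone helper to show how X86EMUL_F_WRITE and X86EMUL_F_FETCH replace the old booleans. The struct and helper names are stand-ins rather than emulator API; X86EMUL_F_IMPLICIT and X86EMUL_F_INVLPG are further bits in the same flags space with no read/write semantics of their own.

#include <stdbool.h>

/* Illustrative only: flag values mirror the kvm_emulate.h additions. */
#define X86EMUL_F_WRITE		(1u << 0)
#define X86EMUL_F_FETCH		(1u << 1)
#define X86EMUL_F_IMPLICIT	(1u << 2)
#define X86EMUL_F_INVLPG	(1u << 3)

struct seg_desc { unsigned int type; };	/* stand-in for desc.type */

static bool seg_access_ok(struct seg_desc d, unsigned int flags, bool real_mode)
{
	/* Writes must fail for a code segment outside real mode and for a
	 * read-only data segment. */
	if ((flags & X86EMUL_F_WRITE) &&
	    ((!real_mode && (d.type & 8)) || !(d.type & 2)))
		return false;

	/* Data reads from an execute-only code segment must fail; only
	 * instruction fetches (X86EMUL_F_FETCH) are exempt. */
	if (!(flags & X86EMUL_F_FETCH) && (d.type & 8) && !(d.type & 2))
		return false;

	return true;
}

int main(void)
{
	struct seg_desc code = { .type = 8 };	/* execute-only code segment */

	/* A write to a code segment in protected mode is rejected. */
	return seg_access_ok(code, X86EMUL_F_WRITE, false) ? 1 : 0;
}

em_invlpg() in the diff below passes only X86EMUL_F_INVLPG, i.e. neither of the bits tested here.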
Signed-off-by: Binbin Wu Tested-by: Xuelian Guo Link: https://lore.kernel.org/r/20230913124227.12574-5-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 4 +++- arch/x86/kvm/kvm_emulate.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 87ee1802166a..ceec8c5f9687 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3440,8 +3440,10 @@ static int em_invlpg(struct x86_emulate_ctxt *ctxt) { int rc; ulong linear; + unsigned int max_size; - rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear); + rc = __linearize(ctxt, ctxt->src.addr.mem, &max_size, 1, ctxt->mode, + &linear, X86EMUL_F_INVLPG); if (rc == X86EMUL_CONTINUE) ctxt->ops->invlpg(ctxt, linear); /* Disable writeback. */ diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 65fc7ef5ca3d..8bd9b23543cc 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -92,6 +92,7 @@ struct x86_instruction_info { #define X86EMUL_F_WRITE BIT(0) #define X86EMUL_F_FETCH BIT(1) #define X86EMUL_F_IMPLICIT BIT(2) +#define X86EMUL_F_INVLPG BIT(3) struct x86_emulate_ops { void (*vm_bugged)(struct x86_emulate_ctxt *ctxt); -- cgit v1.2.3 From a130066f74008858ac425b7497d231742474a0ea Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Wed, 13 Sep 2023 20:42:16 +0800 Subject: KVM: x86/mmu: Drop non-PA bits when getting GFN for guest's PGD Drop non-PA bits when getting GFN for guest's PGD with the maximum theoretical mask for guest MAXPHYADDR. Do it unconditionally because it's harmless for 32-bit guests, querying 64-bit mode would be more expensive, and for EPT the mask isn't tied to guest mode. Using PT_BASE_ADDR_MASK would be technically wrong (PAE paging has 64-bit elements _except_ for CR3, which has only 32 valid bits), it wouldn't matter in practice though. Opportunistically use GENMASK_ULL() to define __PT_BASE_ADDR_MASK. Signed-off-by: Binbin Wu Tested-by: Xuelian Guo Link: https://lore.kernel.org/r/20230913124227.12574-6-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/mmu_internal.h | 1 + arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 59b026b6ad2a..73070650b143 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3806,7 +3806,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) hpa_t root; root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu); - root_gfn = root_pgd >> PAGE_SHIFT; + root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT; if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { mmu->root.hpa = kvm_mmu_get_dummy_root(); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index b66a7d47e0e4..0669a8a668ca 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -13,6 +13,7 @@ #endif /* Page table builder macros common to shadow (host) PTEs and guest PTEs. */ +#define __PT_BASE_ADDR_MASK GENMASK_ULL(51, 12) #define __PT_LEVEL_SHIFT(level, bits_per_level) \ (PAGE_SHIFT + ((level) - 1) * (bits_per_level)) #define __PT_INDEX(address, level, bits_per_level) \ diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index c85255073f67..4d4e98fe4f35 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -62,7 +62,7 @@ #endif /* Common logic, but per-type values. 
These also need to be undefined. */ -#define PT_BASE_ADDR_MASK ((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) +#define PT_BASE_ADDR_MASK ((pt_element_t)__PT_BASE_ADDR_MASK) #define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) #define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) #define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS) -- cgit v1.2.3 From 2c49db455ee27c72a680c9e4fad1c12433902ee3 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Wed, 13 Sep 2023 20:42:17 +0800 Subject: KVM: x86: Add & use kvm_vcpu_is_legal_cr3() to check CR3's legality Add and use kvm_vcpu_is_legal_cr3() to check CR3's legality to provide a clear distinction between CR3 and GPA checks. This will allow exempting bits from kvm_vcpu_is_legal_cr3() without affecting general GPA checks, e.g. for upcoming features that will use high bits in CR3 for feature enabling. No functional change intended. Signed-off-by: Binbin Wu Tested-by: Xuelian Guo Link: https://lore.kernel.org/r/20230913124227.12574-7-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.h | 5 +++++ arch/x86/kvm/svm/nested.c | 4 ++-- arch/x86/kvm/vmx/nested.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- 4 files changed, 11 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 0b90532b6e26..5fd3f2d1ca14 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -279,4 +279,9 @@ static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu, vcpu->arch.governed_features.enabled); } +static inline bool kvm_vcpu_is_legal_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + return kvm_vcpu_is_legal_gpa(vcpu, cr3); +} + #endif diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 3fea8c47679e..90ca9489aab6 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -311,7 +311,7 @@ static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) { if (CC(!(save->cr4 & X86_CR4_PAE)) || CC(!(save->cr0 & X86_CR0_PE)) || - CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3))) + CC(!kvm_vcpu_is_legal_cr3(vcpu, save->cr3))) return false; } @@ -520,7 +520,7 @@ static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu) static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_npt, bool reload_pdptrs) { - if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) + if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) return -EINVAL; if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) && diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c5ec0ef51ff7..db61cf8e3128 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1085,7 +1085,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, bool reload_pdptrs, enum vm_entry_failure_code *entry_failure_code) { - if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) { + if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -2912,7 +2912,7 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || - CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) + CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) return -EINVAL; if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || diff --git a/arch/x86/kvm/x86.c 
b/arch/x86/kvm/x86.c index 6d0772b47041..3a648b6e632d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1284,7 +1284,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) * stuff CR3, e.g. for RSM emulation, and there is no guarantee that * the current vCPU mode is accurate. */ - if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) + if (!kvm_vcpu_is_legal_cr3(vcpu, cr3)) return 1; if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3)) @@ -11612,7 +11612,7 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) */ if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) return false; - if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3)) + if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3)) return false; } else { /* -- cgit v1.2.3 From 9c8021d4ae85f1531230fc33653e06e9f1fdb7f1 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Wed, 13 Sep 2023 20:42:18 +0800 Subject: KVM: x86: Remove kvm_vcpu_is_illegal_gpa() Remove kvm_vcpu_is_illegal_gpa() and use !kvm_vcpu_is_legal_gpa() instead. The "illegal" helper actually predates the "legal" helper, the only reason the "illegal" variant wasn't removed by commit 4bda0e97868a ("KVM: x86: Add a helper to check for a legal GPA") was to avoid code churn. Now that CR3 has a dedicated helper, there are fewer callers, and so the code churn isn't that much of a deterrent. No functional change intended. Signed-off-by: Binbin Wu Tested-by: Xuelian Guo Link: https://lore.kernel.org/r/20230913124227.12574-8-binbin.wu@linux.intel.com [sean: provide a bit of history in the changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.h | 5 ----- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 5fd3f2d1ca14..fa04a00090a2 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -47,11 +47,6 @@ static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) return !(gpa & vcpu->arch.reserved_gpa_bits); } -static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return !kvm_vcpu_is_legal_gpa(vcpu, gpa); -} - static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, gpa_t alignment) { diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index db61cf8e3128..51622878d6e4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2717,7 +2717,7 @@ static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) } /* Reserved bits should not be set */ - if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) + if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) return false; /* AD, if set, should be supported */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 40e3780d73ae..69072e418c73 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5782,7 +5782,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * would also use advanced VM-exit information for EPT violations to * reconstruct the page fault error code. 
*/ - if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa))) + if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); -- cgit v1.2.3 From 37a41847b770c722e98ace72f3851fb49b360c08 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Wed, 13 Sep 2023 20:42:19 +0800 Subject: KVM: x86: Introduce get_untagged_addr() in kvm_x86_ops and call it in emulator Introduce a new interface get_untagged_addr() to kvm_x86_ops to untag the metadata from linear address. Call the interface in linearization of instruction emulator for 64-bit mode. When enabled feature like Intel Linear Address Masking (LAM) or AMD Upper Address Ignore (UAI), linear addresses may be tagged with metadata that needs to be dropped prior to canonicality checks, i.e. the metadata is ignored. Introduce get_untagged_addr() to kvm_x86_ops to hide the vendor specific code, as sadly LAM and UAI have different semantics. Pass the emulator flags to allow vendor specific implementation to precisely identify the access type (LAM doesn't untag certain accesses). Signed-off-by: Binbin Wu Reviewed-by: Chao Gao Tested-by: Xuelian Guo Link: https://lore.kernel.org/r/20230913124227.12574-9-binbin.wu@linux.intel.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/emulate.c | 2 +- arch/x86/kvm/kvm_emulate.h | 3 +++ arch/x86/kvm/x86.c | 10 ++++++++++ 5 files changed, 17 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 26b628d84594..756791665117 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -135,6 +135,7 @@ KVM_X86_OP(msr_filter_changed) KVM_X86_OP(complete_emulated_msr) KVM_X86_OP(vcpu_deliver_sipi_vector) KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); +KVM_X86_OP_OPTIONAL(get_untagged_addr) #undef KVM_X86_OP #undef KVM_X86_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a565a2e70f30..8d042f27d644 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1762,6 +1762,8 @@ struct kvm_x86_ops { * Returns vCPU specific APICv inhibit reasons */ unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu); + + gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); }; struct kvm_x86_nested_ops { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ceec8c5f9687..e223043ef5b2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -701,7 +701,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, *max_size = 0; switch (mode) { case X86EMUL_MODE_PROT64: - *linear = la; + *linear = la = ctxt->ops->get_untagged_addr(ctxt, la, flags); va_bits = ctxt_virt_addr_bits(ctxt); if (!__is_canonical_address(la, va_bits)) goto bad; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 8bd9b23543cc..e6d149825169 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -230,6 +230,9 @@ struct x86_emulate_ops { int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); + + gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, + unsigned int flags); }; /* Type, 
address-of, and value of an instruction's operand. */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3a648b6e632d..557872c0331a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8445,6 +8445,15 @@ static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt) kvm_vm_bugged(kvm); } +static gva_t emulator_get_untagged_addr(struct x86_emulate_ctxt *ctxt, + gva_t addr, unsigned int flags) +{ + if (!kvm_x86_ops.get_untagged_addr) + return addr; + + return static_call(kvm_x86_get_untagged_addr)(emul_to_vcpu(ctxt), addr, flags); +} + static const struct x86_emulate_ops emulate_ops = { .vm_bugged = emulator_vm_bugged, .read_gpr = emulator_read_gpr, @@ -8489,6 +8498,7 @@ static const struct x86_emulate_ops emulate_ops = { .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, + .get_untagged_addr = emulator_get_untagged_addr, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) -- cgit v1.2.3 From b39bd520a60c667a339e315ce7a3de2f7178f6e3 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Wed, 13 Sep 2023 20:42:20 +0800 Subject: KVM: x86: Untag addresses for LAM emulation where applicable Stub in vmx_get_untagged_addr() and wire up calls from the emulator (via get_untagged_addr()) and "direct" calls from various VM-Exit handlers in VMX where LAM untagging is supposed to be applied. Defer implementing the guts of vmx_get_untagged_addr() to future patches purely to make the changes easier to consume. LAM is active only for 64-bit linear addresses and several types of accesses are exempted. - Cases need to untag address (handled in get_vmx_mem_address()) Operand(s) of VMX instructions and INVPCID. Operand(s) of SGX ENCLS. - Cases LAM doesn't apply to (no change needed) Operand of INVLPG. Linear address in INVPCID descriptor. Linear address in INVVPID descriptor. BASEADDR specified in SECS of ECREATE. Note: - LAM doesn't apply to write to control registers or MSRs - LAM masking is applied before walking page tables, i.e. the faulting linear address in CR2 doesn't contain the metadata. - The guest linear address saved in VMCS doesn't contain metadata. Signed-off-by: Binbin Wu Reviewed-by: Chao Gao Tested-by: Xuelian Guo Link: https://lore.kernel.org/r/20230913124227.12574-10-binbin.wu@linux.intel.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 5 +++++ arch/x86/kvm/vmx/sgx.c | 1 + arch/x86/kvm/vmx/vmx.c | 7 +++++++ arch/x86/kvm/vmx/vmx.h | 2 ++ arch/x86/kvm/x86.c | 4 ++++ 5 files changed, 19 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 51622878d6e4..4ba46e1b29d2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4980,6 +4980,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, else *ret = off; + *ret = vmx_get_untagged_addr(vcpu, *ret, 0); /* Long mode: #GP(0)/#SS(0) if the memory address is in a * non-canonical form. This is the only check on the memory * destination for long mode! @@ -5797,6 +5798,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) vpid02 = nested_get_vpid02(vcpu); switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: + /* + * LAM doesn't apply to addresses that are inputs to TLB + * invalidation. 
+ */ if (!operand.vpid || is_noncanonical_address(operand.gla, vcpu)) return nested_vmx_fail(vcpu, diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 3e822e582497..6fef01e0536e 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -37,6 +37,7 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, if (!IS_ALIGNED(*gva, alignment)) { fault = true; } else if (likely(is_64_bit_mode(vcpu))) { + *gva = vmx_get_untagged_addr(vcpu, *gva, 0); fault = is_noncanonical_address(*gva, vcpu); } else { *gva &= 0xffffffff; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 69072e418c73..d7ff06d77720 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8205,6 +8205,11 @@ static void vmx_vm_destroy(struct kvm *kvm) free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); } +gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags) +{ + return gva; +} + static struct kvm_x86_ops vmx_x86_ops __initdata = { .name = KBUILD_MODNAME, @@ -8345,6 +8350,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .complete_emulated_msr = kvm_complete_insn_gp, .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, + + .get_untagged_addr = vmx_get_untagged_addr, }; static unsigned int vmx_handle_intel_pt_intr(void) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index c2130d2c8e24..45cee1a8bc0a 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -420,6 +420,8 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); +gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); + static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool value) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 557872c0331a..ecfe97aa35c2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13563,6 +13563,10 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) switch (type) { case INVPCID_TYPE_INDIV_ADDR: + /* + * LAM doesn't apply to addresses that are inputs to TLB + * invalidation. + */ if ((!pcid_enabled && (operand.pcid != 0)) || is_noncanonical_address(operand.gla, vcpu)) { kvm_inject_gp(vcpu, 0); -- cgit v1.2.3 From 93d1c9f498a7505e0e0a0198f3b3d7f97fcc5fa6 Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Wed, 13 Sep 2023 20:42:21 +0800 Subject: KVM: x86: Virtualize LAM for supervisor pointer Add support to allow guests to set the new CR4 control bit for LAM and add implementation to get untagged address for supervisor pointers. LAM modifies the canonicality check applied to 64-bit linear addresses for data accesses, allowing software to use of the untranslated address bits for metadata and masks the metadata bits before using them as linear addresses to access memory. LAM uses CR4.LAM_SUP (bit 28) to configure and enable LAM for supervisor pointers. It also changes VMENTER to allow the bit to be set in VMCS's HOST_CR4 and GUEST_CR4 to support virtualization. Note CR4.LAM_SUP is allowed to be set even not in 64-bit mode, but it will not take effect since LAM only applies to 64-bit linear addresses. Move CR4.LAM_SUP out of CR4_RESERVED_BITS, its reservation depends on vcpu supporting LAM or not. 
Leave it intercepted to prevent guest from setting the bit if LAM is not exposed to guest as well as to avoid vmread every time when KVM fetches its value, with the expectation that guest won't toggle the bit frequently. Set CR4.LAM_SUP bit in the emulated IA32_VMX_CR4_FIXED1 MSR for guests to allow guests to enable LAM for supervisor pointers in nested VMX operation. Hardware is not required to do TLB flush when CR4.LAM_SUP toggled, KVM doesn't need to emulate TLB flush based on it. There's no other features or vmx_exec_controls connection, and no other code needed in {kvm,vmx}_set_cr4(). Skip address untag for instruction fetches (which includes branch targets), operand of INVLPG instructions, and implicit system accesses, all of which are not subject to untagging. Note, get_untagged_addr() isn't invoked for implicit system accesses as there is no reason to do so, but check the flag anyways for documentation purposes. Signed-off-by: Robert Hoo Co-developed-by: Binbin Wu Signed-off-by: Binbin Wu Reviewed-by: Chao Gao Reviewed-by: Kai Huang Tested-by: Xuelian Guo Link: https://lore.kernel.org/r/20230913124227.12574-11-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/vmx/vmx.c | 39 ++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/x86.h | 2 ++ 3 files changed, 42 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8d042f27d644..f96988f283d5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -133,7 +133,8 @@ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ - | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP)) + | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \ + | X86_CR4_LAM_SUP)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d7ff06d77720..3bbe5a9362a9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7673,6 +7673,9 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); + entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1); + cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM)); + #undef cr4_fixed1_update } @@ -8205,9 +8208,43 @@ static void vmx_vm_destroy(struct kvm *kvm) free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); } +/* + * Note, the SDM states that the linear address is masked *after* the modified + * canonicality check, whereas KVM masks (untags) the address and then performs + * a "normal" canonicality check. Functionally, the two methods are identical, + * and when the masking occurs relative to the canonicality check isn't visible + * to software, i.e. KVM's behavior doesn't violate the SDM. + */ gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags) { - return gva; + int lam_bit; + + if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG)) + return gva; + + if (!is_64_bit_mode(vcpu)) + return gva; + + /* + * Bit 63 determines if the address should be treated as user address + * or a supervisor address. + */ + if (!(gva & BIT_ULL(63))) { + /* KVM doesn't yet virtualize LAM_U{48,57}. 
*/ + return gva; + } else { + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP)) + return gva; + + lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47; + } + + /* + * Untag the address by sign-extending the lam_bit, but NOT to bit 63. + * Bit 63 is retained from the raw virtual address so that untagging + * doesn't change a user access to a supervisor access, and vice versa. + */ + return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); } static struct kvm_x86_ops vmx_x86_ops __initdata = { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 5184fde1dc54..2f7e19166658 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -530,6 +530,8 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); __reserved_bits |= X86_CR4_VMXE; \ if (!__cpu_has(__c, X86_FEATURE_PCID)) \ __reserved_bits |= X86_CR4_PCIDE; \ + if (!__cpu_has(__c, X86_FEATURE_LAM)) \ + __reserved_bits |= X86_CR4_LAM_SUP; \ __reserved_bits; \ }) -- cgit v1.2.3 From 3098e6eca88e543ea0d190d1fa72b1c047bb3e7d Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Wed, 13 Sep 2023 20:42:22 +0800 Subject: KVM: x86: Virtualize LAM for user pointer Add support to allow guests to set the new CR3 control bits for Linear Address Masking (LAM) and add implementation to get untagged address for user pointers. LAM modifies the canonical check for 64-bit linear addresses, allowing software to use the masked/ignored address bits for metadata. Hardware masks off the metadata bits before using the linear addresses to access memory. LAM uses two new CR3 non-address bits, LAM_U48 (bit 62) and LAM_U57 (bit 61), to configure LAM for user pointers. LAM also changes VMENTER to allow both bits to be set in VMCS's HOST_CR3 and GUEST_CR3 for virtualization. When EPT is on, CR3 is not trapped by KVM and it's up to the guest to set any of the two LAM control bits. However, when EPT is off, the actual CR3 used by the guest is generated from the shadow MMU root which is different from the CR3 that is *set* by the guest, and KVM needs to manually apply any active control bits to VMCS's GUEST_CR3 based on the cached CR3 *seen* by the guest. KVM manually checks guest's CR3 to make sure it points to a valid guest physical address (i.e. to support smaller MAXPHYSADDR in the guest). Extend this check to allow the two LAM control bits to be set. After check, LAM bits of guest CR3 will be stripped off to extract guest physical address. In case of nested, for a guest which supports LAM, both VMCS12's HOST_CR3 and GUEST_CR3 are allowed to have the new LAM control bits set, i.e. when L0 enters L1 to emulate a VMEXIT from L2 to L1 or when L0 enters L2 directly. KVM also manually checks VMCS12's HOST_CR3 and GUEST_CR3 being valid physical address. Extend such check to allow the new LAM control bits too. Note, LAM doesn't have a global control bit to turn on/off LAM completely, but purely depends on hardware's CPUID to determine it can be enabled or not. That means, when EPT is on, even when KVM doesn't expose LAM to guest, the guest can still set LAM control bits in CR3 w/o causing problem. This is an unfortunate virtualization hole. KVM could choose to intercept CR3 in this case and inject fault but this would hurt performance when running a normal VM w/o LAM support. This is undesirable. Just choose to let the guest do such illegal thing as the worst case is guest being killed when KVM eventually find out such illegal behaviour and that the guest is misbehaving. 
Suggested-by: Sean Christopherson Signed-off-by: Robert Hoo Co-developed-by: Binbin Wu Signed-off-by: Binbin Wu Reviewed-by: Kai Huang Reviewed-by: Chao Gao Tested-by: Xuelian Guo Link: https://lore.kernel.org/r/20230913124227.12574-12-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.h | 4 ++++ arch/x86/kvm/mmu.h | 9 +++++++++ arch/x86/kvm/vmx/vmx.c | 12 +++++++++--- 3 files changed, 22 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index fa04a00090a2..e4bd59c15a2c 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -276,6 +276,10 @@ static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu, static inline bool kvm_vcpu_is_legal_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + if (kvm_cpu_cap_has(X86_FEATURE_LAM) && + guest_cpuid_has(vcpu, X86_FEATURE_LAM)) + cr3 &= ~(X86_CR3_LAM_U48 | X86_CR3_LAM_U57); + return kvm_vcpu_is_legal_gpa(vcpu, cr3); } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index bb8c86eefac0..580e2414b88c 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -146,6 +146,15 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu)); } +static inline unsigned long kvm_get_active_cr3_lam_bits(struct kvm_vcpu *vcpu) +{ + if (!kvm_cpu_cap_has(X86_FEATURE_LAM) || + !guest_cpuid_has(vcpu, X86_FEATURE_LAM)) + return 0; + + return kvm_read_cr3(vcpu) & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57); +} + static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) { u64 root_hpa = vcpu->arch.mmu->root.hpa; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3bbe5a9362a9..3825ab316be9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3400,7 +3400,8 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, update_guest_cr3 = false; vmx_ept_load_pdptrs(vcpu); } else { - guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu); + guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) | + kvm_get_active_cr3_lam_bits(vcpu); } if (update_guest_cr3) @@ -8218,6 +8219,7 @@ static void vmx_vm_destroy(struct kvm *kvm) gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags) { int lam_bit; + unsigned long cr3_bits; if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG)) return gva; @@ -8230,8 +8232,12 @@ gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags * or a supervisor address. */ if (!(gva & BIT_ULL(63))) { - /* KVM doesn't yet virtualize LAM_U{48,57}. */ - return gva; + cr3_bits = kvm_get_active_cr3_lam_bits(vcpu); + if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48))) + return gva; + + /* LAM_U48 is ignored if LAM_U57 is set. */ + lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47; } else { if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP)) return gva; -- cgit v1.2.3 From 703d794cb8cb28c07b22c1c845f5c4d4c419aff7 Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Wed, 13 Sep 2023 20:42:23 +0800 Subject: KVM: x86: Advertise and enable LAM (user and supervisor) LAM is enumerated by CPUID.7.1:EAX.LAM[bit 26]. Advertise the feature to userspace and enable it as the final step after the LAM virtualization support for supervisor and user pointers. SGX LAM support is not advertised yet. SGX LAM support is enumerated in SGX's own CPUID and there's no hard requirement that it must be supported when LAM is reported in CPUID leaf 0x7. 
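A quick way to see the end result from inside a guest (or on bare metal) is to read the enumeration bit named above, CPUID.7.1:EAX[26]. The sketch below assumes a GCC/Clang-style <cpuid.h> that provides __get_cpuid_count(); inside a guest it reports whatever CPUID model the VMM chose to expose.

  /* lam_cpuid.c - build with: cc -o lam_cpuid lam_cpuid.c */
  #include <stdio.h>
  #include <cpuid.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          if (!__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx)) {
                  puts("CPUID leaf 0x7 not supported");
                  return 1;
          }

          printf("LAM %s\n", (eax & (1u << 26)) ? "enumerated" : "not enumerated");
          return 0;
  }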
Signed-off-by: Robert Hoo Signed-off-by: Binbin Wu Reviewed-by: Jingqi Liu Reviewed-by: Chao Gao Reviewed-by: Kai Huang Tested-by: Xuelian Guo Link: https://lore.kernel.org/r/20230913124227.12574-13-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index dda6fc4cfae8..40d68fef748a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -671,7 +671,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_7_1_EAX, F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(FZRM) | F(FSRS) | F(FSRC) | - F(AMX_FP16) | F(AVX_IFMA) + F(AMX_FP16) | F(AVX_IFMA) | F(LAM) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, -- cgit v1.2.3 From 183bdd161c2b773a62f01d1c030f5a3a5b7c33b5 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Wed, 13 Sep 2023 20:42:24 +0800 Subject: KVM: x86: Use KVM-governed feature framework to track "LAM enabled" Use the governed feature framework to track if Linear Address Masking (LAM) is "enabled", i.e. if LAM can be used by the guest. Using the framework to avoid the relative expensive call guest_cpuid_has() during cr3 and vmexit handling paths for LAM. No functional change intended. Signed-off-by: Binbin Wu Tested-by: Xuelian Guo Link: https://lore.kernel.org/r/20230913124227.12574-14-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.h | 3 +-- arch/x86/kvm/governed_features.h | 1 + arch/x86/kvm/mmu.h | 3 +-- arch/x86/kvm/vmx/vmx.c | 1 + 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index e4bd59c15a2c..856e3037e74f 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -276,8 +276,7 @@ static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu, static inline bool kvm_vcpu_is_legal_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { - if (kvm_cpu_cap_has(X86_FEATURE_LAM) && - guest_cpuid_has(vcpu, X86_FEATURE_LAM)) + if (guest_can_use(vcpu, X86_FEATURE_LAM)) cr3 &= ~(X86_CR3_LAM_U48 | X86_CR3_LAM_U57); return kvm_vcpu_is_legal_gpa(vcpu, cr3); diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h index 423a73395c10..ad463b1ed4e4 100644 --- a/arch/x86/kvm/governed_features.h +++ b/arch/x86/kvm/governed_features.h @@ -16,6 +16,7 @@ KVM_GOVERNED_X86_FEATURE(PAUSEFILTER) KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD) KVM_GOVERNED_X86_FEATURE(VGIF) KVM_GOVERNED_X86_FEATURE(VNMI) +KVM_GOVERNED_X86_FEATURE(LAM) #undef KVM_GOVERNED_X86_FEATURE #undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 580e2414b88c..60f21bb4c27b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -148,8 +148,7 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) static inline unsigned long kvm_get_active_cr3_lam_bits(struct kvm_vcpu *vcpu) { - if (!kvm_cpu_cap_has(X86_FEATURE_LAM) || - !guest_cpuid_has(vcpu, X86_FEATURE_LAM)) + if (!guest_can_use(vcpu, X86_FEATURE_LAM)) return 0; return kvm_read_cr3(vcpu) & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3825ab316be9..d30df9b3fe3e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7763,6 +7763,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES); kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX); + kvm_governed_feature_check_and_set(vcpu, 
X86_FEATURE_LAM); vmx_setup_uret_msrs(vmx); -- cgit v1.2.3 From 0277022a77a56f8251e8cf8d25e9308478e79ea5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Oct 2023 12:23:25 -0700 Subject: KVM: x86/mmu: Declare flush_remote_tlbs{_range}() hooks iff HYPERV!=n Declare the kvm_x86_ops hooks used to wire up paravirt TLB flushes when running under Hyper-V if and only if CONFIG_HYPERV!=n. Wrapping yet more code with IS_ENABLED(CONFIG_HYPERV) eliminates a handful of conditional branches, and makes it super obvious why the hooks *might* be valid. Cc: Vitaly Kuznetsov Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20231018192325.1893896-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-ops.h | 2 ++ arch/x86/include/asm/kvm_host.h | 12 ++++++++++++ arch/x86/kvm/mmu/mmu.c | 12 ++++-------- 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 26b628d84594..f482216bbdb8 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -55,8 +55,10 @@ KVM_X86_OP(set_rflags) KVM_X86_OP(get_if_flag) KVM_X86_OP(flush_tlb_all) KVM_X86_OP(flush_tlb_current) +#if IS_ENABLED(CONFIG_HYPERV) KVM_X86_OP_OPTIONAL(flush_remote_tlbs) KVM_X86_OP_OPTIONAL(flush_remote_tlbs_range) +#endif KVM_X86_OP(flush_tlb_gva) KVM_X86_OP(flush_tlb_guest) KVM_X86_OP(vcpu_pre_run) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a565a2e70f30..5e5e9e0abd4a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1614,9 +1614,11 @@ struct kvm_x86_ops { void (*flush_tlb_all)(struct kvm_vcpu *vcpu); void (*flush_tlb_current)(struct kvm_vcpu *vcpu); +#if IS_ENABLED(CONFIG_HYPERV) int (*flush_remote_tlbs)(struct kvm *kvm); int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages); +#endif /* * Flush any TLB entries associated with the given GVA. 
@@ -1825,6 +1827,7 @@ static inline struct kvm *kvm_arch_alloc_vm(void) #define __KVM_HAVE_ARCH_VM_FREE void kvm_arch_free_vm(struct kvm *kvm); +#if IS_ENABLED(CONFIG_HYPERV) #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { @@ -1836,6 +1839,15 @@ static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) } #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE +static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, + u64 nr_pages) +{ + if (!kvm_x86_ops.flush_remote_tlbs_range) + return -EOPNOTSUPP; + + return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages); +} +#endif /* CONFIG_HYPERV */ #define kvm_arch_pmi_in_guest(vcpu) \ ((vcpu) && (vcpu)->arch.handling_intr_from_guest) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 59b026b6ad2a..8531480e5da4 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -271,15 +271,11 @@ static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, static inline bool kvm_available_flush_remote_tlbs_range(void) { +#if IS_ENABLED(CONFIG_HYPERV) return kvm_x86_ops.flush_remote_tlbs_range; -} - -int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) -{ - if (!kvm_x86_ops.flush_remote_tlbs_range) - return -EOPNOTSUPP; - - return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages); +#else + return false; +#endif } static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index); -- cgit v1.2.3 From 11e5ea5242e38d44fcede879473566bb6d68f954 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 28 Nov 2023 15:04:01 +0100 Subject: KVM: arm64: Use helpers to classify exception types reported via ESR Currently, we rely on the fact that exceptions can be trivially classified by applying a mask/value pair to the syndrome value reported via the ESR register, but this will no longer be true once we enable support for 5 level paging. So introduce a couple of helpers that encapsulate this mask/value pair matching, and wire them up in the code. No functional change intended, the actual handling of translation level -1 will be added in a subsequent patch. 
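The classification itself is nothing more than a mask/value compare on the fault status code. The stand-alone model below mirrors the new helpers; note that the ESR_ELx_FSC_* values are filled in here from the conventional arm64 encoding (fault status code in ESR[5:0], lookup level in the low two bits) as an assumption, since the hunks below only reference them by name.

  /* esr_fsc_sketch.c - build with: cc -o esr_fsc_sketch esr_fsc_sketch.c */
  #include <stdio.h>

  #define ESR_ELx_FSC_TYPE    0x3c    /* fault class with level bits masked off */
  #define ESR_ELx_FSC_LEVEL   0x03
  #define ESR_ELx_FSC_FAULT   0x04    /* translation fault, level 0 */
  #define ESR_ELx_FSC_ACCESS  0x08    /* access flag fault, level 0 */
  #define ESR_ELx_FSC_PERM    0x0c    /* permission fault, level 0 */

  static int esr_fsc_is_translation_fault(unsigned long esr)
  {
          return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT;
  }

  static int esr_fsc_is_permission_fault(unsigned long esr)
  {
          return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM;
  }

  static int esr_fsc_is_access_flag_fault(unsigned long esr)
  {
          return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS;
  }

  int main(void)
  {
          unsigned long esr = ESR_ELx_FSC_PERM | 3;   /* permission fault at level 3 */

          printf("translation=%d permission=%d access=%d level=%lu\n",
                 esr_fsc_is_translation_fault(esr),
                 esr_fsc_is_permission_fault(esr),
                 esr_fsc_is_access_flag_fault(esr),
                 esr & ESR_ELx_FSC_LEVEL);
          return 0;
  }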
Cc: Catalin Marinas Cc: Will Deacon Cc: Marc Zyngier Cc: Oliver Upton Cc: Ryan Roberts Signed-off-by: Ard Biesheuvel Acked-by: Mark Rutland [maz: folded in changes suggested by Mark] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231128140400.3132145-2-ardb@google.com --- arch/arm64/include/asm/esr.h | 15 ++++++++++++++ arch/arm64/include/asm/kvm_emulate.h | 36 +++++++++++++++------------------ arch/arm64/kvm/hyp/include/hyp/fault.h | 2 +- arch/arm64/kvm/hyp/include/hyp/switch.h | 2 +- arch/arm64/kvm/mmu.c | 35 ++++++++++++++++---------------- 5 files changed, 50 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index ae35939f395b..353fe08546cf 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -392,6 +392,21 @@ static inline bool esr_is_data_abort(unsigned long esr) return ec == ESR_ELx_EC_DABT_LOW || ec == ESR_ELx_EC_DABT_CUR; } +static inline bool esr_fsc_is_translation_fault(unsigned long esr) +{ + return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT; +} + +static inline bool esr_fsc_is_permission_fault(unsigned long esr) +{ + return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM; +} + +static inline bool esr_fsc_is_access_flag_fault(unsigned long esr) +{ + return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS; +} + const char *esr_get_class_string(unsigned long esr); #endif /* __ASSEMBLY */ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index d4f1e9cdd554..31f13e7d339b 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -404,24 +404,25 @@ static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu) return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC; } -static __always_inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vcpu) +static inline +bool kvm_vcpu_trap_is_permission_fault(const struct kvm_vcpu *vcpu) { - return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_TYPE; + return esr_fsc_is_permission_fault(kvm_vcpu_get_esr(vcpu)); } -static __always_inline s8 kvm_vcpu_trap_get_fault_level(const struct kvm_vcpu *vcpu) +static inline +bool kvm_vcpu_trap_is_translation_fault(const struct kvm_vcpu *vcpu) { - /* - * Note: With the introduction of FEAT_LPA2 an extra level of - * translation (level -1) is added. This level (obviously) doesn't - * follow the previous convention of encoding the 4 levels in the 2 LSBs - * of the FSC so this function breaks if the fault is for level -1. - * - * However, stage2 tables always use concatenated tables for first level - * lookup and therefore it is guaranteed that the level will be between - * 0 and 3, and this function continues to work. - */ - return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_LEVEL; + return esr_fsc_is_translation_fault(kvm_vcpu_get_esr(vcpu)); +} + +static inline +u64 kvm_vcpu_trap_get_perm_fault_granule(const struct kvm_vcpu *vcpu) +{ + unsigned long esr = kvm_vcpu_get_esr(vcpu); + + BUG_ON(!esr_fsc_is_permission_fault(esr)); + return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(esr & ESR_ELx_FSC_LEVEL)); } static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu) @@ -464,12 +465,7 @@ static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) * first), then a permission fault to allow the flags * to be set. 
*/ - switch (kvm_vcpu_trap_get_fault_type(vcpu)) { - case ESR_ELx_FSC_PERM: - return true; - default: - return false; - } + return kvm_vcpu_trap_is_permission_fault(vcpu); } if (kvm_vcpu_trap_is_iabt(vcpu)) diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 9ddcfe2c3e57..9e13c1bc2ad5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -60,7 +60,7 @@ static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) */ if (!(esr & ESR_ELx_S1PTW) && (cpus_have_final_cap(ARM64_WORKAROUND_834220) || - (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM)) { + esr_fsc_is_permission_fault(esr))) { if (!__translate_far_to_hpfar(far, &hpfar)) return false; } else { diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index f99d8af0b9af..f44fb11307fb 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -591,7 +591,7 @@ static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { bool valid; - valid = kvm_vcpu_trap_get_fault_type(vcpu) == ESR_ELx_FSC_FAULT && + valid = kvm_vcpu_trap_is_translation_fault(vcpu) && kvm_vcpu_dabt_isvalid(vcpu) && !kvm_vcpu_abt_issea(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 986a2e6fb900..d14504821b79 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1376,7 +1376,7 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, - unsigned long fault_status) + bool fault_is_perm) { int ret = 0; bool write_fault, writable, force_pte = false; @@ -1390,17 +1390,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); - s8 fault_level = kvm_vcpu_trap_get_fault_level(vcpu); long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; - fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level); + if (fault_is_perm) + fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); write_fault = kvm_is_write_fault(vcpu); exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); VM_BUG_ON(write_fault && exec_fault); - if (fault_status == ESR_ELx_FSC_PERM && !write_fault && !exec_fault) { + if (fault_is_perm && !write_fault && !exec_fault) { kvm_err("Unexpected L2 read permission error\n"); return -EFAULT; } @@ -1411,8 +1411,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * only exception to this is when dirty logging is enabled at runtime * and a write fault needs to collapse a block entry into a table. */ - if (fault_status != ESR_ELx_FSC_PERM || - (logging_active && write_fault)) { + if (!fault_is_perm || (logging_active && write_fault)) { ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu)); if (ret) @@ -1529,8 +1528,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * backed by a THP and thus use block mapping if possible. 
*/ if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { - if (fault_status == ESR_ELx_FSC_PERM && - fault_granule > PAGE_SIZE) + if (fault_is_perm && fault_granule > PAGE_SIZE) vma_pagesize = fault_granule; else vma_pagesize = transparent_hugepage_adjust(kvm, memslot, @@ -1543,7 +1541,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } } - if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) { + if (!fault_is_perm && !device && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new disallowed VMA */ if (mte_allowed) { sanitise_mte_tags(kvm, pfn, vma_pagesize); @@ -1569,7 +1567,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * permissions only if vma_pagesize equals fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. */ - if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule) + if (fault_is_perm && vma_pagesize == fault_granule) ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); else ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, @@ -1620,7 +1618,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) */ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) { - unsigned long fault_status; + unsigned long esr; phys_addr_t fault_ipa; struct kvm_memory_slot *memslot; unsigned long hva; @@ -1628,12 +1626,12 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) gfn_t gfn; int ret, idx; - fault_status = kvm_vcpu_trap_get_fault_type(vcpu); + esr = kvm_vcpu_get_esr(vcpu); fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); is_iabt = kvm_vcpu_trap_is_iabt(vcpu); - if (fault_status == ESR_ELx_FSC_FAULT) { + if (esr_fsc_is_permission_fault(esr)) { /* Beyond sanitised PARange (which is the IPA limit) */ if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { kvm_inject_size_fault(vcpu); @@ -1668,9 +1666,9 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) kvm_vcpu_get_hfar(vcpu), fault_ipa); /* Check the stage-2 fault is trans. fault or write fault */ - if (fault_status != ESR_ELx_FSC_FAULT && - fault_status != ESR_ELx_FSC_PERM && - fault_status != ESR_ELx_FSC_ACCESS) { + if (!esr_fsc_is_translation_fault(esr) && + !esr_fsc_is_permission_fault(esr) && + !esr_fsc_is_access_flag_fault(esr)) { kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", kvm_vcpu_trap_get_class(vcpu), (unsigned long)kvm_vcpu_trap_get_fault(vcpu), @@ -1732,13 +1730,14 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) /* Userspace should not be able to register out-of-bounds IPAs */ VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); - if (fault_status == ESR_ELx_FSC_ACCESS) { + if (esr_fsc_is_access_flag_fault(esr)) { handle_access_fault(vcpu, fault_ipa); ret = 1; goto out_unlock; } - ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); + ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, + esr_fsc_is_permission_fault(esr)); if (ret == 0) ret = 1; out: -- cgit v1.2.3 From 75bedc1ee90bd54ae8c5ab8be72506f8c5959584 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Oct 2023 08:19:06 -0700 Subject: KVM: x86: Turn off KVM_WERROR by default for all configs Don't enable KVM_WERROR by default for x86-64 builds as KVM's one-off -Werror enabling is *mostly* superseded by the kernel-wide WERROR, and enabling KVM_WERROR by default can cause problems for developers working on other subsystems. E.g. 
subsystems that have a "zero W=1 regressions" rule can inadvertently build KVM with -Werror and W=1, and end up with build failures that are completely uninteresting to the developer (W=1 is prone to false positives, especially on older compilers). Keep KVM_WERROR as there are combinations where enabling WERROR isn't feasible, e.g. the default FRAME_WARN=1024 on i386 builds generates a non-zero number of warnings and thus errors, and there are far too many warnings throughout the kernel to enable WERROR with W=1 (building KVM with -Werror is desirable (with a sane compiler) as W=1 does generate useful warnings). Opportunistically drop the dependency on !COMPILE_TEST as it's completely meaningless (it was copied from i195's -Werror Kconfig), as the kernel's WERROR is explicitly *enabled* for COMPILE_TEST=y kernel's, i.e. enabling -Werror is obviosly not dependent on COMPILE_TEST=n. Reported-by: Jakub Kicinski Link: https://lore.kernel.org/all/20231006205415.3501535-1-kuba@kernel.org Link: https://lore.kernel.org/r/20231018151906.1841689-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/Kconfig | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index c1716e83d176..4d086fe67a7c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -65,13 +65,13 @@ config KVM config KVM_WERROR bool "Compile KVM with -Werror" - # KASAN may cause the build to fail due to larger frames - default y if X86_64 && !KASAN - # We use the dependency on !COMPILE_TEST to not be enabled - # blindly in allmodconfig or allyesconfig configurations - depends on KVM - depends on (X86_64 && !KASAN) || !COMPILE_TEST - depends on EXPERT + # Disallow KVM's -Werror if KASAN is enabled, e.g. to guard against + # randomized configs from selecting KVM_WERROR=y, which doesn't play + # nice with KASAN. KASAN builds generates warnings for the default + # FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning. + # Building KVM with -Werror and KASAN is still doable via enabling + # the kernel-wide WERROR=y. + depends on KVM && EXPERT && !KASAN help Add -Werror to the build flags for KVM. -- cgit v1.2.3 From eefe5e6682099445f77f2d97d4c525f9ac9d9b07 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 23 Oct 2023 17:16:35 -0700 Subject: KVM: x86: Advertise CPUID.(EAX=7,ECX=2):EDX[5:0] to userspace The low five bits {INTEL_PSFD, IPRED_CTRL, RRSBA_CTRL, DDPD_U, BHI_CTRL} advertise the availability of specific bits in IA32_SPEC_CTRL. Since KVM dynamically determines the legal IA32_SPEC_CTRL bits for the underlying hardware, the hard work has already been done. Just let userspace know that a guest can use these IA32_SPEC_CTRL bits. The sixth bit (MCDT_NO) states that the processor does not exhibit MXCSR Configuration Dependent Timing (MCDT) behavior. This is an inherent property of the physical processor that is inherited by the virtual CPU. Pass that information on to userspace. 
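The user-space sketch below decodes the six bits in question, using the bit positions listed in the reverse_cpuid.h hunk that follows. As with any raw CPUID probe, it assumes a GCC/Clang-style <cpuid.h> that provides __get_cpuid_count(), and inside a guest it reflects whatever the VMM exposed.

  /* cpuid_7_2.c - build with: cc -o cpuid_7_2 cpuid_7_2.c */
  #include <stdio.h>
  #include <cpuid.h>

  static const char * const bit_names[] = {
          "INTEL_PSFD", "IPRED_CTRL", "RRSBA_CTRL",
          "DDPD_U", "BHI_CTRL", "MCDT_NO",
  };

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;
          unsigned int i;

          if (!__get_cpuid_count(7, 2, &eax, &ebx, &ecx, &edx)) {
                  puts("CPUID leaf 0x7 not supported");
                  return 1;
          }

          for (i = 0; i < sizeof(bit_names) / sizeof(bit_names[0]); i++)
                  printf("%-10s: %u\n", bit_names[i], (edx >> i) & 1);
          return 0;
  }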
Signed-off-by: Jim Mattson Reviewed-by: Chao Gao Link: https://lore.kernel.org/r/20231024001636.890236-1-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 21 ++++++++++++++++++--- arch/x86/kvm/reverse_cpuid.h | 12 ++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index dda6fc4cfae8..1811a9ddfe1d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -679,6 +679,11 @@ void kvm_set_cpu_caps(void) F(AMX_COMPLEX) ); + kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX, + F(INTEL_PSFD) | F(IPRED_CTRL) | F(RRSBA_CTRL) | F(DDPD_U) | + F(BHI_CTRL) | F(MCDT_NO) + ); + kvm_cpu_cap_mask(CPUID_D_1_EAX, F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd ); @@ -960,13 +965,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; /* function 7 has additional index. */ case 7: - entry->eax = min(entry->eax, 1u); + max_idx = entry->eax = min(entry->eax, 2u); cpuid_entry_override(entry, CPUID_7_0_EBX); cpuid_entry_override(entry, CPUID_7_ECX); cpuid_entry_override(entry, CPUID_7_EDX); - /* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */ - if (entry->eax == 1) { + /* KVM only supports up to 0x7.2, capped above via min(). */ + if (max_idx >= 1) { entry = do_host_cpuid(array, function, 1); if (!entry) goto out; @@ -976,6 +981,16 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx = 0; entry->ecx = 0; } + if (max_idx >= 2) { + entry = do_host_cpuid(array, function, 2); + if (!entry) + goto out; + + cpuid_entry_override(entry, CPUID_7_2_EDX); + entry->ecx = 0; + entry->ebx = 0; + entry->eax = 0; + } break; case 0xa: { /* Architectural Performance Monitoring */ union cpuid10_eax eax; diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index b81650678375..17007016d8b5 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -16,6 +16,7 @@ enum kvm_only_cpuid_leafs { CPUID_7_1_EDX, CPUID_8000_0007_EDX, CPUID_8000_0022_EAX, + CPUID_7_2_EDX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -46,6 +47,14 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) +/* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */ +#define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0) +#define X86_FEATURE_IPRED_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 1) +#define KVM_X86_FEATURE_RRSBA_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 2) +#define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3) +#define X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) +#define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) + /* CPUID level 0x80000007 (EDX). 
*/ #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) @@ -80,6 +89,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX}, [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, + [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, }; /* @@ -116,6 +126,8 @@ static __always_inline u32 __feature_translate(int x86_feature) return KVM_X86_FEATURE_CONSTANT_TSC; else if (x86_feature == X86_FEATURE_PERFMON_V2) return KVM_X86_FEATURE_PERFMON_V2; + else if (x86_feature == X86_FEATURE_RRSBA_CTRL) + return KVM_X86_FEATURE_RRSBA_CTRL; return x86_feature; } -- cgit v1.2.3 From 80c883db87d9ffe2d685e91ba07a087b1c246c78 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 23 Oct 2023 17:16:36 -0700 Subject: KVM: x86: Use a switch statement and macros in __feature_translate() Use a switch statement with macro-generated case statements to handle translating feature flags in order to reduce the probability of runtime errors due to copy+paste goofs, to make compile-time errors easier to debug, and to make the code more readable. E.g. the compiler won't directly generate an error for duplicate if statements if (x86_feature == X86_FEATURE_SGX1) return KVM_X86_FEATURE_SGX1; else if (x86_feature == X86_FEATURE_SGX2) return KVM_X86_FEATURE_SGX1; and so instead reverse_cpuid_check() will fail due to the untranslated entry pointing at a Linux-defined leaf, which provides practically no hint as to what is broken arch/x86/kvm/reverse_cpuid.h:108:2: error: call to __compiletime_assert_450 declared with 'error' attribute: BUILD_BUG_ON failed: x86_leaf == CPUID_LNX_4 BUILD_BUG_ON(x86_leaf == CPUID_LNX_4); ^ whereas duplicate case statements very explicitly point at the offending code: arch/x86/kvm/reverse_cpuid.h:125:2: error: duplicate case value '361' KVM_X86_TRANSLATE_FEATURE(SGX2); ^ arch/x86/kvm/reverse_cpuid.h:124:2: error: duplicate case value '360' KVM_X86_TRANSLATE_FEATURE(SGX1); ^ And without macros, the opposite type of copy+paste goof doesn't generate any error at compile-time, e.g. this yields no complaints: case X86_FEATURE_SGX1: return KVM_X86_FEATURE_SGX1; case X86_FEATURE_SGX2: return KVM_X86_FEATURE_SGX1; Note, __feature_translate() is forcibly inlined and the feature is known at compile-time, so the code generation between an if-elif sequence and a switch statement should be identical. 
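The pattern is easy to lift into a stand-alone example. The sketch below uses made-up HOST_/VIRT_ feature numbers rather than KVM's real X86_FEATURE_* encodings, but it shows the same macro-generated case labels: duplicating one of the TRANSLATE_FEATURE() lines turns the copy+paste goof into a hard "duplicate case value" compile error instead of a silent bug.

  /* translate_sketch.c - build with: cc -o translate_sketch translate_sketch.c */
  #include <stdio.h>

  enum { HOST_FEATURE_FOO = 10, HOST_FEATURE_BAR = 11, HOST_FEATURE_BAZ = 12 };
  enum { VIRT_FEATURE_FOO = 100, VIRT_FEATURE_BAR = 101 };

  static int translate(int feature)
  {
  /* Each line expands to "case HOST_FEATURE_x: return VIRT_FEATURE_x;",
   * so a repeated entry is rejected by the compiler as a duplicate case.
   */
  #define TRANSLATE_FEATURE(f) case HOST_FEATURE_##f: return VIRT_FEATURE_##f

          switch (feature) {
          TRANSLATE_FEATURE(FOO);
          TRANSLATE_FEATURE(BAR);
          default:
                  return feature; /* untranslated features map to themselves */
          }
  #undef TRANSLATE_FEATURE
  }

  int main(void)
  {
          printf("%d %d %d\n", translate(HOST_FEATURE_FOO),
                 translate(HOST_FEATURE_BAR), translate(HOST_FEATURE_BAZ));
          return 0;
  }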
Signed-off-by: Jim Mattson Link: https://lore.kernel.org/r/20231024001636.890236-2-jmattson@google.com [sean: use a macro, rewrite changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/reverse_cpuid.h | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 17007016d8b5..aadefcaa9561 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -116,20 +116,19 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) */ static __always_inline u32 __feature_translate(int x86_feature) { - if (x86_feature == X86_FEATURE_SGX1) - return KVM_X86_FEATURE_SGX1; - else if (x86_feature == X86_FEATURE_SGX2) - return KVM_X86_FEATURE_SGX2; - else if (x86_feature == X86_FEATURE_SGX_EDECCSSA) - return KVM_X86_FEATURE_SGX_EDECCSSA; - else if (x86_feature == X86_FEATURE_CONSTANT_TSC) - return KVM_X86_FEATURE_CONSTANT_TSC; - else if (x86_feature == X86_FEATURE_PERFMON_V2) - return KVM_X86_FEATURE_PERFMON_V2; - else if (x86_feature == X86_FEATURE_RRSBA_CTRL) - return KVM_X86_FEATURE_RRSBA_CTRL; - - return x86_feature; +#define KVM_X86_TRANSLATE_FEATURE(f) \ + case X86_FEATURE_##f: return KVM_X86_FEATURE_##f + + switch (x86_feature) { + KVM_X86_TRANSLATE_FEATURE(SGX1); + KVM_X86_TRANSLATE_FEATURE(SGX2); + KVM_X86_TRANSLATE_FEATURE(SGX_EDECCSSA); + KVM_X86_TRANSLATE_FEATURE(CONSTANT_TSC); + KVM_X86_TRANSLATE_FEATURE(PERFMON_V2); + KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL); + default: + return x86_feature; + } } static __always_inline u32 __feature_leaf(int x86_feature) -- cgit v1.2.3 From c52ffadc65e28ab461fd055e9991e8d8106a0056 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Oct 2023 12:56:38 -0700 Subject: KVM: x86: Don't unnecessarily force masterclock update on vCPU hotplug Don't force a masterclock update when a vCPU synchronizes to the current TSC generation, e.g. when userspace hotplugs a pre-created vCPU into the VM. Unnecessarily updating the masterclock is undesirable as it can cause kvmclock's time to jump, which is particularly painful on systems with a stable TSC as kvmclock _should_ be fully reliable on such systems. The unexpected time jumps are due to differences in the TSC=>nanoseconds conversion algorithms between kvmclock and the host's CLOCK_MONOTONIC_RAW (the pvclock algorithm is inherently lossy). When updating the masterclock, KVM refreshes the "base", i.e. moves the elapsed time since the last update from the kvmclock/pvclock algorithm to the CLOCK_MONOTONIC_RAW algorithm. Synchronizing kvmclock with CLOCK_MONOTONIC_RAW is the lesser of evils when the TSC is unstable, but adds no real value when the TSC is stable. Prior to commit 7f187922ddf6 ("KVM: x86: update masterclock values on TSC writes"), KVM did NOT force an update when synchronizing a vCPU to the current generation. commit 7f187922ddf6b67f2999a76dcb71663097b75497 Author: Marcelo Tosatti Date: Tue Nov 4 21:30:44 2014 -0200 KVM: x86: update masterclock values on TSC writes When the guest writes to the TSC, the masterclock TSC copy must be updated as well along with the TSC_OFFSET update, otherwise a negative tsc_timestamp is calculated at kvm_guest_time_update. Once "if (!vcpus_matched && ka->use_master_clock)" is simplified to "if (ka->use_master_clock)", the corresponding "if (!ka->use_master_clock)" becomes redundant, so remove the do_request boolean and collapse everything into a single condition. 
Before that, KVM only re-synced the masterclock if the masterclock was enabled or disabled Note, at the time of the above commit, VMX synchronized TSC on *guest* writes to MSR_IA32_TSC: case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; which is why the changelog specifically says "guest writes", but the bug that was being fixed wasn't unique to guest write, i.e. a TSC write from the host would suffer the same problem. So even though KVM stopped synchronizing on guest writes as of commit 0c899c25d754 ("KVM: x86: do not attempt TSC synchronization on guest writes"), simply reverting commit 7f187922ddf6 is not an option. Figuring out how a negative tsc_timestamp could be computed requires a bit more sleuthing. In kvm_write_tsc() (at the time), except for KVM's "less than 1 second" hack, KVM snapshotted the vCPU's current TSC *and* the current time in nanoseconds, where kvm->arch.cur_tsc_nsec is the current host kernel time in nanoseconds: ns = get_kernel_ns(); ... if (usdiff < USEC_PER_SEC && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { ... } else { /* * We split periods of matched TSC writes into generations. * For each generation, we track the original measured * nanosecond time, offset, and write, so if TSCs are in * sync, we can match exact offset, and if not, we can match * exact software computation in compute_guest_tsc() * * These values are tracked in kvm->arch.cur_xxx variables. */ kvm->arch.cur_tsc_generation++; kvm->arch.cur_tsc_nsec = ns; kvm->arch.cur_tsc_write = data; kvm->arch.cur_tsc_offset = offset; matched = false; pr_debug("kvm: new tsc generation %llu, clock %llu\n", kvm->arch.cur_tsc_generation, data); } ... /* Keep track of which generation this VCPU has synchronized to */ vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; Note that the above creates a new generation and sets "matched" to false! But because kvm_track_tsc_matching() looks for matched+1, i.e. doesn't require the vCPU that creates the new generation to match itself, KVM would immediately compute vcpus_matched as true for VMs with a single vCPU. As a result, KVM would skip the masterlock update, even though a new TSC generation was created: vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == atomic_read(&vcpu->kvm->online_vcpus)); if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC) if (!ka->use_master_clock) do_request = 1; if (!vcpus_matched && ka->use_master_clock) do_request = 1; if (do_request) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); On hardware without TSC scaling support, vcpu->tsc_catchup is set to true if the guest TSC frequency is faster than the host TSC frequency, even if the TSC is otherwise stable. And for that mode, kvm_guest_time_update(), by way of compute_guest_tsc(), uses vcpu->arch.this_tsc_nsec, a.k.a. the kernel time at the last TSC write, to compute the guest TSC relative to kernel time: static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, vcpu->arch.virtual_tsc_mult, vcpu->arch.virtual_tsc_shift); tsc += vcpu->arch.this_tsc_write; return tsc; } Except the "kernel_ns" passed to compute_guest_tsc() isn't the current kernel time, it's the masterclock snapshot! 
spin_lock(&ka->pvclock_gtod_sync_lock); use_master_clock = ka->use_master_clock; if (use_master_clock) { host_tsc = ka->master_cycle_now; kernel_ns = ka->master_kernel_ns; } spin_unlock(&ka->pvclock_gtod_sync_lock); if (vcpu->tsc_catchup) { u64 tsc = compute_guest_tsc(v, kernel_ns); if (tsc > tsc_timestamp) { adjust_tsc_offset_guest(v, tsc - tsc_timestamp); tsc_timestamp = tsc; } } And so when KVM skips the masterclock update after a TSC write, i.e. after a new TSC generation is started, the "kernel_ns-vcpu->arch.this_tsc_nsec" is *guaranteed* to generate a negative value, because this_tsc_nsec was captured after ka->master_kernel_ns. Forcing a masterclock update essentially fudged around that problem, but in a heavy handed way that introduced undesirable side effects, i.e. unnecessarily forces a masterclock update when a new vCPU joins the party via hotplug. Note, KVM forces masterclock updates in other weird ways that are also likely unnecessary, e.g. when establishing a new Xen shared info page and when userspace creates a brand new vCPU. But the Xen thing is firmly a separate mess, and there are no known userspace VMMs that utilize kvmclock *and* create new vCPUs after the VM is up and running. I.e. the other issues are future problems. Reported-by: Dongli Zhang Closes: https://lore.kernel.org/all/20230926230649.67852-1-dongli.zhang@oracle.com Fixes: 7f187922ddf6 ("KVM: x86: update masterclock values on TSC writes") Cc: David Woodhouse Reviewed-by: Dongli Zhang Tested-by: Dongli Zhang Link: https://lore.kernel.org/r/20231018195638.1898375-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d0772b47041..99ec48203667 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2510,26 +2510,29 @@ static inline int gtod_is_based_on_tsc(int mode) } #endif -static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) +static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu, bool new_generation) { #ifdef CONFIG_X86_64 - bool vcpus_matched; struct kvm_arch *ka = &vcpu->kvm->arch; struct pvclock_gtod_data *gtod = &pvclock_gtod_data; - vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == - atomic_read(&vcpu->kvm->online_vcpus)); + /* + * To use the masterclock, the host clocksource must be based on TSC + * and all vCPUs must have matching TSCs. Note, the count for matching + * vCPUs doesn't include the reference vCPU, hence "+1". + */ + bool use_master_clock = (ka->nr_vcpus_matched_tsc + 1 == + atomic_read(&vcpu->kvm->online_vcpus)) && + gtod_is_based_on_tsc(gtod->clock.vclock_mode); /* - * Once the masterclock is enabled, always perform request in - * order to update it. - * - * In order to enable masterclock, the host clocksource must be TSC - * and the vcpus need to have matched TSCs. When that happens, - * perform request to enable masterclock. + * Request a masterclock update if the masterclock needs to be toggled + * on/off, or when starting a new generation and the masterclock is + * enabled (compute_guest_tsc() requires the masterclock snapshot to be + * taken _after_ the new generation is created). 
*/ - if (ka->use_master_clock || - (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched)) + if ((ka->use_master_clock && new_generation) || + (ka->use_master_clock != use_master_clock)) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, @@ -2706,7 +2709,7 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - kvm_track_tsc_matching(vcpu); + kvm_track_tsc_matching(vcpu, !matched); } static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) -- cgit v1.2.3 From a484755ab2526ebdbe042397cdd6e427eb4b1a68 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Oct 2023 12:41:03 -0700 Subject: Revert "nSVM: Check for reserved encodings of TLB_CONTROL in nested VMCB" Revert KVM's made-up consistency check on SVM's TLB control. The APM says that unsupported encodings are reserved, but the APM doesn't state that VMRUN checks for a supported encoding. Unless something is called out in "Canonicalization and Consistency Checks" or listed as MBZ (Must Be Zero), AMD behavior is typically to let software shoot itself in the foot. This reverts commit 174a921b6975ef959dd82ee9e8844067a62e3ec1. Fixes: 174a921b6975 ("nSVM: Check for reserved encodings of TLB_CONTROL in nested VMCB") Reported-by: Stefan Sterz Closes: https://lkml.kernel.org/r/b9915c9c-4cf6-051a-2d91-44cc6380f455%40proxmox.com Cc: stable@vger.kernel.org Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20231018194104.1896415-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 3fea8c47679e..60891b9ce25f 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -247,18 +247,6 @@ static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size) kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1); } -static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl) -{ - /* Nested FLUSHBYASID is not supported yet. */ - switch(tlb_ctl) { - case TLB_CONTROL_DO_NOTHING: - case TLB_CONTROL_FLUSH_ALL_ASID: - return true; - default: - return false; - } -} - static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, struct vmcb_ctrl_area_cached *control) { @@ -278,9 +266,6 @@ static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, IOPM_SIZE))) return false; - if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl))) - return false; - if (CC((control->int_ctl & V_NMI_ENABLE_MASK) && !vmcb12_is_intercept(control, INTERCEPT_NMI))) { return false; -- cgit v1.2.3 From 176bfc5b17fee327585583a427e2857d9dfd8f68 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Oct 2023 12:41:04 -0700 Subject: KVM: nSVM: Advertise support for flush-by-ASID Advertise support for FLUSHBYASID when nested SVM is enabled, as KVM can always emulate flushing TLB entries for a vmcb12 ASID, e.g. by running L2 with a new, fresh ASID in vmcb02. Some modern hypervisors, e.g. VMWare Workstation 17, require FLUSHBYASID support and will refuse to run if it's not present. 
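For reference, a guest hypervisor typically discovers the capability via CPUID; an illustrative userspace check (not KVM code, GCC/Clang <cpuid.h> assumed), with FLUSHBYASID enumerated in CPUID 0x8000000A.EDX[6] per the APM:

    /* flushbyasid.c -- build with: cc -o flushbyasid flushbyasid.c */
    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* Leaf 0x8000000A is only meaningful when SVM is enumerated. */
            if (!__get_cpuid_count(0x8000000a, 0, &eax, &ebx, &ecx, &edx))
                    return 1;

            printf("FLUSHBYASID: %s\n", (edx & (1u << 6)) ? "yes" : "no");
            return 0;
    }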
Punt on proper support, as "Honor L1's request to flush an ASID on nested VMRUN" is one of the TODO items in the (incomplete) list of issues that need to be addressed in order for KVM to NOT do a full TLB flush on every nested SVM transition (see nested_svm_transition_tlb_flush()). Reported-by: Stefan Sterz Closes: https://lkml.kernel.org/r/b9915c9c-4cf6-051a-2d91-44cc6380f455%40proxmox.com Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20231018194104.1896415-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 712146312358..30352c42d62a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5076,6 +5076,13 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SVM); kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN); + /* + * KVM currently flushes TLBs on *every* nested SVM transition, + * and so for all intents and purposes KVM supports flushing by + * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush. + */ + kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID); + if (nrips) kvm_cpu_cap_set(X86_FEATURE_NRIPS); -- cgit v1.2.3 From 770d6aa2e416fd26f0356e258c77a37574ad9b8c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Oct 2023 12:36:17 -0700 Subject: KVM: SVM: Explicitly require FLUSHBYASID to enable SEV support Add a sanity check that FLUSHBYASID is available if SEV is supported in hardware, as SEV (and beyond) guests are bound to a single ASID, i.e. KVM can't "flush" by assigning a new, fresh ASID to the guest. If FLUSHBYASID isn't supported for some bizarre reason, KVM would completely fail to do TLB flushes for SEV+ guests (see pre_svm_run() and pre_sev_run()). Cc: Tom Lendacky Link: https://lore.kernel.org/r/20231018193617.1895752-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 4900c078045a..d0c580607f00 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2191,10 +2191,13 @@ void __init sev_hardware_setup(void) /* * SEV must obviously be supported in hardware. Sanity check that the * CPU supports decode assists, which is mandatory for SEV guests to - * support instruction emulation. + * support instruction emulation. Ditto for flushing by ASID, as SEV + * guests are bound to a single ASID, i.e. KVM can't rotate to a new + * ASID to effect a TLB flush. */ if (!boot_cpu_has(X86_FEATURE_SEV) || - WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS))) + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) || + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) goto out; /* Retrieve SEV CPUID information */ -- cgit v1.2.3 From 72046d0a077a8f70d4d1e5bdeed324c1a310da8c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Oct 2023 12:20:21 -0700 Subject: KVM: SVM: Don't intercept IRET when injecting NMI and vNMI is enabled When vNMI is enabled, rely entirely on hardware to correctly handle NMI blocking, i.e. don't intercept IRET to detect when NMIs are no longer blocked. KVM already correctly ignores svm->nmi_masked when vNMI is enabled, so the effect of the bug is essentially an unnecessary VM-Exit. KVM intercepts IRET for two reasons: - To track NMI masking to be able to know at any point of time if NMI is masked. 
- To track NMI windows (to inject another NMI after the guest executes IRET, i.e. unblocks NMIs) When vNMI is enabled, both cases are handled by hardware: - NMI masking state resides in int_ctl.V_NMI_BLOCKING and can be read by KVM at will. - Hardware automatically "injects" pending virtual NMIs when virtual NMIs become unblocked. However, even though pending a virtual NMI for hardware to handle is the most common way to synthesize a guest NMI, KVM may still directly inject an NMI via when KVM is handling two "simultaneous" NMIs (see comments in process_nmi() for details on KVM's simultaneous NMI handling). Per AMD's APM, hardware sets the BLOCKING flag when software directly injects an NMI as well, i.e. KVM doesn't need to manually mark vNMIs as blocked: If Event Injection is used to inject an NMI when NMI Virtualization is enabled, VMRUN sets V_NMI_MASK in the guest state. Note, it's still possible that KVM could trigger a spurious IRET VM-Exit. When running a nested guest, KVM disables vNMI for L2 and thus will enable IRET interception (in both vmcb01 and vmcb02) while running L2 reason. If a nested VM-Exit happens before L2 executes IRET, KVM can end up running L1 with vNMI enable and IRET intercepted. This is also a benign bug, and even less likely to happen, i.e. can be safely punted to a future fix. Fixes: fa4c027a7956 ("KVM: x86: Add support for SVM's Virtual NMI") Link: https://lore.kernel.org/all/ZOdnuDZUd4mevCqe@google.como Cc: Santosh Shukla Cc: Maxim Levitsky Tested-by: Santosh Shukla Link: https://lore.kernel.org/r/20231018192021.1893261-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 30352c42d62a..c46f07b28230 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3560,8 +3560,15 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) if (svm->nmi_l1_to_l2) return; - svm->nmi_masked = true; - svm_set_iret_intercept(svm); + /* + * No need to manually track NMI masking when vNMI is enabled, hardware + * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the + * case where software directly injects an NMI. + */ + if (!is_vnmi_enabled(svm)) { + svm->nmi_masked = true; + svm_set_iret_intercept(svm); + } ++vcpu->stat.nmi_injections; } -- cgit v1.2.3 From 15223c4f973a6120665ece9ce1ad17aec0be0e6c Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 31 Oct 2023 08:52:40 +0100 Subject: KVM: SVM,VMX: Use %rip-relative addressing to access kvm_rebooting Instruction with %rip-relative address operand is one byte shorter than its absolute address counterpart and is also compatible with position independent executable (-fpie) build. No functional changes intended. 
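As a standalone illustration (not the kernel code, x86-64 with a GNU toolchain assumed), the same access pattern in a tiny -fpie program; the global name merely mirrors the kernel's flag:

    /* rip_rel.c -- build with: cc -fpie -pie -o rip_rel rip_rel.c */
    #include <stdio.h>

    unsigned char kvm_rebooting;    /* stand-in for the kernel's global flag */

    int main(void)
    {
            unsigned int val;

            /*
             * %rip-relative operand: one byte shorter than the absolute form
             * and valid in position-independent code.
             */
            asm volatile("movzbl kvm_rebooting(%%rip), %0" : "=r" (val));

            printf("kvm_rebooting = %u\n", val);
            return 0;
    }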
Cc: Sean Christopherson Cc: Paolo Bonzini Signed-off-by: Uros Bizjak Link: https://lore.kernel.org/r/20231031075312.47525-1-ubizjak@gmail.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 10 +++++----- arch/x86/kvm/vmx/vmenter.S | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index ef2ebabb059c..9499f9c6b077 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -270,16 +270,16 @@ SYM_FUNC_START(__svm_vcpu_run) RESTORE_GUEST_SPEC_CTRL_BODY RESTORE_HOST_SPEC_CTRL_BODY -10: cmpb $0, kvm_rebooting +10: cmpb $0, _ASM_RIP(kvm_rebooting) jne 2b ud2 -30: cmpb $0, kvm_rebooting +30: cmpb $0, _ASM_RIP(kvm_rebooting) jne 4b ud2 -50: cmpb $0, kvm_rebooting +50: cmpb $0, _ASM_RIP(kvm_rebooting) jne 6b ud2 -70: cmpb $0, kvm_rebooting +70: cmpb $0, _ASM_RIP(kvm_rebooting) jne 8b ud2 @@ -381,7 +381,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) RESTORE_GUEST_SPEC_CTRL_BODY RESTORE_HOST_SPEC_CTRL_BODY -3: cmpb $0, kvm_rebooting +3: cmpb $0, _ASM_RIP(kvm_rebooting) jne 2b ud2 diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index be275a0410a8..906ecd001511 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -289,7 +289,7 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) RET .Lfixup: - cmpb $0, kvm_rebooting + cmpb $0, _ASM_RIP(kvm_rebooting) jne .Lvmfail ud2 .Lvmfail: -- cgit v1.2.3 From cbb359d81a2695bb5e63ec9de06fcbef28518891 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Nov 2023 16:05:36 -0700 Subject: KVM: x86/pmu: Move PMU reset logic to common x86 code Move the common (or at least "ignored") aspects of resetting the vPMU to common x86 code, along with the stop/release helpers that are no used only by the common pmu.c. There is no need to manually handle fixed counters as all_valid_pmc_idx tracks both fixed and general purpose counters, and resetting the vPMU is far from a hot path, i.e. the extra bit of overhead to the PMC from the index is a non-issue. Zero fixed_ctr_ctrl in common code even though it's Intel specific. Ensuring it's zero doesn't harm AMD/SVM in any way, and stopping the fixed counters via all_valid_pmc_idx, but not clearing the associated control bits, would be odd/confusing. Make the .reset() hook optional as SVM no longer needs vendor specific handling. 
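For reference, the general shape of an optional vendor hook, sketched as standalone (non-KVM) C with invented names; static_call_cond() in the diff below roughly plays the role of the NULL check here:

    #include <stdio.h>

    struct vcpu { int id; };

    struct pmu_ops {
            void (*reset)(struct vcpu *vcpu);       /* optional, may be NULL */
    };

    static void common_pmu_reset(struct vcpu *vcpu)
    {
            printf("common reset for vcpu %d\n", vcpu->id);
    }

    static void pmu_reset(struct vcpu *vcpu, const struct pmu_ops *ops)
    {
            common_pmu_reset(vcpu);         /* shared work, done unconditionally */

            if (ops->reset)                 /* vendor hook is now optional */
                    ops->reset(vcpu);
    }

    int main(void)
    {
            struct vcpu v = { .id = 0 };
            struct pmu_ops amd_like = { .reset = NULL };    /* no vendor hook */

            pmu_reset(&v, &amd_like);
            return 0;
    }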
Cc: stable@vger.kernel.org Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20231103230541.352265-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-pmu-ops.h | 2 +- arch/x86/kvm/pmu.c | 40 +++++++++++++++++++++++++++++++++- arch/x86/kvm/pmu.h | 18 --------------- arch/x86/kvm/svm/pmu.c | 16 -------------- arch/x86/kvm/vmx/pmu_intel.c | 20 ----------------- 5 files changed, 40 insertions(+), 56 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index 6c98f4bb4228..058bc636356a 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -22,7 +22,7 @@ KVM_X86_PMU_OP(get_msr) KVM_X86_PMU_OP(set_msr) KVM_X86_PMU_OP(refresh) KVM_X86_PMU_OP(init) -KVM_X86_PMU_OP(reset) +KVM_X86_PMU_OP_OPTIONAL(reset) KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) KVM_X86_PMU_OP_OPTIONAL(cleanup) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 9ae07db6f0f6..027e9c3c2b93 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -250,6 +250,24 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } +static void pmc_release_perf_event(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + pmc->current_config = 0; + pmc_to_pmu(pmc)->event_count--; + } +} + +static void pmc_stop_counter(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + pmc->counter = pmc_read_counter(pmc); + pmc_release_perf_event(pmc); + } +} + static int filter_cmp(const void *pa, const void *pb, u64 mask) { u64 a = *(u64 *)pa & mask; @@ -654,7 +672,27 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) void kvm_pmu_reset(struct kvm_vcpu *vcpu) { - static_call(kvm_x86_pmu_reset)(vcpu); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + int i; + + bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX); + + for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { + pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); + if (!pmc) + continue; + + pmc_stop_counter(pmc); + pmc->counter = 0; + + if (pmc_is_gp(pmc)) + pmc->eventsel = 0; + } + + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; + + static_call_cond(kvm_x86_pmu_reset)(vcpu); } void kvm_pmu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 1d64113de488..a46aa9b25150 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -80,24 +80,6 @@ static inline void pmc_write_counter(struct kvm_pmc *pmc, u64 val) pmc->counter &= pmc_bitmask(pmc); } -static inline void pmc_release_perf_event(struct kvm_pmc *pmc) -{ - if (pmc->perf_event) { - perf_event_release_kernel(pmc->perf_event); - pmc->perf_event = NULL; - pmc->current_config = 0; - pmc_to_pmu(pmc)->event_count--; - } -} - -static inline void pmc_stop_counter(struct kvm_pmc *pmc) -{ - if (pmc->perf_event) { - pmc->counter = pmc_read_counter(pmc); - pmc_release_perf_event(pmc); - } -} - static inline bool pmc_is_gp(struct kvm_pmc *pmc) { return pmc->type == KVM_PMC_GP; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 373ff6a6687b..3fd47de14b38 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -233,21 +233,6 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) } } -static void amd_pmu_reset(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - int i; - - for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC; i++) { - struct kvm_pmc *pmc = &pmu->gp_counters[i]; - - pmc_stop_counter(pmc); - pmc->counter = 
pmc->prev_counter = pmc->eventsel = 0; - } - - pmu->global_ctrl = pmu->global_status = 0; -} - struct kvm_pmu_ops amd_pmu_ops __initdata = { .hw_event_available = amd_hw_event_available, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, @@ -259,7 +244,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .set_msr = amd_pmu_set_msr, .refresh = amd_pmu_refresh, .init = amd_pmu_init, - .reset = amd_pmu_reset, .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 820d3e1f6b4f..90c1f7f07e53 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -632,26 +632,6 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) static void intel_pmu_reset(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc = NULL; - int i; - - for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { - pmc = &pmu->gp_counters[i]; - - pmc_stop_counter(pmc); - pmc->counter = pmc->prev_counter = pmc->eventsel = 0; - } - - for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { - pmc = &pmu->fixed_counters[i]; - - pmc_stop_counter(pmc); - pmc->counter = pmc->prev_counter = 0; - } - - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; - intel_pmu_release_guest_lbr_event(vcpu); } -- cgit v1.2.3 From 1647b52757d59131fe30cf73fa36fac834d4367f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Nov 2023 16:05:37 -0700 Subject: KVM: x86/pmu: Reset the PMU, i.e. stop counters, before refreshing Stop all counters and release all perf events before refreshing the vPMU, i.e. before reconfiguring the vPMU to respond to changes in the vCPU model. Clear need_cleanup in kvm_pmu_reset() as well so that KVM doesn't prematurely stop counters, e.g. if KVM enters the guest and enables counters before the vCPU is scheduled out. Cc: stable@vger.kernel.org Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20231103230541.352265-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 027e9c3c2b93..dc8e8e907cfb 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -657,25 +657,14 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } -/* refresh PMU settings. This function generally is called when underlying - * settings are changed (such as changes of PMU CPUID by guest VMs), which - * should rarely happen. - */ -void kvm_pmu_refresh(struct kvm_vcpu *vcpu) -{ - if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) - return; - - bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX); - static_call(kvm_x86_pmu_refresh)(vcpu); -} - void kvm_pmu_reset(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; int i; + pmu->need_cleanup = false; + bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX); for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { @@ -695,6 +684,26 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) static_call_cond(kvm_x86_pmu_reset)(vcpu); } + +/* + * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID + * and/or PERF_CAPABILITIES. + */ +void kvm_pmu_refresh(struct kvm_vcpu *vcpu) +{ + if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) + return; + + /* + * Stop/release all existing counters/events before realizing the new + * vPMU model. 
+ */ + kvm_pmu_reset(vcpu); + + bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX); + static_call(kvm_x86_pmu_refresh)(vcpu); +} + void kvm_pmu_init(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); -- cgit v1.2.3 From f2f63f7ec6fd13d2d5d5c6d90ea438fbb5a36adc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Nov 2023 16:05:38 -0700 Subject: KVM: x86/pmu: Stop calling kvm_pmu_reset() at RESET (it's redundant) Drop kvm_vcpu_reset()'s call to kvm_pmu_reset(), the call is performed only for RESET, which is really just the same thing as vCPU creation, and kvm_arch_vcpu_create() *just* called kvm_pmu_init(), i.e. there can't possibly be any work to do. Unlike Intel, AMD's amd_pmu_refresh() does fill all_valid_pmc_idx even if guest CPUID is empty, but everything that is at all dynamic is guaranteed to be '0'/NULL, e.g. it should be impossible for KVM to have already created a perf event. Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20231103230541.352265-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 2 +- arch/x86/kvm/pmu.h | 1 - arch/x86/kvm/x86.c | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index dc8e8e907cfb..458e836c6efe 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -657,7 +657,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } -void kvm_pmu_reset(struct kvm_vcpu *vcpu) +static void kvm_pmu_reset(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index a46aa9b25150..db9a12c0a2ef 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -243,7 +243,6 @@ bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void kvm_pmu_refresh(struct kvm_vcpu *vcpu); -void kvm_pmu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d0772b47041..33c071cd95b2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12221,7 +12221,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) } if (!init_event) { - kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; vcpu->arch.msr_misc_features_enables = 0; -- cgit v1.2.3 From ec61b2306dfd117ba5db93dfb54808523ea2b5e0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Nov 2023 16:05:39 -0700 Subject: KVM: x86/pmu: Remove manual clearing of fields in kvm_pmu_init() Remove code that unnecessarily clears event_count and need_cleanup in kvm_pmu_init(), the entire kvm_pmu is zeroed just a few lines earlier. Vendor code doesn't set event_count or need_cleanup during .init(), and if either VMX or SVM did set those fields it would be a flagrant bug. 
Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20231103230541.352265-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 458e836c6efe..c06090196b00 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -710,8 +710,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) memset(pmu, 0, sizeof(*pmu)); static_call(kvm_x86_pmu_init)(vcpu); - pmu->event_count = 0; - pmu->need_cleanup = false; kvm_pmu_refresh(vcpu); } -- cgit v1.2.3 From 89acf1237b81802328beaa094b1139dbb2561883 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Nov 2023 16:05:40 -0700 Subject: KVM: x86/pmu: Update sample period in pmc_write_counter() Update a PMC's sample period in pmc_write_counter() to deduplicate code across all callers of pmc_write_counter(). Opportunistically move pmc_write_counter() into pmc.c now that it's doing more work. WRMSR isn't such a hot path that an extra CALL+RET pair will be problematic, and the order of function definitions needs to be changed anyways, i.e. now is a convenient time to eat the churn. Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20231103230541.352265-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 27 +++++++++++++++++++++++++++ arch/x86/kvm/pmu.h | 25 +------------------------ arch/x86/kvm/svm/pmu.c | 1 - arch/x86/kvm/vmx/pmu_intel.c | 2 -- 4 files changed, 28 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index c06090196b00..3725d001239d 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -161,6 +161,15 @@ static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc) return 1; } +static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value) +{ + u64 sample_period = (-counter_value) & pmc_bitmask(pmc); + + if (!sample_period) + sample_period = pmc_bitmask(pmc) + 1; + return sample_period; +} + static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, bool exclude_kernel, bool intr) @@ -268,6 +277,24 @@ static void pmc_stop_counter(struct kvm_pmc *pmc) } } +static void pmc_update_sample_period(struct kvm_pmc *pmc) +{ + if (!pmc->perf_event || pmc->is_paused || + !is_sampling_event(pmc->perf_event)) + return; + + perf_event_period(pmc->perf_event, + get_sample_period(pmc, pmc->counter)); +} + +void pmc_write_counter(struct kvm_pmc *pmc, u64 val) +{ + pmc->counter += val - pmc_read_counter(pmc); + pmc->counter &= pmc_bitmask(pmc); + pmc_update_sample_period(pmc); +} +EXPORT_SYMBOL_GPL(pmc_write_counter); + static int filter_cmp(const void *pa, const void *pb, u64 mask) { u64 a = *(u64 *)pa & mask; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index db9a12c0a2ef..cae85e550f60 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -74,11 +74,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) return counter & pmc_bitmask(pmc); } -static inline void pmc_write_counter(struct kvm_pmc *pmc, u64 val) -{ - pmc->counter += val - pmc_read_counter(pmc); - pmc->counter &= pmc_bitmask(pmc); -} +void pmc_write_counter(struct kvm_pmc *pmc, u64 val); static inline bool pmc_is_gp(struct kvm_pmc *pmc) { @@ -128,25 +124,6 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) return NULL; } -static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value) -{ - u64 sample_period = (-counter_value) & pmc_bitmask(pmc); - - if (!sample_period) - 
sample_period = pmc_bitmask(pmc) + 1; - return sample_period; -} - -static inline void pmc_update_sample_period(struct kvm_pmc *pmc) -{ - if (!pmc->perf_event || pmc->is_paused || - !is_sampling_event(pmc->perf_event)) - return; - - perf_event_period(pmc->perf_event, - get_sample_period(pmc, pmc->counter)); -} - static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 3fd47de14b38..b6a7ad4d6914 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -161,7 +161,6 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { pmc_write_counter(pmc, data); - pmc_update_sample_period(pmc); return 0; } /* MSR_EVNTSELn */ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 90c1f7f07e53..a6216c874729 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -437,11 +437,9 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc_write_counter(pmc, data); - pmc_update_sample_period(pmc); break; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc_write_counter(pmc, data); - pmc_update_sample_period(pmc); break; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { reserved_bits = pmu->reserved_bits; -- cgit v1.2.3 From fd89499a5151d197ba30f7b801f6d8f4646cf446 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Nov 2023 16:05:41 -0700 Subject: KVM: x86/pmu: Track emulated counter events instead of previous counter Explicitly track emulated counter events instead of using the common counter value that's shared with the hardware counter owned by perf. Bumping the common counter requires snapshotting the pre-increment value in order to detect overflow from emulation, and the snapshot approach is inherently flawed. Snapshotting the previous counter at every increment assumes that there is at most one emulated counter event per emulated instruction (or rather, between checks for KVM_REQ_PMU). That's mostly holds true today because KVM only emulates (branch) instructions retired, but the approach will fall apart if KVM ever supports event types that don't have a 1:1 relationship with instructions. And KVM already has a relevant bug, as handle_invalid_guest_state() emulates multiple instructions without checking KVM_REQ_PMU, i.e. could miss an overflow event due to clobbering pmc->prev_counter. Not checking KVM_REQ_PMU is problematic in both cases, but at least with the emulated counter approach, the resulting behavior is delayed overflow detection, as opposed to completely lost detection. Tracking the emulated count fixes another bug where the snapshot approach can signal spurious overflow due to incorporating both the emulated count and perf's count in the check, i.e. if overflow is detected by perf, then KVM's emulation will also incorrectly signal overflow. Add a comment in the related code to call out the need to process emulated events *after* pausing the perf event (big kudos to Mingwei for figuring out that particular wrinkle). 
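For illustration, a standalone (non-KVM) model of the new bookkeeping; the struct and names are invented, "mask" stands in for the PMC width and "perf_delta" for the count consumed from the perf event:

    #include <stdbool.h>
    #include <stdint.h>

    struct pmc_model {
            uint64_t counter;               /* base value, relative to perf */
            uint64_t emulated_counter;      /* emulated events not yet folded in */
    };

    /*
     * Fold in perf's count first, snapshot it, then fold in the emulated
     * count; only the emulated events can signal overflow here, perf owns
     * overflow detection for the hardware count.
     */
    static bool pmc_model_pause(struct pmc_model *pmc, uint64_t perf_delta,
                                uint64_t mask)
    {
            uint64_t counter = pmc->counter + perf_delta;
            uint64_t prev = counter & mask;

            counter += pmc->emulated_counter;
            pmc->counter = counter & mask;
            pmc->emulated_counter = 0;

            return pmc->counter < prev;     /* wrapped => emulated overflow */
    }

    int main(void)
    {
            struct pmc_model pmc = { .counter = 13, .emulated_counter = 3 };

            /* 13 + 1 hardware events plus 3 emulated events wraps a 4-bit
             * toy counter, so emulated overflow is reported. */
            return !pmc_model_pause(&pmc, 1, 0xf);
    }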
Cc: Mingwei Zhang Cc: Roman Kagan Cc: Jim Mattson Cc: Dapeng Mi Cc: Like Xu Reviewed-by: Mingwei Zhang Link: https://lore.kernel.org/r/20231103230541.352265-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 17 ++++++++++++++- arch/x86/kvm/pmu.c | 48 ++++++++++++++++++++++++++++++----------- arch/x86/kvm/pmu.h | 3 ++- 3 files changed, 53 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a565a2e70f30..562eaa938847 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -500,8 +500,23 @@ struct kvm_pmc { u8 idx; bool is_paused; bool intr; + /* + * Base value of the PMC counter, relative to the *consumed* count in + * the associated perf_event. This value includes counter updates from + * the perf_event and emulated_count since the last time the counter + * was reprogrammed, but it is *not* the current value as seen by the + * guest or userspace. + * + * The count is relative to the associated perf_event so that KVM + * doesn't need to reprogram the perf_event every time the guest writes + * to the counter. + */ u64 counter; - u64 prev_counter; + /* + * PMC events triggered by KVM emulation that haven't been fully + * processed, i.e. haven't undergone overflow detection. + */ + u64 emulated_counter; u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3725d001239d..87cc6c8809ad 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -127,9 +127,9 @@ static void kvm_perf_overflow(struct perf_event *perf_event, struct kvm_pmc *pmc = perf_event->overflow_handler_context; /* - * Ignore overflow events for counters that are scheduled to be - * reprogrammed, e.g. if a PMI for the previous event races with KVM's - * handling of a related guest WRMSR. + * Ignore asynchronous overflow events for counters that are scheduled + * to be reprogrammed, e.g. if a PMI for the previous event races with + * KVM's handling of a related guest WRMSR. */ if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi)) return; @@ -224,17 +224,30 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, return 0; } -static void pmc_pause_counter(struct kvm_pmc *pmc) +static bool pmc_pause_counter(struct kvm_pmc *pmc) { u64 counter = pmc->counter; - - if (!pmc->perf_event || pmc->is_paused) - return; + u64 prev_counter; /* update counter, reset event value to avoid redundant accumulation */ - counter += perf_event_pause(pmc->perf_event, true); + if (pmc->perf_event && !pmc->is_paused) + counter += perf_event_pause(pmc->perf_event, true); + + /* + * Snapshot the previous counter *after* accumulating state from perf. + * If overflow already happened, hardware (via perf) is responsible for + * generating a PMI. KVM just needs to detect overflow on emulated + * counter events that haven't yet been processed. + */ + prev_counter = counter & pmc_bitmask(pmc); + + counter += pmc->emulated_counter; pmc->counter = counter & pmc_bitmask(pmc); + + pmc->emulated_counter = 0; pmc->is_paused = true; + + return pmc->counter < prev_counter; } static bool pmc_resume_counter(struct kvm_pmc *pmc) @@ -289,6 +302,15 @@ static void pmc_update_sample_period(struct kvm_pmc *pmc) void pmc_write_counter(struct kvm_pmc *pmc, u64 val) { + /* + * Drop any unconsumed accumulated counts, the WRMSR is a write, not a + * read-modify-write. 
Adjust the counter value so that its value is + * relative to the current count, as reading the current count from + * perf is faster than pausing and repgrogramming the event in order to + * reset it to '0'. Note, this very sneakily offsets the accumulated + * emulated count too, by using pmc_read_counter()! + */ + pmc->emulated_counter = 0; pmc->counter += val - pmc_read_counter(pmc); pmc->counter &= pmc_bitmask(pmc); pmc_update_sample_period(pmc); @@ -428,14 +450,15 @@ static void reprogram_counter(struct kvm_pmc *pmc) struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 eventsel = pmc->eventsel; u64 new_config = eventsel; + bool emulate_overflow; u8 fixed_ctr_ctrl; - pmc_pause_counter(pmc); + emulate_overflow = pmc_pause_counter(pmc); if (!pmc_event_is_allowed(pmc)) goto reprogram_complete; - if (pmc->counter < pmc->prev_counter) + if (emulate_overflow) __kvm_perf_overflow(pmc, false); if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) @@ -475,7 +498,6 @@ static void reprogram_counter(struct kvm_pmc *pmc) reprogram_complete: clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); - pmc->prev_counter = 0; } void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) @@ -701,6 +723,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu) pmc_stop_counter(pmc); pmc->counter = 0; + pmc->emulated_counter = 0; if (pmc_is_gp(pmc)) pmc->eventsel = 0; @@ -772,8 +795,7 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - pmc->prev_counter = pmc->counter; - pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); + pmc->emulated_counter++; kvm_pmu_request_counter_reprogram(pmc); } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index cae85e550f60..7caeb3d8d4fd 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -66,7 +66,8 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) { u64 counter, enabled, running; - counter = pmc->counter; + counter = pmc->counter + pmc->emulated_counter; + if (pmc->perf_event && !pmc->is_paused) counter += perf_event_read_value(pmc->perf_event, &enabled, &running); -- cgit v1.2.3 From 63912245c19d3a4179da44beefd017eb9270f207 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 15 Mar 2023 18:16:06 +0800 Subject: KVM: move KVM_CAP_DEVICE_CTRL to the generic check KVM_CAP_DEVICE_CTRL allows userspace to check if the kvm_device framework (e.g. KVM_CREATE_DEVICE) is supported by KVM. Move KVM_CAP_DEVICE_CTRL to the generic check for the two reasons: 1) it already supports arch agnostic usages (i.e. KVM_DEV_TYPE_VFIO). For example, userspace VFIO implementation may needs to create KVM_DEV_TYPE_VFIO on x86, riscv, or arm etc. It is simpler to have it checked at the generic code than at each arch's code. 2) KVM_CREATE_DEVICE has been added to the generic code. 
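For reference, the userspace-visible effect is simply that the same probe now works on every architecture; a minimal sketch against the real KVM uAPI (error handling elided, requires access to /dev/kvm):

    /* devctl.c -- build with: cc -o devctl devctl.c */
    #include <fcntl.h>
    #include <linux/kvm.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            int vm  = ioctl(kvm, KVM_CREATE_VM, 0);

            /* Generic cap: the same check works on x86, arm64, riscv, s390,
             * ppc, ... */
            if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) > 0) {
                    struct kvm_create_device cd = { .type = KVM_DEV_TYPE_VFIO };

                    if (!ioctl(vm, KVM_CREATE_DEVICE, &cd))
                            printf("KVM VFIO device fd: %d\n", cd.fd);
            }
            return 0;
    }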
Link: https://lore.kernel.org/all/20221215115207.14784-1-wei.w.wang@intel.com Signed-off-by: Wei Wang Reviewed-by: Sean Christopherson Acked-by: Anup Patel (riscv) Reviewed-by: Oliver Upton Acked-by: Michael Ellerman (powerpc) Link: https://lore.kernel.org/r/20230315101606.10636-1-wei.w.wang@intel.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 1 - arch/powerpc/kvm/powerpc.c | 1 - arch/riscv/kvm/vm.c | 1 - arch/s390/kvm/kvm-s390.c | 1 - virt/kvm/kvm_main.c | 1 + 5 files changed, 1 insertion(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e5f75f1f1085..9725783745b4 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -221,7 +221,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = vgic_present; break; case KVM_CAP_IOEVENTFD: - case KVM_CAP_DEVICE_CTRL: case KVM_CAP_USER_MEMORY: case KVM_CAP_SYNC_MMU: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index c39945a7fce3..13d8309c7ba9 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -528,7 +528,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP: case KVM_CAP_ONE_REG: case KVM_CAP_IOEVENTFD: - case KVM_CAP_DEVICE_CTRL: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_SET_GUEST_DEBUG: r = 1; diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 7e2b50c692c1..ce58bc48e5b8 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -179,7 +179,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = kvm_riscv_aia_available(); break; case KVM_CAP_IOEVENTFD: - case KVM_CAP_DEVICE_CTRL: case KVM_CAP_USER_MEMORY: case KVM_CAP_SYNC_MMU: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7aa0e668488f..39463d0e4a1c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -563,7 +563,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP: case KVM_CAP_S390_CSS_SUPPORT: case KVM_CAP_IOEVENTFD: - case KVM_CAP_DEVICE_CTRL: case KVM_CAP_S390_IRQCHIP: case KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8758cb799e18..b744eddecbc0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4873,6 +4873,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #endif case KVM_CAP_BINARY_STATS_FD: case KVM_CAP_SYSTEM_EVENT_DATA: + case KVM_CAP_DEVICE_CTRL: return 1; #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES case KVM_CAP_MEMORY_ATTRIBUTES: -- cgit v1.2.3 From 573cc0e5cf142d9992d2de3502800890fc717bc0 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Thu, 2 Nov 2023 19:15:24 +0100 Subject: KVM: x86: Harden copying of userspace-array against overflow cpuid.c utilizes vmemdup_user() and array_size() to copy two userspace arrays. This, currently, does not check for an overflow. Use the new wrapper vmemdup_array_user() to copy the arrays more safely, as vmemdup_user() doesn't check for overflow. Note, KVM explicitly checks the number of entries before duplicating the array, i.e. adding the overflow check should be a glorified nop. 
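For reference, a userspace analogue of the hazard such wrappers guard against (illustrative only, not the kernel implementation; uses the GCC/Clang overflow builtin): an unchecked "n * size" can wrap, yielding a short allocation that is then over-copied.

    #include <stdlib.h>
    #include <string.h>

    /*
     * Duplicate an n-element array of size-byte elements, refusing to
     * proceed if the byte count would overflow size_t (roughly the check
     * the array-aware wrapper performs internally).
     */
    static void *dup_array(const void *src, size_t n, size_t size)
    {
            size_t bytes;
            void *dst;

            if (__builtin_mul_overflow(n, size, &bytes))
                    return NULL;

            dst = malloc(bytes);
            return dst ? memcpy(dst, src, bytes) : NULL;
    }

    int main(void)
    {
            int src[4] = { 1, 2, 3, 4 };
            int *copy = dup_array(src, 4, sizeof(*src));

            free(copy);
            return 0;
    }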
Suggested-by: Dave Airlie Signed-off-by: Philipp Stanner Link: https://lore.kernel.org/r/20231102181526.43279-2-pstanner@redhat.com [sean: call out that KVM pre-checks the number of entries] Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index dda6fc4cfae8..ad441f0d2917 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -469,7 +469,7 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, return -E2BIG; if (cpuid->nent) { - e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent)); + e = vmemdup_array_user(entries, cpuid->nent, sizeof(*e)); if (IS_ERR(e)) return PTR_ERR(e); @@ -513,7 +513,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, return -E2BIG; if (cpuid->nent) { - e2 = vmemdup_user(entries, array_size(sizeof(*e2), cpuid->nent)); + e2 = vmemdup_array_user(entries, cpuid->nent, sizeof(*e2)); if (IS_ERR(e2)) return PTR_ERR(e2); } -- cgit v1.2.3 From 1aa4bb916808503bf6fedd00f50f2077f91cebaa Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 27 Oct 2023 10:26:38 -0700 Subject: KVM: x86/mmu: Fix off-by-1 when splitting huge pages during CLEAR Fix an off-by-1 error when passing in the range of pages to kvm_mmu_try_split_huge_pages() during CLEAR_DIRTY_LOG. Specifically, end is the last page that needs to be split (inclusive) so pass in `end + 1` since kvm_mmu_try_split_huge_pages() expects the `end` to be non-inclusive. At worst this will cause a huge page to be write-protected instead of eagerly split, which is purely a performance issue, not a correctness issue. But even that is unlikely as it would require userspace pass in a bitmap where the last page is the only 4K page on a huge page that needs to be split. Reported-by: Vipin Sharma Fixes: cb00a70bd4b7 ("KVM: x86/mmu: Split huge pages mapped by the TDP MMU during KVM_CLEAR_DIRTY_LOG") Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20231027172640.2335197-2-dmatlack@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 59b026b6ad2a..29ac130fcb84 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1392,7 +1392,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, gfn_t end = slot->base_gfn + gfn_offset + __fls(mask); if (READ_ONCE(eager_page_split)) - kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K); + kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K); kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); -- cgit v1.2.3 From 45a61ebb221117748d3567a86908618f431ac824 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 27 Oct 2023 10:26:39 -0700 Subject: KVM: x86/mmu: Check for leaf SPTE when clearing dirty bit in the TDP MMU Re-check that the given SPTE is still a leaf and present SPTE after a failed cmpxchg in clear_dirty_gfn_range(). clear_dirty_gfn_range() intends to only operate on present leaf SPTEs, but that could change after a failed cmpxchg. A check for present was added in commit 3354ef5a592d ("KVM: x86/mmu: Check for present SPTE when clearing dirty bit in TDP MMU") but the check for leaf is still buried in tdp_root_for_each_leaf_pte() and does not get rechecked on retry. 
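The general pattern, as a standalone sketch using C11 atomics rather than KVM's SPTE code (bit names invented): any precondition derived from the old value must be re-evaluated after a failed compare-exchange, because the old value has changed.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define PRESENT (1ull << 0)
    #define LEAF    (1ull << 1)
    #define DIRTY   (1ull << 2)

    /*
     * Clear DIRTY only while the entry is present *and* a leaf, rechecking
     * both on every retry since a failed CAS means the entry changed.
     */
    static bool clear_dirty(_Atomic uint64_t *pte)
    {
            uint64_t old = atomic_load(pte);

            do {
                    if (!(old & PRESENT) || !(old & LEAF))
                            return false;
            } while (!atomic_compare_exchange_weak(pte, &old, old & ~DIRTY));

            return true;
    }

    int main(void)
    {
            _Atomic uint64_t pte = PRESENT | LEAF | DIRTY;

            return clear_dirty(&pte) ? 0 : 1;
    }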
Fixes: a6a0b05da9f3 ("kvm: x86/mmu: Support dirty logging for the TDP MMU") Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20231027172640.2335197-3-dmatlack@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6cd4dd631a2f..038983b13574 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1522,12 +1522,13 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - tdp_root_for_each_leaf_pte(iter, root, start, end) { + tdp_root_for_each_pte(iter, root, start, end) { retry: - if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) + if (!is_shadow_present_pte(iter.old_spte) || + !is_last_spte(iter.old_spte, iter.level)) continue; - if (!is_shadow_present_pte(iter.old_spte)) + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; KVM_MMU_WARN_ON(kvm_ad_enabled() && -- cgit v1.2.3 From 5f3c8c9187b6fa8675951f9fad5b99b11fed21f6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 25 Nov 2023 03:33:57 -0500 Subject: KVM: x86/mmu: remove unnecessary "bool shared" argument from functions Neither tdp_mmu_next_root nor kvm_tdp_mmu_put_root need to know if the lock is taken for read or write. Either way, protection is achieved via RCU and tdp_mmu_pages_lock. Remove the argument and just assert that the lock is taken. Reviewed-by: Maxim Levitsky Signed-off-by: Paolo Bonzini Link: https://lore.kernel.org/r/20231125083400.1399197-2-pbonzini@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 34 +++++++++++++++++++++------------- arch/x86/kvm/mmu/tdp_mmu.h | 3 +-- 3 files changed, 23 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 29ac130fcb84..ace9f7c13132 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3580,7 +3580,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, return; if (is_tdp_mmu_page(sp)) - kvm_tdp_mmu_put_root(kvm, sp, false); + kvm_tdp_mmu_put_root(kvm, sp); else if (!--sp->root_count && sp->role.invalid) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 038983b13574..8cd805fa1516 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -73,10 +73,13 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) tdp_mmu_free_sp(sp); } -void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, - bool shared) +void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) { - kvm_lockdep_assert_mmu_lock_held(kvm, shared); + /* + * Either read or write is okay, but mmu_lock must be held because + * writers are not required to take tdp_mmu_pages_lock. + */ + lockdep_assert_held(&kvm->mmu_lock); if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) return; @@ -106,10 +109,16 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, */ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, struct kvm_mmu_page *prev_root, - bool shared, bool only_valid) + bool only_valid) { struct kvm_mmu_page *next_root; + /* + * While the roots themselves are RCU-protected, fields such as + * role.invalid are protected by mmu_lock. 
+ */ + lockdep_assert_held(&kvm->mmu_lock); + rcu_read_lock(); if (prev_root) @@ -132,7 +141,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, rcu_read_unlock(); if (prev_root) - kvm_tdp_mmu_put_root(kvm, prev_root, shared); + kvm_tdp_mmu_put_root(kvm, prev_root); return next_root; } @@ -144,13 +153,12 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * recent root. (Unless keeping a live reference is desirable.) * * If shared is set, this function is operating under the MMU lock in read - * mode. In the unlikely event that this thread must free a root, the lock - * will be temporarily dropped and reacquired in write mode. + * mode. */ #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\ - for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \ + for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \ _root; \ - _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \ + _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \ if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \ kvm_mmu_page_as_id(_root) != _as_id) { \ } else @@ -159,9 +167,9 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \ - for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false); \ + for (_root = tdp_mmu_next_root(_kvm, NULL, false); \ _root; \ - _root = tdp_mmu_next_root(_kvm, _root, _shared, false)) \ + _root = tdp_mmu_next_root(_kvm, _root, false)) \ if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \ } else @@ -891,7 +899,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) * the root must be reachable by mmu_notifiers while it's being * zapped */ - kvm_tdp_mmu_put_root(kvm, root, true); + kvm_tdp_mmu_put_root(kvm, root); } read_unlock(&kvm->mmu_lock); @@ -1500,7 +1508,7 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) { r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared); if (r) { - kvm_tdp_mmu_put_root(kvm, root, shared); + kvm_tdp_mmu_put_root(kvm, root); break; } } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 733a3aef3a96..20d97aa46c49 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -17,8 +17,7 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) return refcount_inc_not_zero(&root->tdp_mmu_root_count); } -void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, - bool shared); +void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root); bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush); bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp); -- cgit v1.2.3 From 484dd27c0602e01cb49db362ad42b95e70912d43 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 25 Nov 2023 03:33:58 -0500 Subject: KVM: x86/mmu: remove unnecessary "bool shared" argument from iterators The "bool shared" argument is more or less unnecessary in the for_each_*_tdp_mmu_root_yield_safe() macros. Many users check for the lock before calling it; all of them either call small functions that do the check, or end up calling tdp_mmu_set_spte_atomic() and tdp_mmu_iter_set_spte(). 
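For reference, the mechanism used by the reworked iterators below (a GNU C statement expression plus the comma operator in the loop condition), shown as a standalone toy where "lock_held" stands in for lockdep state:

    #include <assert.h>
    #include <stdio.h>

    struct node { int val; struct node *next; };

    static int lock_held = 1;       /* stand-in for lockdep state */

    /*
     * The assertion runs on every iteration, including the first and the
     * one that terminates the loop.
     */
    #define for_each_node(n, head) \
            for ((n) = (head); ({ assert(lock_held); }), (n); (n) = (n)->next)

    int main(void)
    {
            struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
            struct node *n;

            for_each_node(n, &a)
                    printf("%d\n", n->val);
            return 0;
    }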
Add a few assertions to make up for the lost check in for_each_*_tdp_mmu_root_yield_safe(), but even this is probably overkill and mostly for documentation reasons. Signed-off-by: Paolo Bonzini Link: https://lore.kernel.org/r/20231125083400.1399197-3-pbonzini@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 48 ++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 8cd805fa1516..c6dc09acea12 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -155,23 +155,20 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * If shared is set, this function is operating under the MMU lock in read * mode. */ -#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\ - for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \ - _root; \ - _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \ - if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \ - kvm_mmu_page_as_id(_root) != _as_id) { \ +#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\ + for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \ + ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ + _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \ + if (kvm_mmu_page_as_id(_root) != _as_id) { \ } else -#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ - __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) +#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ + __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true) -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \ - for (_root = tdp_mmu_next_root(_kvm, NULL, false); \ - _root; \ - _root = tdp_mmu_next_root(_kvm, _root, false)) \ - if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \ - } else +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ + for (_root = tdp_mmu_next_root(_kvm, NULL, false); \ + ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ + _root = tdp_mmu_next_root(_kvm, _root, false)) /* * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write, @@ -840,7 +837,8 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush) { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root, false) + lockdep_assert_held_write(&kvm->mmu_lock); + for_each_tdp_mmu_root_yield_safe(kvm, root) flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush); return flush; @@ -862,7 +860,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) * is being destroyed or the userspace VMM has exited. In both cases, * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. 
*/ - for_each_tdp_mmu_root_yield_safe(kvm, root, false) + lockdep_assert_held_write(&kvm->mmu_lock); + for_each_tdp_mmu_root_yield_safe(kvm, root) tdp_mmu_zap_root(kvm, root, false); } @@ -876,7 +875,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) read_lock(&kvm->mmu_lock); - for_each_tdp_mmu_root_yield_safe(kvm, root, true) { + for_each_tdp_mmu_root_yield_safe(kvm, root) { if (!root->tdp_mmu_scheduled_root_to_zap) continue; @@ -1133,7 +1132,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, { struct kvm_mmu_page *root; - __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false) + __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false) flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end, range->may_block, flush); @@ -1322,7 +1321,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, lockdep_assert_held_read(&kvm->mmu_lock); - for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages, min_level); @@ -1354,6 +1353,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, { struct kvm_mmu_page *sp; + kvm_lockdep_assert_mmu_lock_held(kvm, shared); + /* * Since we are allocating while under the MMU lock we have to be * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct @@ -1504,8 +1505,7 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, int r = 0; kvm_lockdep_assert_mmu_lock_held(kvm, shared); - - for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) { + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) { r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared); if (r) { kvm_tdp_mmu_put_root(kvm, root); @@ -1569,8 +1569,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, bool spte_set = false; lockdep_assert_held_read(&kvm->mmu_lock); - - for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); @@ -1704,8 +1703,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, struct kvm_mmu_page *root; lockdep_assert_held_read(&kvm->mmu_lock); - - for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) zap_collapsible_spte_range(kvm, root, slot); } -- cgit v1.2.3 From 250ce1b4d21a94f910c3df5141ff6434ea92524e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 25 Nov 2023 03:33:59 -0500 Subject: KVM: x86/mmu: always take tdp_mmu_pages_lock It is cheap to take tdp_mmu_pages_lock in all write-side critical sections. We already do it all the time when zapping with read_lock(), so it is not a problem to do it from the kvm_tdp_mmu_zap_all() path (aka kvm_arch_flush_shadow_all(), aka VM destruction and MMU notifier release). 
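In other words, the write-side paths stop branching on whether the caller holds mmu_lock exclusively and simply take the inner spinlock every time. A minimal userspace sketch of that shape, with pthread locks standing in for the kernel's rwlock_t and spinlock_t and every name invented for the example:

#include <pthread.h>
#include <stdio.h>

/* Stand-ins: mmu_lock is the outer rwlock, pages_lock the inner spinlock. */
static pthread_rwlock_t mmu_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t pages_lock = PTHREAD_MUTEX_INITIALIZER;
static int nx_huge_page_disallowed = 1;

/*
 * Single code path: the inner lock is always taken, even when the caller
 * holds the outer lock exclusively and the extra locking is redundant.
 */
static void unlink_page(void)
{
	pthread_mutex_lock(&pages_lock);
	nx_huge_page_disallowed = 0;
	pthread_mutex_unlock(&pages_lock);
}

int main(void)
{
	pthread_rwlock_wrlock(&mmu_lock);	/* write-side caller */
	unlink_page();				/* no "shared" flag needed */
	pthread_rwlock_unlock(&mmu_lock);

	pthread_rwlock_rdlock(&mmu_lock);	/* read-side caller */
	unlink_page();				/* same path */
	pthread_rwlock_unlock(&mmu_lock);

	printf("nx_huge_page_disallowed = %d\n", nx_huge_page_disallowed);
	return 0;
}
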
Signed-off-by: Paolo Bonzini Link: https://lore.kernel.org/r/20231125083400.1399197-4-pbonzini@redhat.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/locking.rst | 7 +++---- arch/x86/include/asm/kvm_host.h | 11 ++++++----- arch/x86/kvm/mmu/tdp_mmu.c | 24 ++++-------------------- 3 files changed, 13 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 3a034db5e55f..02880d5552d5 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -43,10 +43,9 @@ On x86: - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock and kvm->arch.xen.xen_lock -- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and - kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and - cannot be taken without already holding kvm->arch.mmu_lock (typically with - ``read_lock`` for the TDP MMU, thus the need for additional spinlocks). +- kvm->arch.mmu_lock is an rwlock; critical sections for + kvm->arch.tdp_mmu_pages_lock and kvm->arch.mmu_unsync_pages_lock must + also take kvm->arch.mmu_lock Everything else is a leaf: no other lock is taken inside the critical sections. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a565a2e70f30..414691169a04 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1407,9 +1407,8 @@ struct kvm_arch { * the MMU lock in read mode + RCU or * the MMU lock in write mode * - * For writes, this list is protected by: - * the MMU lock in read mode + the tdp_mmu_pages_lock or - * the MMU lock in write mode + * For writes, this list is protected by tdp_mmu_pages_lock; see + * below for the details. * * Roots will remain in the list until their tdp_mmu_root_count * drops to zero, at which point the thread that decremented the @@ -1426,8 +1425,10 @@ struct kvm_arch { * - possible_nx_huge_pages; * - the possible_nx_huge_page_link field of kvm_mmu_page structs used * by the TDP MMU - * It is acceptable, but not necessary, to acquire this lock when - * the thread holds the MMU lock in write mode. + * Because the lock is only taken within the MMU lock, strictly + * speaking it is redundant to acquire this lock when the thread + * holds the MMU lock in write mode. However it often simplifies + * the code to do so. */ spinlock_t tdp_mmu_pages_lock; #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c6dc09acea12..6ae19b4ee5b1 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -75,12 +75,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) { - /* - * Either read or write is okay, but mmu_lock must be held because - * writers are not required to take tdp_mmu_pages_lock. - */ - lockdep_assert_held(&kvm->mmu_lock); - if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) return; @@ -281,28 +275,18 @@ static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) * * @kvm: kvm instance * @sp: the page to be removed - * @shared: This operation may not be running under the exclusive use of - * the MMU lock and the operation must synchronize with other - * threads that might be adding or removing pages. 
*/ -static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, - bool shared) +static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { tdp_unaccount_mmu_page(kvm, sp); if (!sp->nx_huge_page_disallowed) return; - if (shared) - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - else - lockdep_assert_held_write(&kvm->mmu_lock); - + spin_lock(&kvm->arch.tdp_mmu_pages_lock); sp->nx_huge_page_disallowed = false; untrack_possible_nx_huge_page(kvm, sp); - - if (shared) - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } /** @@ -331,7 +315,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) trace_kvm_mmu_prepare_zap_page(sp); - tdp_mmu_unlink_sp(kvm, sp, shared); + tdp_mmu_unlink_sp(kvm, sp); for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { tdp_ptep_t sptep = pt + i; -- cgit v1.2.3 From e59f75de4e501e87de7743fec29dd247a6ae6cd3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 25 Nov 2023 03:34:00 -0500 Subject: KVM: x86/mmu: fix comment about mmu_unsync_pages_lock Fix the comment about what can and cannot happen when mmu_unsync_pages_lock is not help. The comment correctly mentions "clearing sp->unsync", but then it talks about unsync going from 0 to 1. Signed-off-by: Paolo Bonzini Link: https://lore.kernel.org/r/20231125083400.1399197-5-pbonzini@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ace9f7c13132..3fec3f6cdd53 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2850,9 +2850,9 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, /* * Recheck after taking the spinlock, a different vCPU * may have since marked the page unsync. A false - * positive on the unprotected check above is not + * negative on the unprotected check above is not * possible as clearing sp->unsync _must_ hold mmu_lock - * for write, i.e. unsync cannot transition from 0->1 + * for write, i.e. unsync cannot transition from 1->0 * while this CPU holds mmu_lock for read (or write). */ if (READ_ONCE(sp->unsync)) -- cgit v1.2.3 From 8c4976772d9b5858b8b456e84783e089c6cfa66e Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Thu, 2 Nov 2023 19:15:25 +0100 Subject: KVM: s390: Harden copying of userspace-array against overflow guestdbg.c utilizes memdup_user() to copy a userspace array. This, currently, does not check for an overflow. Use the new wrapper memdup_array_user() to copy the array more safely. Note, KVM explicitly checks the number of entries before duplicating the array, i.e. adding the overflow check should be a glorified nop. 
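For reference, the difference between the two helpers is the multiplication-overflow check: memdup_array_user() refuses a count/size pair whose product would wrap rather than allocating a short buffer and copying past it. A rough userspace sketch of that check, with malloc()/memcpy() standing in for the user-copy primitives and the function name invented for the example:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Sketch only: reject n * size when the multiplication would wrap, so a
 * huge element count cannot produce a short allocation followed by an
 * out-of-bounds copy.
 */
static void *memdup_array(const void *src, size_t n, size_t size)
{
	void *dst;

	if (size && n > SIZE_MAX / size)	/* n * size would overflow */
		return NULL;

	dst = malloc(n * size);
	if (dst)
		memcpy(dst, src, n * size);
	return dst;
}

int main(void)
{
	int bps[] = { 1, 2, 3, 4 };
	int *copy = memdup_array(bps, 4, sizeof(*bps));

	printf("last copied entry: %d\n", copy ? copy[3] : -1);
	printf("overflowing request: %p\n",
	       memdup_array(bps, SIZE_MAX, sizeof(*bps)));
	free(copy);
	return 0;
}
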
Suggested-by: Dave Airlie Signed-off-by: Philipp Stanner Acked-by: Claudio Imbrenda Acked-by: Christian Borntraeger Link: https://lore.kernel.org/r/20231102181526.43279-3-pstanner@redhat.com [sean: call out that KVM pre-checks the number of entries] Signed-off-by: Sean Christopherson --- arch/s390/kvm/guestdbg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 3765c4223bf9..80879fc73c90 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -213,8 +213,8 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT) return -EINVAL; - bp_data = memdup_user(dbg->arch.hw_bp, - sizeof(*bp_data) * dbg->arch.nr_hw_bp); + bp_data = memdup_array_user(dbg->arch.hw_bp, dbg->arch.nr_hw_bp, + sizeof(*bp_data)); if (IS_ERR(bp_data)) return PTR_ERR(bp_data); -- cgit v1.2.3 From 87562052c965ba7de6dc490434e53691fe46c898 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:15 +0100 Subject: KVM: x86/xen: Remove unneeded xen context from kvm_arch when !CONFIG_KVM_XEN Saving a few bytes of memory per KVM VM is certainly great but what's more important is the ability to see where the code accesses Xen emulation context while CONFIG_KVM_XEN is not enabled. Currently, kvm_cpu_get_extint() is the only such place and it is harmless: kvm_xen_has_interrupt() always returns '0' when !CONFIG_KVM_XEN. No functional change intended. Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20231205103630.1391318-2-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 5 +++++ arch/x86/kvm/irq.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5e5e9e0abd4a..b0ca65632f2c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1136,6 +1136,7 @@ struct msr_bitmap_range { unsigned long *bitmap; }; +#ifdef CONFIG_KVM_XEN /* Xen emulation context */ struct kvm_xen { struct mutex xen_lock; @@ -1147,6 +1148,7 @@ struct kvm_xen { struct idr evtchn_ports; unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; }; +#endif enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, @@ -1349,7 +1351,10 @@ struct kvm_arch { struct hlist_head mask_notifier_list; struct kvm_hv hyperv; + +#ifdef CONFIG_KVM_XEN struct kvm_xen xen; +#endif bool backwards_tsc_observed; bool boot_vcpu_runs_old_kvmclock; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b2c397dd2bc6..ad9ca8a60144 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -118,8 +118,10 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v) if (!lapic_in_kernel(v)) return v->arch.interrupt.nr; +#ifdef CONFIG_KVM_XEN if (kvm_xen_has_interrupt(v)) return v->kvm->arch.xen.upcall_vector; +#endif if (irqchip_split(v->kvm)) { int vector = v->arch.pending_external_vector; -- cgit v1.2.3 From cfef5af3cb0e57501dcac2816ab11a20c074866d Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:16 +0100 Subject: KVM: x86: Move Hyper-V partition assist page out of Hyper-V emulation context Hyper-V partition assist page is used when KVM runs on top of Hyper-V and is not used for Windows/Hyper-V guests on KVM, this means that 'hv_pa_pg' placement in 'struct kvm_hv' is unfortunate. 
As a preparation to making Hyper-V emulation optional, move 'hv_pa_pg' to 'struct kvm_arch' and put it under CONFIG_HYPERV. While on it, introduce hv_get_partition_assist_page() helper to allocate partition assist page. Move the comment explaining why we use a single page for all vCPUs from VMX and expand it a bit. No functional change intended. Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20231205103630.1391318-3-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/kvm_onhyperv.h | 20 ++++++++++++++++++++ arch/x86/kvm/svm/svm_onhyperv.c | 10 +++------- arch/x86/kvm/vmx/vmx.c | 14 +++----------- arch/x86/kvm/x86.c | 4 +++- 5 files changed, 30 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b0ca65632f2c..86069b985d22 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1125,7 +1125,6 @@ struct kvm_hv { */ unsigned int synic_auto_eoi_used; - struct hv_partition_assist_pg *hv_pa_pg; struct kvm_hv_syndbg hv_syndbg; }; @@ -1447,6 +1446,7 @@ struct kvm_arch { #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; + struct hv_partition_assist_pg *hv_pa_pg; #endif /* * VM-scope maximum vCPU ID. Used to determine the size of structures diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h index f9ca3e7432b2..eefab3dc8498 100644 --- a/arch/x86/kvm/kvm_onhyperv.h +++ b/arch/x86/kvm/kvm_onhyperv.h @@ -10,6 +10,26 @@ int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages); int hv_flush_remote_tlbs(struct kvm *kvm); void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp); +static inline hpa_t hv_get_partition_assist_page(struct kvm_vcpu *vcpu) +{ + /* + * Partition assist page is something which Hyper-V running in L0 + * requires from KVM running in L1 before direct TLB flush for L2 + * guests can be enabled. KVM doesn't currently use the page but to + * comply with TLFS it still needs to be allocated. For now, this + * is a single page shared among all vCPUs. 
+ */ + struct hv_partition_assist_pg **p_hv_pa_pg = + &vcpu->kvm->arch.hv_pa_pg; + + if (!*p_hv_pa_pg) + *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); + + if (!*p_hv_pa_pg) + return INVALID_PAGE; + + return __pa(*p_hv_pa_pg); +} #else /* !CONFIG_HYPERV */ static inline int hv_flush_remote_tlbs(struct kvm *kvm) { diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 7af8422d3382..3971b3ea5d04 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -18,18 +18,14 @@ int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) { struct hv_vmcb_enlightenments *hve; - struct hv_partition_assist_pg **p_hv_pa_pg = - &to_kvm_hv(vcpu->kvm)->hv_pa_pg; + hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); - if (!*p_hv_pa_pg) - *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL); - - if (!*p_hv_pa_pg) + if (partition_assist_page == INVALID_PAGE) return -ENOMEM; hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments; - hve->partition_assist_page = __pa(*p_hv_pa_pg); + hve->partition_assist_page = partition_assist_page; hve->hv_vm_id = (unsigned long)vcpu->kvm; if (!hve->hv_enlightenments_control.nested_flush_hypercall) { hve->hv_enlightenments_control.nested_flush_hypercall = 1; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 40e3780d73ae..cf19a3346639 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -523,22 +523,14 @@ module_param(enlightened_vmcs, bool, 0444); static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) { struct hv_enlightened_vmcs *evmcs; - struct hv_partition_assist_pg **p_hv_pa_pg = - &to_kvm_hv(vcpu->kvm)->hv_pa_pg; - /* - * Synthetic VM-Exit is not enabled in current code and so All - * evmcs in singe VM shares same assist page. - */ - if (!*p_hv_pa_pg) - *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); + hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); - if (!*p_hv_pa_pg) + if (partition_assist_page == INVALID_PAGE) return -ENOMEM; evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; - evmcs->partition_assist_page = - __pa(*p_hv_pa_pg); + evmcs->partition_assist_page = partition_assist_page; evmcs->hv_vm_id = (unsigned long)vcpu->kvm; evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d0772b47041..81224b9676d9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12438,7 +12438,9 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_free_vm(struct kvm *kvm) { - kfree(to_kvm_hv(kvm)->hv_pa_pg); +#if IS_ENABLED(CONFIG_HYPERV) + kfree(kvm->arch.hv_pa_pg); +#endif __kvm_arch_free_vm(kvm); } -- cgit v1.2.3 From 50a82b0eb88c108d1ebc73a97f5b81df0d5918e0 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:17 +0100 Subject: KVM: VMX: Split off vmx_onhyperv.{ch} from hyperv.{ch} hyperv.{ch} is currently a mix of stuff which is needed by both Hyper-V on KVM and KVM on Hyper-V. As a preparation to making Hyper-V emulation optional, put KVM-on-Hyper-V specific code into dedicated files. No functional change intended. 
Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20231205103630.1391318-4-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/Makefile | 4 + arch/x86/kvm/vmx/hyperv.c | 139 ------------------------- arch/x86/kvm/vmx/hyperv.h | 217 +++++++++++++++++++--------------------- arch/x86/kvm/vmx/vmx.c | 1 + arch/x86/kvm/vmx/vmx_onhyperv.c | 36 +++++++ arch/x86/kvm/vmx/vmx_onhyperv.h | 124 +++++++++++++++++++++++ arch/x86/kvm/vmx/vmx_ops.h | 2 +- 7 files changed, 271 insertions(+), 252 deletions(-) create mode 100644 arch/x86/kvm/vmx/vmx_onhyperv.c create mode 100644 arch/x86/kvm/vmx/vmx_onhyperv.h (limited to 'arch') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 80e3fe184d17..a99ffc3f3a3f 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -26,6 +26,10 @@ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/hyperv.o vmx/nested.o vmx/posted_intr.o kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o +ifdef CONFIG_HYPERV +kvm-intel-y += vmx/vmx_onhyperv.o +endif + kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \ svm/sev.o svm/hyperv.o diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index 313b8bb5b8a7..de13dc14fe1d 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -13,111 +13,6 @@ #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK -/* - * Enlightened VMCSv1 doesn't support these: - * - * POSTED_INTR_NV = 0x00000002, - * GUEST_INTR_STATUS = 0x00000810, - * APIC_ACCESS_ADDR = 0x00002014, - * POSTED_INTR_DESC_ADDR = 0x00002016, - * EOI_EXIT_BITMAP0 = 0x0000201c, - * EOI_EXIT_BITMAP1 = 0x0000201e, - * EOI_EXIT_BITMAP2 = 0x00002020, - * EOI_EXIT_BITMAP3 = 0x00002022, - * GUEST_PML_INDEX = 0x00000812, - * PML_ADDRESS = 0x0000200e, - * VM_FUNCTION_CONTROL = 0x00002018, - * EPTP_LIST_ADDRESS = 0x00002024, - * VMREAD_BITMAP = 0x00002026, - * VMWRITE_BITMAP = 0x00002028, - * - * TSC_MULTIPLIER = 0x00002032, - * PLE_GAP = 0x00004020, - * PLE_WINDOW = 0x00004022, - * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, - * - * Currently unsupported in KVM: - * GUEST_IA32_RTIT_CTL = 0x00002814, - */ -#define EVMCS1_SUPPORTED_PINCTRL \ - (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ - PIN_BASED_EXT_INTR_MASK | \ - PIN_BASED_NMI_EXITING | \ - PIN_BASED_VIRTUAL_NMIS) - -#define EVMCS1_SUPPORTED_EXEC_CTRL \ - (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ - CPU_BASED_HLT_EXITING | \ - CPU_BASED_CR3_LOAD_EXITING | \ - CPU_BASED_CR3_STORE_EXITING | \ - CPU_BASED_UNCOND_IO_EXITING | \ - CPU_BASED_MOV_DR_EXITING | \ - CPU_BASED_USE_TSC_OFFSETTING | \ - CPU_BASED_MWAIT_EXITING | \ - CPU_BASED_MONITOR_EXITING | \ - CPU_BASED_INVLPG_EXITING | \ - CPU_BASED_RDPMC_EXITING | \ - CPU_BASED_INTR_WINDOW_EXITING | \ - CPU_BASED_CR8_LOAD_EXITING | \ - CPU_BASED_CR8_STORE_EXITING | \ - CPU_BASED_RDTSC_EXITING | \ - CPU_BASED_TPR_SHADOW | \ - CPU_BASED_USE_IO_BITMAPS | \ - CPU_BASED_MONITOR_TRAP_FLAG | \ - CPU_BASED_USE_MSR_BITMAPS | \ - CPU_BASED_NMI_WINDOW_EXITING | \ - CPU_BASED_PAUSE_EXITING | \ - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) - -#define EVMCS1_SUPPORTED_2NDEXEC \ - (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ - SECONDARY_EXEC_WBINVD_EXITING | \ - SECONDARY_EXEC_ENABLE_VPID | \ - SECONDARY_EXEC_ENABLE_EPT | \ - SECONDARY_EXEC_UNRESTRICTED_GUEST | \ - SECONDARY_EXEC_DESC | \ - SECONDARY_EXEC_ENABLE_RDTSCP | \ - SECONDARY_EXEC_ENABLE_INVPCID | \ - SECONDARY_EXEC_ENABLE_XSAVES | \ - SECONDARY_EXEC_RDSEED_EXITING | \ - SECONDARY_EXEC_RDRAND_EXITING | 
\ - SECONDARY_EXEC_TSC_SCALING | \ - SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ - SECONDARY_EXEC_PT_USE_GPA | \ - SECONDARY_EXEC_PT_CONCEAL_VMX | \ - SECONDARY_EXEC_BUS_LOCK_DETECTION | \ - SECONDARY_EXEC_NOTIFY_VM_EXITING | \ - SECONDARY_EXEC_ENCLS_EXITING) - -#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) - -#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ - (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ - VM_EXIT_SAVE_DEBUG_CONTROLS | \ - VM_EXIT_ACK_INTR_ON_EXIT | \ - VM_EXIT_HOST_ADDR_SPACE_SIZE | \ - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ - VM_EXIT_SAVE_IA32_PAT | \ - VM_EXIT_LOAD_IA32_PAT | \ - VM_EXIT_SAVE_IA32_EFER | \ - VM_EXIT_LOAD_IA32_EFER | \ - VM_EXIT_CLEAR_BNDCFGS | \ - VM_EXIT_PT_CONCEAL_PIP | \ - VM_EXIT_CLEAR_IA32_RTIT_CTL) - -#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ - (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ - VM_ENTRY_LOAD_DEBUG_CONTROLS | \ - VM_ENTRY_IA32E_MODE | \ - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ - VM_ENTRY_LOAD_IA32_PAT | \ - VM_ENTRY_LOAD_IA32_EFER | \ - VM_ENTRY_LOAD_BNDCFGS | \ - VM_ENTRY_PT_CONCEAL_PIP | \ - VM_ENTRY_LOAD_IA32_RTIT_CTL) - -#define EVMCS1_SUPPORTED_VMFUNC (0) - #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ {EVMCS1_OFFSET(name), clean_field} @@ -608,40 +503,6 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12) return 0; } -#if IS_ENABLED(CONFIG_HYPERV) -DEFINE_STATIC_KEY_FALSE(__kvm_is_using_evmcs); - -/* - * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption - * is: in case a feature has corresponding fields in eVMCS described and it was - * exposed in VMX feature MSRs, KVM is free to use it. Warn if KVM meets a - * feature which has no corresponding eVMCS field, this likely means that KVM - * needs to be updated. 
- */ -#define evmcs_check_vmcs_conf(field, ctrl) \ - do { \ - typeof(vmcs_conf->field) unsupported; \ - \ - unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \ - if (unsupported) { \ - pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\ - (u64)unsupported); \ - vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \ - } \ - } \ - while (0) - -void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) -{ - evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL); - evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL); - evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC); - evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC); - evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL); - evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL); -} -#endif - int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 9623fe1651c4..9401dbfaea7c 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -14,12 +14,113 @@ #include "vmcs.h" #include "vmcs12.h" -struct vmcs_config; - -#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) - #define KVM_EVMCS_VERSION 1 +/* + * Enlightened VMCSv1 doesn't support these: + * + * POSTED_INTR_NV = 0x00000002, + * GUEST_INTR_STATUS = 0x00000810, + * APIC_ACCESS_ADDR = 0x00002014, + * POSTED_INTR_DESC_ADDR = 0x00002016, + * EOI_EXIT_BITMAP0 = 0x0000201c, + * EOI_EXIT_BITMAP1 = 0x0000201e, + * EOI_EXIT_BITMAP2 = 0x00002020, + * EOI_EXIT_BITMAP3 = 0x00002022, + * GUEST_PML_INDEX = 0x00000812, + * PML_ADDRESS = 0x0000200e, + * VM_FUNCTION_CONTROL = 0x00002018, + * EPTP_LIST_ADDRESS = 0x00002024, + * VMREAD_BITMAP = 0x00002026, + * VMWRITE_BITMAP = 0x00002028, + * + * TSC_MULTIPLIER = 0x00002032, + * PLE_GAP = 0x00004020, + * PLE_WINDOW = 0x00004022, + * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + * + * Currently unsupported in KVM: + * GUEST_IA32_RTIT_CTL = 0x00002814, + */ +#define EVMCS1_SUPPORTED_PINCTRL \ + (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + PIN_BASED_EXT_INTR_MASK | \ + PIN_BASED_NMI_EXITING | \ + PIN_BASED_VIRTUAL_NMIS) + +#define EVMCS1_SUPPORTED_EXEC_CTRL \ + (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + CPU_BASED_HLT_EXITING | \ + CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING | \ + CPU_BASED_UNCOND_IO_EXITING | \ + CPU_BASED_MOV_DR_EXITING | \ + CPU_BASED_USE_TSC_OFFSETTING | \ + CPU_BASED_MWAIT_EXITING | \ + CPU_BASED_MONITOR_EXITING | \ + CPU_BASED_INVLPG_EXITING | \ + CPU_BASED_RDPMC_EXITING | \ + CPU_BASED_INTR_WINDOW_EXITING | \ + CPU_BASED_CR8_LOAD_EXITING | \ + CPU_BASED_CR8_STORE_EXITING | \ + CPU_BASED_RDTSC_EXITING | \ + CPU_BASED_TPR_SHADOW | \ + CPU_BASED_USE_IO_BITMAPS | \ + CPU_BASED_MONITOR_TRAP_FLAG | \ + CPU_BASED_USE_MSR_BITMAPS | \ + CPU_BASED_NMI_WINDOW_EXITING | \ + CPU_BASED_PAUSE_EXITING | \ + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) + +#define EVMCS1_SUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ + SECONDARY_EXEC_WBINVD_EXITING | \ + SECONDARY_EXEC_ENABLE_VPID | \ + SECONDARY_EXEC_ENABLE_EPT | \ + SECONDARY_EXEC_UNRESTRICTED_GUEST | \ + SECONDARY_EXEC_DESC | \ + SECONDARY_EXEC_ENABLE_RDTSCP | \ + SECONDARY_EXEC_ENABLE_INVPCID | \ + SECONDARY_EXEC_ENABLE_XSAVES | \ + SECONDARY_EXEC_RDSEED_EXITING | \ + SECONDARY_EXEC_RDRAND_EXITING | \ + SECONDARY_EXEC_TSC_SCALING | \ + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ + SECONDARY_EXEC_PT_USE_GPA | \ + SECONDARY_EXEC_PT_CONCEAL_VMX | \ + SECONDARY_EXEC_BUS_LOCK_DETECTION | \ + SECONDARY_EXEC_NOTIFY_VM_EXITING | \ + 
SECONDARY_EXEC_ENCLS_EXITING) + +#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) + +#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ + (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_ACK_INTR_ON_EXIT | \ + VM_EXIT_HOST_ADDR_SPACE_SIZE | \ + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_IA32_PAT | \ + VM_EXIT_LOAD_IA32_PAT | \ + VM_EXIT_SAVE_IA32_EFER | \ + VM_EXIT_LOAD_IA32_EFER | \ + VM_EXIT_CLEAR_BNDCFGS | \ + VM_EXIT_PT_CONCEAL_PIP | \ + VM_EXIT_CLEAR_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_IA32E_MODE | \ + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_ENTRY_LOAD_IA32_PAT | \ + VM_ENTRY_LOAD_IA32_EFER | \ + VM_ENTRY_LOAD_BNDCFGS | \ + VM_ENTRY_PT_CONCEAL_PIP | \ + VM_ENTRY_LOAD_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMFUNC (0) + struct evmcs_field { u16 offset; u16 clean_field; @@ -65,114 +166,6 @@ static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs, return vmcs12_read_any((void *)evmcs, field, offset); } -#if IS_ENABLED(CONFIG_HYPERV) - -DECLARE_STATIC_KEY_FALSE(__kvm_is_using_evmcs); - -static __always_inline bool kvm_is_using_evmcs(void) -{ - return static_branch_unlikely(&__kvm_is_using_evmcs); -} - -static __always_inline int get_evmcs_offset(unsigned long field, - u16 *clean_field) -{ - int offset = evmcs_field_offset(field, clean_field); - - WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field); - return offset; -} - -static __always_inline void evmcs_write64(unsigned long field, u64 value) -{ - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u64 *)((char *)current_evmcs + offset) = value; - - current_evmcs->hv_clean_fields &= ~clean_field; -} - -static __always_inline void evmcs_write32(unsigned long field, u32 value) -{ - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u32 *)((char *)current_evmcs + offset) = value; - current_evmcs->hv_clean_fields &= ~clean_field; -} - -static __always_inline void evmcs_write16(unsigned long field, u16 value) -{ - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u16 *)((char *)current_evmcs + offset) = value; - current_evmcs->hv_clean_fields &= ~clean_field; -} - -static __always_inline u64 evmcs_read64(unsigned long field) -{ - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u64 *)((char *)current_evmcs + offset); -} - -static __always_inline u32 evmcs_read32(unsigned long field) -{ - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u32 *)((char *)current_evmcs + offset); -} - -static __always_inline u16 evmcs_read16(unsigned long field) -{ - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u16 *)((char *)current_evmcs + offset); -} - -static inline void evmcs_load(u64 phys_addr) -{ - struct hv_vp_assist_page *vp_ap = - hv_get_vp_assist_page(smp_processor_id()); - - if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall) - vp_ap->nested_control.features.directhypercall = 1; - vp_ap->current_nested_vmcs = phys_addr; - vp_ap->enlighten_vmentry = 1; -} - -void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); -#else /* !IS_ENABLED(CONFIG_HYPERV) */ -static __always_inline bool kvm_is_using_evmcs(void) { return false; } -static __always_inline void evmcs_write64(unsigned 
long field, u64 value) {} -static __always_inline void evmcs_write32(unsigned long field, u32 value) {} -static __always_inline void evmcs_write16(unsigned long field, u16 value) {} -static __always_inline u64 evmcs_read64(unsigned long field) { return 0; } -static __always_inline u32 evmcs_read32(unsigned long field) { return 0; } -static __always_inline u16 evmcs_read16(unsigned long field) { return 0; } -static inline void evmcs_load(u64 phys_addr) {} -#endif /* IS_ENABLED(CONFIG_HYPERV) */ - #define EVMPTR_INVALID (-1ULL) #define EVMPTR_MAP_PENDING (-2ULL) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cf19a3346639..552593a2ac14 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -66,6 +66,7 @@ #include "vmx.h" #include "x86.h" #include "smm.h" +#include "vmx_onhyperv.h" MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.c b/arch/x86/kvm/vmx/vmx_onhyperv.c new file mode 100644 index 000000000000..b9a8b91166d0 --- /dev/null +++ b/arch/x86/kvm/vmx/vmx_onhyperv.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "capabilities.h" +#include "vmx_onhyperv.h" + +DEFINE_STATIC_KEY_FALSE(__kvm_is_using_evmcs); + +/* + * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption + * is: in case a feature has corresponding fields in eVMCS described and it was + * exposed in VMX feature MSRs, KVM is free to use it. Warn if KVM meets a + * feature which has no corresponding eVMCS field, this likely means that KVM + * needs to be updated. + */ +#define evmcs_check_vmcs_conf(field, ctrl) \ + do { \ + typeof(vmcs_conf->field) unsupported; \ + \ + unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \ + if (unsupported) { \ + pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\ + (u64)unsupported); \ + vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \ + } \ + } \ + while (0) + +void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) +{ + evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL); + evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL); + evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC); + evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC); + evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL); + evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL); +} diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h new file mode 100644 index 000000000000..11541d272dbd --- /dev/null +++ b/arch/x86/kvm/vmx/vmx_onhyperv.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ARCH_X86_KVM_VMX_ONHYPERV_H__ +#define __ARCH_X86_KVM_VMX_ONHYPERV_H__ + +#include + +#include + +#include "capabilities.h" +#include "hyperv.h" +#include "vmcs12.h" + +#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) + +#if IS_ENABLED(CONFIG_HYPERV) + +DECLARE_STATIC_KEY_FALSE(__kvm_is_using_evmcs); + +static __always_inline bool kvm_is_using_evmcs(void) +{ + return static_branch_unlikely(&__kvm_is_using_evmcs); +} + +static __always_inline int get_evmcs_offset(unsigned long field, + u16 *clean_field) +{ + int offset = evmcs_field_offset(field, clean_field); + + WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field); + return offset; +} + +static __always_inline void evmcs_write64(unsigned long field, u64 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u64 *)((char *)current_evmcs + offset) = value; + + 
current_evmcs->hv_clean_fields &= ~clean_field; +} + +static __always_inline void evmcs_write32(unsigned long field, u32 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u32 *)((char *)current_evmcs + offset) = value; + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static __always_inline void evmcs_write16(unsigned long field, u16 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u16 *)((char *)current_evmcs + offset) = value; + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static __always_inline u64 evmcs_read64(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u64 *)((char *)current_evmcs + offset); +} + +static __always_inline u32 evmcs_read32(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u32 *)((char *)current_evmcs + offset); +} + +static __always_inline u16 evmcs_read16(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u16 *)((char *)current_evmcs + offset); +} + +static inline void evmcs_load(u64 phys_addr) +{ + struct hv_vp_assist_page *vp_ap = + hv_get_vp_assist_page(smp_processor_id()); + + if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall) + vp_ap->nested_control.features.directhypercall = 1; + vp_ap->current_nested_vmcs = phys_addr; + vp_ap->enlighten_vmentry = 1; +} + +void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); +#else /* !IS_ENABLED(CONFIG_HYPERV) */ +static __always_inline bool kvm_is_using_evmcs(void) { return false; } +static __always_inline void evmcs_write64(unsigned long field, u64 value) {} +static __always_inline void evmcs_write32(unsigned long field, u32 value) {} +static __always_inline void evmcs_write16(unsigned long field, u16 value) {} +static __always_inline u64 evmcs_read64(unsigned long field) { return 0; } +static __always_inline u32 evmcs_read32(unsigned long field) { return 0; } +static __always_inline u16 evmcs_read16(unsigned long field) { return 0; } +static inline void evmcs_load(u64 phys_addr) {} +#endif /* IS_ENABLED(CONFIG_HYPERV) */ + +#endif /* __ARCH_X86_KVM_VMX_ONHYPERV_H__ */ diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 33af7b4c6eb4..f41ce3c24123 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -6,7 +6,7 @@ #include -#include "hyperv.h" +#include "vmx_onhyperv.h" #include "vmcs.h" #include "../x86.h" -- cgit v1.2.3 From 16e880bfa6377871da233c846ecdf23db2bf1d97 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:18 +0100 Subject: KVM: x86: Introduce helper to check if auto-EOI is set in Hyper-V SynIC As a preparation to making Hyper-V emulation optional, create a dedicated kvm_hv_synic_auto_eoi_set() helper to avoid extra ifdefs in lapic.c No functional change intended. 
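The broader pattern here is that a static inline helper gives lapic.c a single call site which can later be stubbed out when Hyper-V emulation is compiled out, so callers never need an #ifdef. A hypothetical, self-contained sketch of that shape (the config macro, helper name, and check below are all invented for illustration, not code from this patch):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative knob; the real series gates this on a Kconfig symbol. */
#define CONFIG_HV_EMULATION 1

#if CONFIG_HV_EMULATION
static inline bool hv_auto_eoi_set(int vcpu_id, int vector)
{
	/* A real implementation would consult the vCPU's auto-EOI bitmap. */
	return vcpu_id == 0 && vector == 42;
}
#else
/* Stub keeps callers free of #ifdefs: the check folds away to false. */
static inline bool hv_auto_eoi_set(int vcpu_id, int vector)
{
	return false;
}
#endif

int main(void)
{
	/* The caller reads the same either way; no conditional compilation. */
	if (hv_auto_eoi_set(0, 42))
		printf("auto-EOI handling path\n");
	else
		printf("normal EOI path\n");
	return 0;
}
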
Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20231205103630.1391318-5-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/hyperv.h | 6 ++++++ arch/x86/kvm/lapic.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index f83b8db72b11..286956fe1a2e 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -105,6 +105,12 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); +static inline bool kvm_hv_synic_auto_eoi_set(struct kvm_vcpu *vcpu, int vector) +{ + return to_hv_vcpu(vcpu) && + test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap); +} + void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 245b20973cae..f7abc1008cad 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2905,7 +2905,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) */ apic_clear_irr(vector, apic); - if (to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap)) { + if (kvm_hv_synic_auto_eoi_set(vcpu, vector)) { /* * For auto-EOI interrupts, there might be another pending * interrupt above PPR, so check whether to raise another -- cgit v1.2.3 From 0659262a2625ca3e4061796cd4f4935091220056 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:19 +0100 Subject: KVM: x86: Introduce helper to check if vector is set in Hyper-V SynIC As a preparation to making Hyper-V emulation optional, create a dedicated kvm_hv_synic_has_vector() helper to avoid extra ifdefs in lapic.c. No functional change intended. 
Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20231205103630.1391318-6-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/hyperv.h | 5 +++++ arch/x86/kvm/lapic.c | 3 +-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 286956fe1a2e..9d8fa6ba6341 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -105,6 +105,11 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); +static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector) +{ + return to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->vec_bitmap); +} + static inline bool kvm_hv_synic_auto_eoi_set(struct kvm_vcpu *vcpu, int vector) { return to_hv_vcpu(vcpu) && diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f7abc1008cad..3242f3da2457 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1475,8 +1475,7 @@ static int apic_set_eoi(struct kvm_lapic *apic) apic_clear_isr(vector, apic); apic_update_ppr(apic); - if (to_hv_vcpu(apic->vcpu) && - test_bit(vector, to_hv_synic(apic->vcpu)->vec_bitmap)) + if (kvm_hv_synic_has_vector(apic->vcpu, vector)) kvm_hv_synic_send_eoi(apic->vcpu, vector); kvm_ioapic_send_eoi(apic, vector); -- cgit v1.2.3 From e7ad84db4d718e18c7a133e941ba4c7d4c6d4cbf Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:20 +0100 Subject: KVM: VMX: Split off hyperv_evmcs.{ch} Some Enlightened VMCS related code is needed both by Hyper-V on KVM and KVM on Hyper-V. As a preparation to making Hyper-V emulation optional, create dedicated 'hyperv_evmcs.{ch}' files which are used by both. No functional change intended. 
Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20231205103630.1391318-7-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/vmx/hyperv.c | 308 --------------------------------------- arch/x86/kvm/vmx/hyperv.h | 163 +-------------------- arch/x86/kvm/vmx/hyperv_evmcs.c | 315 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/hyperv_evmcs.h | 166 +++++++++++++++++++++ arch/x86/kvm/vmx/nested.h | 1 + arch/x86/kvm/vmx/vmx_onhyperv.h | 3 +- 7 files changed, 486 insertions(+), 472 deletions(-) create mode 100644 arch/x86/kvm/vmx/hyperv_evmcs.c create mode 100644 arch/x86/kvm/vmx/hyperv_evmcs.h (limited to 'arch') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index a99ffc3f3a3f..8ea872401cd6 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -23,7 +23,7 @@ kvm-$(CONFIG_KVM_XEN) += xen.o kvm-$(CONFIG_KVM_SMM) += smm.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ - vmx/hyperv.o vmx/nested.o vmx/posted_intr.o + vmx/hyperv.o vmx/hyperv_evmcs.o vmx/nested.o vmx/posted_intr.o kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o ifdef CONFIG_HYPERV diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index de13dc14fe1d..fab6a1ad98dc 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -13,314 +13,6 @@ #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK -#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) -#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ - {EVMCS1_OFFSET(name), clean_field} - -const struct evmcs_field vmcs_field_to_evmcs_1[] = { - /* 64 bit rw */ - EVMCS1_FIELD(GUEST_RIP, guest_rip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(GUEST_RSP, guest_rsp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR0, host_cr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR3, host_cr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR4, host_cr4, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_RIP, host_rip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), - EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), - EVMCS1_FIELD(MSR_BITMAP, msr_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP), - EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base, 
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(TSC_OFFSET, tsc_offset, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR0, guest_cr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR3, guest_cr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR4, guest_cr4, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_DR7, guest_dr7, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(HOST_FS_BASE, host_fs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_GS_BASE, host_gs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_TR_BASE, host_tr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_RSP, host_rsp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(EPT_POINTER, ept_pointer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), - EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(ENCLS_EXITING_BITMAP, encls_exiting_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(TSC_MULTIPLIER, tsc_multiplier, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - /* - * Not used by KVM: - * - * 
EVMCS1_FIELD(0x00006828, guest_ia32_s_cet, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - * EVMCS1_FIELD(0x0000682A, guest_ssp, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - * EVMCS1_FIELD(0x0000682C, guest_ia32_int_ssp_table_addr, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - * EVMCS1_FIELD(0x00002816, guest_ia32_lbr_ctl, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - * EVMCS1_FIELD(0x00006C18, host_ia32_s_cet, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - * EVMCS1_FIELD(0x00006C1A, host_ssp, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - * EVMCS1_FIELD(0x00006C1C, host_ia32_int_ssp_table_addr, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - */ - - /* 64 bit read only */ - EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - /* - * Not defined in KVM: - * - * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - */ - EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - - /* - * No mask defined in the spec as Hyper-V doesn't currently support - * these. Future proof by resetting the whole clean field mask on - * access. - */ - EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - - /* 32 bit rw */ - EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC), - EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN), - EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY), - EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, - vm_entry_exception_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit, 
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - - /* 32 bit read only */ - EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - - /* No mask defined in the spec (not used) */ - EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - - /* 16 bit rw */ - EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_FS_SELECTOR, 
host_fs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), -}; -const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); - u64 nested_get_evmptr(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 9401dbfaea7c..d4ed99008518 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -2,170 +2,9 @@ #ifndef __KVM_X86_VMX_HYPERV_H #define __KVM_X86_VMX_HYPERV_H -#include - -#include -#include -#include - -#include "../hyperv.h" - -#include "capabilities.h" -#include "vmcs.h" +#include #include "vmcs12.h" -#define KVM_EVMCS_VERSION 1 - -/* - * Enlightened VMCSv1 doesn't support these: - * - * POSTED_INTR_NV = 0x00000002, - * GUEST_INTR_STATUS = 0x00000810, - * APIC_ACCESS_ADDR = 0x00002014, - * POSTED_INTR_DESC_ADDR = 0x00002016, - * EOI_EXIT_BITMAP0 = 0x0000201c, - * EOI_EXIT_BITMAP1 = 0x0000201e, - * EOI_EXIT_BITMAP2 = 0x00002020, - * EOI_EXIT_BITMAP3 = 0x00002022, - * GUEST_PML_INDEX = 0x00000812, - * PML_ADDRESS = 0x0000200e, - * VM_FUNCTION_CONTROL = 0x00002018, - * EPTP_LIST_ADDRESS = 0x00002024, - * VMREAD_BITMAP = 0x00002026, - * VMWRITE_BITMAP = 0x00002028, - * - * TSC_MULTIPLIER = 0x00002032, - * PLE_GAP = 0x00004020, - * PLE_WINDOW = 0x00004022, - * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, - * - * Currently unsupported in KVM: - * GUEST_IA32_RTIT_CTL = 0x00002814, - */ -#define EVMCS1_SUPPORTED_PINCTRL \ - (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ - PIN_BASED_EXT_INTR_MASK | \ - PIN_BASED_NMI_EXITING | \ - PIN_BASED_VIRTUAL_NMIS) - -#define EVMCS1_SUPPORTED_EXEC_CTRL \ - (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ - CPU_BASED_HLT_EXITING | \ - CPU_BASED_CR3_LOAD_EXITING | \ - CPU_BASED_CR3_STORE_EXITING | \ - CPU_BASED_UNCOND_IO_EXITING | \ - CPU_BASED_MOV_DR_EXITING | \ - CPU_BASED_USE_TSC_OFFSETTING | \ - CPU_BASED_MWAIT_EXITING | \ - CPU_BASED_MONITOR_EXITING | \ - CPU_BASED_INVLPG_EXITING | \ - CPU_BASED_RDPMC_EXITING | \ - CPU_BASED_INTR_WINDOW_EXITING | \ - CPU_BASED_CR8_LOAD_EXITING | \ - CPU_BASED_CR8_STORE_EXITING | \ - CPU_BASED_RDTSC_EXITING | \ - CPU_BASED_TPR_SHADOW | \ - CPU_BASED_USE_IO_BITMAPS | \ - CPU_BASED_MONITOR_TRAP_FLAG | \ - CPU_BASED_USE_MSR_BITMAPS | \ - CPU_BASED_NMI_WINDOW_EXITING | \ - CPU_BASED_PAUSE_EXITING | \ - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) - -#define EVMCS1_SUPPORTED_2NDEXEC \ - (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ - 
SECONDARY_EXEC_WBINVD_EXITING | \ - SECONDARY_EXEC_ENABLE_VPID | \ - SECONDARY_EXEC_ENABLE_EPT | \ - SECONDARY_EXEC_UNRESTRICTED_GUEST | \ - SECONDARY_EXEC_DESC | \ - SECONDARY_EXEC_ENABLE_RDTSCP | \ - SECONDARY_EXEC_ENABLE_INVPCID | \ - SECONDARY_EXEC_ENABLE_XSAVES | \ - SECONDARY_EXEC_RDSEED_EXITING | \ - SECONDARY_EXEC_RDRAND_EXITING | \ - SECONDARY_EXEC_TSC_SCALING | \ - SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ - SECONDARY_EXEC_PT_USE_GPA | \ - SECONDARY_EXEC_PT_CONCEAL_VMX | \ - SECONDARY_EXEC_BUS_LOCK_DETECTION | \ - SECONDARY_EXEC_NOTIFY_VM_EXITING | \ - SECONDARY_EXEC_ENCLS_EXITING) - -#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) - -#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ - (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ - VM_EXIT_SAVE_DEBUG_CONTROLS | \ - VM_EXIT_ACK_INTR_ON_EXIT | \ - VM_EXIT_HOST_ADDR_SPACE_SIZE | \ - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ - VM_EXIT_SAVE_IA32_PAT | \ - VM_EXIT_LOAD_IA32_PAT | \ - VM_EXIT_SAVE_IA32_EFER | \ - VM_EXIT_LOAD_IA32_EFER | \ - VM_EXIT_CLEAR_BNDCFGS | \ - VM_EXIT_PT_CONCEAL_PIP | \ - VM_EXIT_CLEAR_IA32_RTIT_CTL) - -#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ - (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ - VM_ENTRY_LOAD_DEBUG_CONTROLS | \ - VM_ENTRY_IA32E_MODE | \ - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ - VM_ENTRY_LOAD_IA32_PAT | \ - VM_ENTRY_LOAD_IA32_EFER | \ - VM_ENTRY_LOAD_BNDCFGS | \ - VM_ENTRY_PT_CONCEAL_PIP | \ - VM_ENTRY_LOAD_IA32_RTIT_CTL) - -#define EVMCS1_SUPPORTED_VMFUNC (0) - -struct evmcs_field { - u16 offset; - u16 clean_field; -}; - -extern const struct evmcs_field vmcs_field_to_evmcs_1[]; -extern const unsigned int nr_evmcs_1_fields; - -static __always_inline int evmcs_field_offset(unsigned long field, - u16 *clean_field) -{ - unsigned int index = ROL16(field, 6); - const struct evmcs_field *evmcs_field; - - if (unlikely(index >= nr_evmcs_1_fields)) - return -ENOENT; - - evmcs_field = &vmcs_field_to_evmcs_1[index]; - - /* - * Use offset=0 to detect holes in eVMCS. This offset belongs to - * 'revision_id' but this field has no encoding and is supposed to - * be accessed directly. - */ - if (unlikely(!evmcs_field->offset)) - return -ENOENT; - - if (clean_field) - *clean_field = evmcs_field->clean_field; - - return evmcs_field->offset; -} - -static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs, - unsigned long field, u16 offset) -{ - /* - * vmcs12_read_any() doesn't care whether the supplied structure - * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes - * the exact offset of the required field, use it for convenience - * here. - */ - return vmcs12_read_any((void *)evmcs, field, offset); -} - #define EVMPTR_INVALID (-1ULL) #define EVMPTR_MAP_PENDING (-2ULL) diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.c b/arch/x86/kvm/vmx/hyperv_evmcs.c new file mode 100644 index 000000000000..904bfcd1519b --- /dev/null +++ b/arch/x86/kvm/vmx/hyperv_evmcs.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains common code for working with Enlightened VMCS which is + * used both by Hyper-V on KVM and KVM on Hyper-V. 
+ */ + +#include "hyperv_evmcs.h" + +#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) +#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ + {EVMCS1_OFFSET(name), clean_field} + +const struct evmcs_field vmcs_field_to_evmcs_1[] = { + /* 64 bit rw */ + EVMCS1_FIELD(GUEST_RIP, guest_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_RSP, guest_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR0, host_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR3, host_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR4, host_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_RIP, host_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(MSR_BITMAP, msr_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP), + EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(TSC_OFFSET, tsc_offset, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR3, 
guest_pdptr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR0, guest_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR3, guest_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR4, guest_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_DR7, guest_dr7, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(HOST_FS_BASE, host_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GS_BASE, host_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_TR_BASE, host_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_RSP, host_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(EPT_POINTER, ept_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), + EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(ENCLS_EXITING_BITMAP, encls_exiting_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(TSC_MULTIPLIER, tsc_multiplier, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + /* + * Not used by KVM: + * + * EVMCS1_FIELD(0x00006828, guest_ia32_s_cet, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x0000682A, guest_ssp, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + * EVMCS1_FIELD(0x0000682C, guest_ia32_int_ssp_table_addr, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x00002816, guest_ia32_lbr_ctl, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x00006C18, host_ia32_s_cet, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + * EVMCS1_FIELD(0x00006C1A, host_ssp, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + * EVMCS1_FIELD(0x00006C1C, host_ia32_int_ssp_table_addr, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + */ + + /* 64 bit read only */ + EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + /* + * Not defined in KVM: + * + * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + */ + EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* + * 
No mask defined in the spec as Hyper-V doesn't currently support + * these. Future proof by resetting the whole clean field mask on + * access. + */ + EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 32 bit rw */ + EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC), + EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN), + EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY), + EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, + vm_entry_exception_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + 
EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + + /* 32 bit read only */ + EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* No mask defined in the spec (not used) */ + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 16 bit rw */ + EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), +}; +const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h new file mode 100644 index 000000000000..a543fccfc574 --- /dev/null +++ 
b/arch/x86/kvm/vmx/hyperv_evmcs.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This file contains common definitions for working with Enlightened VMCS which + * are used both by Hyper-V on KVM and KVM on Hyper-V. + */ +#ifndef __KVM_X86_VMX_HYPERV_EVMCS_H +#define __KVM_X86_VMX_HYPERV_EVMCS_H + +#include + +#include "capabilities.h" +#include "vmcs12.h" + +#define KVM_EVMCS_VERSION 1 + +/* + * Enlightened VMCSv1 doesn't support these: + * + * POSTED_INTR_NV = 0x00000002, + * GUEST_INTR_STATUS = 0x00000810, + * APIC_ACCESS_ADDR = 0x00002014, + * POSTED_INTR_DESC_ADDR = 0x00002016, + * EOI_EXIT_BITMAP0 = 0x0000201c, + * EOI_EXIT_BITMAP1 = 0x0000201e, + * EOI_EXIT_BITMAP2 = 0x00002020, + * EOI_EXIT_BITMAP3 = 0x00002022, + * GUEST_PML_INDEX = 0x00000812, + * PML_ADDRESS = 0x0000200e, + * VM_FUNCTION_CONTROL = 0x00002018, + * EPTP_LIST_ADDRESS = 0x00002024, + * VMREAD_BITMAP = 0x00002026, + * VMWRITE_BITMAP = 0x00002028, + * + * TSC_MULTIPLIER = 0x00002032, + * PLE_GAP = 0x00004020, + * PLE_WINDOW = 0x00004022, + * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + * + * Currently unsupported in KVM: + * GUEST_IA32_RTIT_CTL = 0x00002814, + */ +#define EVMCS1_SUPPORTED_PINCTRL \ + (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + PIN_BASED_EXT_INTR_MASK | \ + PIN_BASED_NMI_EXITING | \ + PIN_BASED_VIRTUAL_NMIS) + +#define EVMCS1_SUPPORTED_EXEC_CTRL \ + (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + CPU_BASED_HLT_EXITING | \ + CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING | \ + CPU_BASED_UNCOND_IO_EXITING | \ + CPU_BASED_MOV_DR_EXITING | \ + CPU_BASED_USE_TSC_OFFSETTING | \ + CPU_BASED_MWAIT_EXITING | \ + CPU_BASED_MONITOR_EXITING | \ + CPU_BASED_INVLPG_EXITING | \ + CPU_BASED_RDPMC_EXITING | \ + CPU_BASED_INTR_WINDOW_EXITING | \ + CPU_BASED_CR8_LOAD_EXITING | \ + CPU_BASED_CR8_STORE_EXITING | \ + CPU_BASED_RDTSC_EXITING | \ + CPU_BASED_TPR_SHADOW | \ + CPU_BASED_USE_IO_BITMAPS | \ + CPU_BASED_MONITOR_TRAP_FLAG | \ + CPU_BASED_USE_MSR_BITMAPS | \ + CPU_BASED_NMI_WINDOW_EXITING | \ + CPU_BASED_PAUSE_EXITING | \ + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) + +#define EVMCS1_SUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ + SECONDARY_EXEC_WBINVD_EXITING | \ + SECONDARY_EXEC_ENABLE_VPID | \ + SECONDARY_EXEC_ENABLE_EPT | \ + SECONDARY_EXEC_UNRESTRICTED_GUEST | \ + SECONDARY_EXEC_DESC | \ + SECONDARY_EXEC_ENABLE_RDTSCP | \ + SECONDARY_EXEC_ENABLE_INVPCID | \ + SECONDARY_EXEC_ENABLE_XSAVES | \ + SECONDARY_EXEC_RDSEED_EXITING | \ + SECONDARY_EXEC_RDRAND_EXITING | \ + SECONDARY_EXEC_TSC_SCALING | \ + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ + SECONDARY_EXEC_PT_USE_GPA | \ + SECONDARY_EXEC_PT_CONCEAL_VMX | \ + SECONDARY_EXEC_BUS_LOCK_DETECTION | \ + SECONDARY_EXEC_NOTIFY_VM_EXITING | \ + SECONDARY_EXEC_ENCLS_EXITING) + +#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) + +#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ + (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_ACK_INTR_ON_EXIT | \ + VM_EXIT_HOST_ADDR_SPACE_SIZE | \ + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_IA32_PAT | \ + VM_EXIT_LOAD_IA32_PAT | \ + VM_EXIT_SAVE_IA32_EFER | \ + VM_EXIT_LOAD_IA32_EFER | \ + VM_EXIT_CLEAR_BNDCFGS | \ + VM_EXIT_PT_CONCEAL_PIP | \ + VM_EXIT_CLEAR_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_IA32E_MODE | \ + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_ENTRY_LOAD_IA32_PAT | \ + VM_ENTRY_LOAD_IA32_EFER | \ + VM_ENTRY_LOAD_BNDCFGS | \ + VM_ENTRY_PT_CONCEAL_PIP | \ + 
VM_ENTRY_LOAD_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMFUNC (0) + +struct evmcs_field { + u16 offset; + u16 clean_field; +}; + +extern const struct evmcs_field vmcs_field_to_evmcs_1[]; +extern const unsigned int nr_evmcs_1_fields; + +static __always_inline int evmcs_field_offset(unsigned long field, + u16 *clean_field) +{ + const struct evmcs_field *evmcs_field; + unsigned int index = ROL16(field, 6); + + if (unlikely(index >= nr_evmcs_1_fields)) + return -ENOENT; + + evmcs_field = &vmcs_field_to_evmcs_1[index]; + + /* + * Use offset=0 to detect holes in eVMCS. This offset belongs to + * 'revision_id' but this field has no encoding and is supposed to + * be accessed directly. + */ + if (unlikely(!evmcs_field->offset)) + return -ENOENT; + + if (clean_field) + *clean_field = evmcs_field->clean_field; + + return evmcs_field->offset; +} + +static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs, + unsigned long field, u16 offset) +{ + /* + * vmcs12_read_any() doesn't care whether the supplied structure + * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes + * the exact offset of the required field, use it for convenience + * here. + */ + return vmcs12_read_any((void *)evmcs, field, offset); +} + +#endif /* __KVM_X86_VMX_HYPERV_H */ diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index b4b9d51438c6..b0f2e26c1aea 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -3,6 +3,7 @@ #define __KVM_X86_VMX_NESTED_H #include "kvm_cache_regs.h" +#include "hyperv.h" #include "vmcs12.h" #include "vmx.h" diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h index 11541d272dbd..eb48153bfd73 100644 --- a/arch/x86/kvm/vmx/vmx_onhyperv.h +++ b/arch/x86/kvm/vmx/vmx_onhyperv.h @@ -4,11 +4,12 @@ #define __ARCH_X86_KVM_VMX_ONHYPERV_H__ #include +#include #include #include "capabilities.h" -#include "hyperv.h" +#include "hyperv_evmcs.h" #include "vmcs12.h" #define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) -- cgit v1.2.3 From af9d544a452114eb54638015544b884e1befd0fb Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:21 +0100 Subject: KVM: x86: Introduce helper to handle Hyper-V paravirt TLB flush requests As a preparation to making Hyper-V emulation optional, introduce a helper to handle pending KVM_REQ_HV_TLB_FLUSH requests. No functional change intended. Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20231205103630.1391318-8-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/hyperv.h | 13 +++++++++++++ arch/x86/kvm/svm/nested.c | 10 ++-------- arch/x86/kvm/vmx/nested.c | 10 ++-------- 3 files changed, 17 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 9d8fa6ba6341..77f6549aa5de 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -247,6 +247,19 @@ static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu) return kvm_hv_get_assist_page(vcpu); } +static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu, + bool tdp_enabled) +{ + /* + * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or + * L2's VP_ID upon request from the guest. Make sure we check for + * pending entries in the right FIFO upon L1/L2 transition as these + * requests are put by other vCPUs asynchronously. 
+ */ + if (to_hv_vcpu(vcpu) && tdp_enabled) + kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu); +} + int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 3fea8c47679e..74c04102ef01 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -487,14 +487,8 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm, static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu) { - /* - * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or - * L2's VP_ID upon request from the guest. Make sure we check for - * pending entries in the right FIFO upon L1/L2 transition as these - * requests are put by other vCPUs asynchronously. - */ - if (to_hv_vcpu(vcpu) && npt_enabled) - kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu); + /* Handle pending Hyper-V TLB flush requests */ + kvm_hv_nested_transtion_tlb_flush(vcpu, npt_enabled); /* * TODO: optimize unconditional TLB flush/MMU sync. A partial list of diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c5ec0ef51ff7..382c0746d069 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1139,14 +1139,8 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or - * L2's VP_ID upon request from the guest. Make sure we check for - * pending entries in the right FIFO upon L1/L2 transition as these - * requests are put by other vCPUs asynchronously. - */ - if (to_hv_vcpu(vcpu) && enable_ept) - kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu); + /* Handle pending Hyper-V TLB flush requests */ + kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); /* * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings -- cgit v1.2.3 From b2e02f82b7f76234305c5a7fba4dbebc47ce4cb5 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:22 +0100 Subject: KVM: nVMX: Split off helper for emulating VMCLEAR on Hyper-V eVMCS To avoid overloading handle_vmclear() with Hyper-V specific details and to prepare the code to making Hyper-V emulation optional, create a dedicated nested_evmcs_handle_vmclear() helper. No functional change intended. Suggested-by: Sean Christopherson Tested-by: Jeremi Piotrowski Signed-off-by: Vitaly Kuznetsov Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20231205103630.1391318-9-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 382c0746d069..903b6f9ea2bd 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -243,6 +243,29 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) } } +static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * When Enlightened VMEntry is enabled on the calling CPU we treat + * memory area pointer by vmptr as Enlightened VMCS (as there's no good + * way to distinguish it from VMCS12) and we must not corrupt it by + * writing to the non-existent 'launch_state' field. The area doesn't + * have to be the currently active EVMCS on the calling CPU and there's + * nothing KVM has to do to transition it from 'active' to 'non-active' + * state. 
It is possible that the area will stay mapped as + * vmx->nested.hv_evmcs but this shouldn't be a problem. + */ + if (!guest_cpuid_has_evmcs(vcpu) || + !evmptr_is_valid(nested_get_evmptr(vcpu))) + return false; + + if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) + nested_release_evmcs(vcpu); + + return true; +} + static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, struct loaded_vmcs *prev) { @@ -5286,18 +5309,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (vmptr == vmx->nested.vmxon_ptr) return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - /* - * When Enlightened VMEntry is enabled on the calling CPU we treat - * memory area pointer by vmptr as Enlightened VMCS (as there's no good - * way to distinguish it from VMCS12) and we must not corrupt it by - * writing to the non-existent 'launch_state' field. The area doesn't - * have to be the currently active EVMCS on the calling CPU and there's - * nothing KVM has to do to transition it from 'active' to 'non-active' - * state. It is possible that the area will stay mapped as - * vmx->nested.hv_evmcs but this shouldn't be a problem. - */ - if (likely(!guest_cpuid_has_evmcs(vcpu) || - !evmptr_is_valid(nested_get_evmptr(vcpu)))) { + if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vcpu); @@ -5314,8 +5326,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) vmptr + offsetof(struct vmcs12, launch_state), &zero, sizeof(zero)); - } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { - nested_release_evmcs(vcpu); } return nested_vmx_succeed(vcpu); -- cgit v1.2.3 From f97314626734deaef49564a429c6f8eee3846bd3 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:25 +0100 Subject: KVM: nVMX: Move guest_cpuid_has_evmcs() to hyperv.h In preparation for making Hyper-V emulation optional, move Hyper-V specific guest_cpuid_has_evmcs() to hyperv.h. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Link: https://lore.kernel.org/r/20231205103630.1391318-12-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/hyperv.h | 11 +++++++++++ arch/x86/kvm/vmx/vmx.h | 10 ---------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index d4ed99008518..6e1ee951e360 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -4,6 +4,7 @@ #include #include "vmcs12.h" +#include "vmx.h" #define EVMPTR_INVALID (-1ULL) #define EVMPTR_MAP_PENDING (-2ULL) @@ -20,6 +21,16 @@ enum nested_evmptrld_status { EVMPTRLD_ERROR, }; +static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu) +{ + /* + * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and + * eVMCS has been explicitly enabled by userspace. 
+ */ + return vcpu->arch.hyperv_enabled && + to_vmx(vcpu)->nested.enlightened_vmcs_enabled; +} + u64 nested_get_evmptr(struct kvm_vcpu *vcpu); uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); int nested_enable_evmcs(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index c2130d2c8e24..959c6d94287f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -745,14 +745,4 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && enable_ipiv; } -static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu) -{ - /* - * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and - * eVMCS has been explicitly enabled by userspace. - */ - return vcpu->arch.hyperv_enabled && - to_vmx(vcpu)->nested.enlightened_vmcs_enabled; -} - #endif /* __KVM_X86_VMX_H */ -- cgit v1.2.3 From b4f69df0f65e97fec439130a0d0a8b9c7cc02df2 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:26 +0100 Subject: KVM: x86: Make Hyper-V emulation optional Hyper-V emulation in KVM is a fairly big chunk and in some cases it may be desirable to not compile it in to reduce module sizes as well as the attack surface. Introduce CONFIG_KVM_HYPERV option to make it possible. Note, there's room for further nVMX/nSVM code optimizations when !CONFIG_KVM_HYPERV, this will be done in follow-up patches. Reorganize Makefile a bit so all CONFIG_HYPERV and CONFIG_KVM_HYPERV files are grouped together. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Link: https://lore.kernel.org/r/20231205103630.1391318-13-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 6 ++++ arch/x86/kvm/Kconfig | 14 ++++++++++ arch/x86/kvm/Makefile | 20 ++++++------- arch/x86/kvm/cpuid.c | 6 ++++ arch/x86/kvm/hyperv.h | 61 +++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/irq_comm.c | 9 +++++- arch/x86/kvm/svm/hyperv.h | 9 ++++++ arch/x86/kvm/vmx/hyperv.h | 17 +++++++---- arch/x86/kvm/vmx/nested.c | 30 ++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 2 ++ arch/x86/kvm/x86.c | 62 ++++++++++++++++++++++++++++++----------- 11 files changed, 201 insertions(+), 35 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 86069b985d22..b093c2191cd3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -937,8 +937,10 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; +#ifdef CONFIG_KVM_HYPERV bool hyperv_enabled; struct kvm_vcpu_hv *hyperv; +#endif #ifdef CONFIG_KVM_XEN struct kvm_vcpu_xen xen; #endif @@ -1095,6 +1097,7 @@ enum hv_tsc_page_status { HV_TSC_PAGE_BROKEN, }; +#ifdef CONFIG_KVM_HYPERV /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; @@ -1127,6 +1130,7 @@ struct kvm_hv { struct kvm_hv_syndbg hv_syndbg; }; +#endif struct msr_bitmap_range { u32 flags; @@ -1349,7 +1353,9 @@ struct kvm_arch { /* reads protected by irq_srcu, writes by irq_lock */ struct hlist_head mask_notifier_list; +#ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; +#endif #ifdef CONFIG_KVM_XEN struct kvm_xen xen; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index c1716e83d176..34f2f47cadf2 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -141,6 +141,20 @@ config KVM_SMM If unsure, say Y. 
+config KVM_HYPERV + bool "Support for Microsoft Hyper-V emulation" + depends on KVM + default y + help + Provides KVM support for emulating Microsoft Hyper-V. This allows KVM + to expose a subset of the paravirtualized interfaces defined in the + Hyper-V Hypervisor Top-Level Functional Specification (TLFS): + https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs + These interfaces are required for the correct and performant functioning + of Windows and Hyper-V guests on KVM. + + If unsure, say "Y". + config KVM_XEN bool "Support for Xen hypercall interface" depends on KVM diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 8ea872401cd6..475b5fa917a6 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -11,29 +11,27 @@ include $(srctree)/virt/kvm/Makefile.kvm kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ + debugfs.o mmu/mmu.o mmu/page_track.o \ mmu/spte.o -ifdef CONFIG_HYPERV -kvm-y += kvm_onhyperv.o -endif - kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o +kvm-$(CONFIG_KVM_HYPERV) += hyperv.o kvm-$(CONFIG_KVM_XEN) += xen.o kvm-$(CONFIG_KVM_SMM) += smm.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ - vmx/hyperv.o vmx/hyperv_evmcs.o vmx/nested.o vmx/posted_intr.o -kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o + vmx/nested.o vmx/posted_intr.o -ifdef CONFIG_HYPERV -kvm-intel-y += vmx/vmx_onhyperv.o -endif +kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o +kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \ - svm/sev.o svm/hyperv.o + svm/sev.o +kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o ifdef CONFIG_HYPERV +kvm-y += kvm_onhyperv.o +kvm-intel-y += vmx/vmx_onhyperv.o vmx/hyperv_evmcs.o kvm-amd-y += svm/svm_onhyperv.o endif diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index dda6fc4cfae8..1b278a3f0689 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -314,11 +314,15 @@ EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent) { +#ifdef CONFIG_KVM_HYPERV struct kvm_cpuid_entry2 *entry; entry = cpuid_entry2_find(entries, nent, HYPERV_CPUID_INTERFACE, KVM_CPUID_INDEX_NOT_SIGNIFICANT); return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX; +#else + return false; +#endif } static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) @@ -433,11 +437,13 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, return 0; } +#ifdef CONFIG_KVM_HYPERV if (kvm_cpuid_has_hyperv(e2, nent)) { r = kvm_hv_vcpu_init(vcpu); if (r) return r; } +#endif r = kvm_check_cpuid(vcpu, e2, nent); if (r) diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 77f6549aa5de..1dc0b6604526 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -24,6 +24,8 @@ #include #include "x86.h" +#ifdef CONFIG_KVM_HYPERV + /* "Hv#1" signature */ #define HYPERV_CPUID_SIGNATURE_EAX 0x31237648 @@ -261,5 +263,62 @@ static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu, } int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu); +#else /* CONFIG_KVM_HYPERV */ +static inline void kvm_hv_setup_tsc_page(struct kvm *kvm, + struct pvclock_vcpu_time_info *hv_clock) {} +static inline void kvm_hv_request_tsc_page_update(struct kvm *kvm) {} +static inline void kvm_hv_init_vm(struct kvm *kvm) {} +static inline void kvm_hv_destroy_vm(struct kvm 
*kvm) {} +static inline int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) +{ + return 0; +} +static inline void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) {} +static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + return HV_STATUS_ACCESS_DENIED; +} +static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) {} +static inline void kvm_hv_free_pa_page(struct kvm *kvm) {} +static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector) +{ + return false; +} +static inline bool kvm_hv_synic_auto_eoi_set(struct kvm_vcpu *vcpu, int vector) +{ + return false; +} +static inline void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) {} +static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled) {} +static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu) +{ + return 0; +} +static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu) +{ + return vcpu->vcpu_idx; +} +static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu, bool tdp_enabled) {} +#endif /* CONFIG_KVM_HYPERV */ -#endif +#endif /* __ARCH_X86_KVM_HYPERV_H__ */ diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 16d076a1b91a..68f3f6c26046 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -144,7 +144,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); } - +#ifdef CONFIG_KVM_HYPERV static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -154,6 +154,7 @@ static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); } +#endif int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, @@ -163,9 +164,11 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, int r; switch (e->type) { +#ifdef CONFIG_KVM_HYPERV case KVM_IRQ_ROUTING_HV_SINT: return kvm_hv_set_sint(e, kvm, irq_source_id, level, line_status); +#endif case KVM_IRQ_ROUTING_MSI: if (kvm_msi_route_invalid(kvm, e)) @@ -314,11 +317,13 @@ int kvm_set_routing_entry(struct kvm *kvm, if (kvm_msi_route_invalid(kvm, e)) return -EINVAL; break; +#ifdef CONFIG_KVM_HYPERV case KVM_IRQ_ROUTING_HV_SINT: e->set = kvm_hv_set_sint; e->hv_sint.vcpu = ue->u.hv_sint.vcpu; e->hv_sint.sint = ue->u.hv_sint.sint; break; +#endif #ifdef CONFIG_KVM_XEN case KVM_IRQ_ROUTING_XEN_EVTCHN: return kvm_xen_setup_evtchn(kvm, e, ue); @@ -438,5 +443,7 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, void kvm_arch_irq_routing_update(struct kvm *kvm) { +#ifdef CONFIG_KVM_HYPERV kvm_hv_irq_routing_update(kvm); +#endif } diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h index 02f4784b5d44..d3f8bfc05832 100644 --- a/arch/x86/kvm/svm/hyperv.h +++ b/arch/x86/kvm/svm/hyperv.h @@ -11,6 +11,7 @@ #include "../hyperv.h" #include "svm.h" +#ifdef CONFIG_KVM_HYPERV static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = 
to_svm(vcpu); @@ -41,5 +42,13 @@ static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu) } void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu); +#else /* CONFIG_KVM_HYPERV */ +static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu) {} +static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) {} +#endif /* CONFIG_KVM_HYPERV */ #endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */ diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 6e1ee951e360..0e90ef4efe34 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -9,11 +9,6 @@ #define EVMPTR_INVALID (-1ULL) #define EVMPTR_MAP_PENDING (-2ULL) -static inline bool evmptr_is_valid(u64 evmptr) -{ - return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING; -} - enum nested_evmptrld_status { EVMPTRLD_DISABLED, EVMPTRLD_SUCCEEDED, @@ -21,6 +16,12 @@ enum nested_evmptrld_status { EVMPTRLD_ERROR, }; +#ifdef CONFIG_KVM_HYPERV +static inline bool evmptr_is_valid(u64 evmptr) +{ + return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING; +} + static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu) { /* @@ -39,5 +40,11 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 * int nested_evmcs_check_controls(struct vmcs12 *vmcs12); bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu); void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu); +#else +static inline bool evmptr_is_valid(u64 evmptr) +{ + return false; +} +#endif #endif /* __KVM_X86_VMX_HYPERV_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 903b6f9ea2bd..01a94d290c12 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -226,6 +226,7 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) { +#ifdef CONFIG_KVM_HYPERV struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -241,10 +242,12 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) hv_vcpu->nested.vm_id = 0; hv_vcpu->nested.vp_id = 0; } +#endif } static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) { +#ifdef CONFIG_KVM_HYPERV struct vcpu_vmx *vmx = to_vmx(vcpu); /* * When Enlightened VMEntry is enabled on the calling CPU we treat @@ -264,6 +267,9 @@ static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) nested_release_evmcs(vcpu); return true; +#else + return false; +#endif } static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, @@ -1595,6 +1601,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) { +#ifdef CONFIG_KVM_HYPERV struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); @@ -1835,10 +1842,14 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields */ return; +#else /* CONFIG_KVM_HYPERV */ + KVM_BUG_ON(1, vmx->vcpu.kvm); +#endif /* CONFIG_KVM_HYPERV */ } static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) { +#ifdef CONFIG_KVM_HYPERV struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; @@ -2009,6 +2020,9 @@ static void copy_vmcs12_to_enlightened(struct 
vcpu_vmx *vmx) evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; return; +#else /* CONFIG_KVM_HYPERV */ + KVM_BUG_ON(1, vmx->vcpu.kvm); +#endif /* CONFIG_KVM_HYPERV */ } /* @@ -2018,6 +2032,7 @@ static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( struct kvm_vcpu *vcpu, bool from_launch) { +#ifdef CONFIG_KVM_HYPERV struct vcpu_vmx *vmx = to_vmx(vcpu); bool evmcs_gpa_changed = false; u64 evmcs_gpa; @@ -2099,6 +2114,9 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( } return EVMPTRLD_SUCCEEDED; +#else + return EVMPTRLD_DISABLED; +#endif } void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) @@ -2905,8 +2923,10 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, nested_check_vm_entry_controls(vcpu, vmcs12)) return -EINVAL; +#ifdef CONFIG_KVM_HYPERV if (guest_cpuid_has_evmcs(vcpu)) return nested_evmcs_check_controls(vmcs12); +#endif return 0; } @@ -3178,6 +3198,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) return 0; } +#ifdef CONFIG_KVM_HYPERV static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3205,6 +3226,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) return true; } +#endif static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { @@ -3296,6 +3318,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) { +#ifdef CONFIG_KVM_HYPERV /* * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory @@ -3312,6 +3335,7 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) return false; } +#endif if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) return false; @@ -4749,6 +4773,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); +#ifdef CONFIG_KVM_HYPERV if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { /* * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map @@ -4758,6 +4783,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, */ (void)nested_get_evmcs_page(vcpu); } +#endif /* Service pending TLB flush requests for L2 before switching to L1. */ kvm_service_local_tlb_flush_requests(vcpu); @@ -6212,11 +6238,13 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, * Handle L2's bus locks in L0 directly. 
*/ return true; +#ifdef CONFIG_KVM_HYPERV case EXIT_REASON_VMCALL: /* Hyper-V L2 TLB flush hypercall is handled by L0 */ return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && nested_evmcs_l2_tlb_flush_enabled(vcpu) && kvm_hv_is_tlb_flush_hcall(vcpu); +#endif default: break; } @@ -7100,7 +7128,9 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .set_state = vmx_set_nested_state, .get_nested_state_pages = vmx_get_nested_state_pages, .write_log_dirty = nested_vmx_write_pml_buffer, +#ifdef CONFIG_KVM_HYPERV .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, +#endif }; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 552593a2ac14..3ff5c44dff9d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2048,6 +2048,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data)) return 1; +#ifdef CONFIG_KVM_HYPERV /* * Enlightened VMCS v1 doesn't have certain VMCS fields but * instead of just ignoring the features, different Hyper-V @@ -2058,6 +2059,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu)) nested_evmcs_filter_control_msr(vcpu, msr_info->index, &msr_info->data); +#endif break; case MSR_IA32_RTIT_CTL: if (!vmx_pt_mode_is_host_guest()) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 81224b9676d9..598b057611e0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1504,6 +1504,8 @@ static unsigned num_msrs_to_save; static const u32 emulated_msrs_all[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, + +#ifdef CONFIG_KVM_HYPERV HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, @@ -1521,6 +1523,7 @@ static const u32 emulated_msrs_all[] = { HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, HV_X64_MSR_SYNDBG_PENDING_BUFFER, +#endif MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, @@ -4020,6 +4023,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * the need to ignore the workaround. */ break; +#ifdef CONFIG_KVM_HYPERV case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: case HV_X64_MSR_SYNDBG_OPTIONS: @@ -4032,6 +4036,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_TSC_INVARIANT_CONTROL: return kvm_hv_set_msr_common(vcpu, msr, data, msr_info->host_initiated); +#endif case MSR_IA32_BBL_CR_CTL3: /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. @@ -4377,6 +4382,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ msr_info->data = 0x20000000; break; +#ifdef CONFIG_KVM_HYPERV case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_SYNDBG_CONTROL ... 
HV_X64_MSR_SYNDBG_PENDING_BUFFER: case HV_X64_MSR_SYNDBG_OPTIONS: @@ -4390,6 +4396,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); +#endif case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current * silicon. It is however accessed by winxp in very narrow @@ -4527,6 +4534,7 @@ static inline bool kvm_can_mwait_in_guest(void) boot_cpu_has(X86_FEATURE_ARAT); } +#ifdef CONFIG_KVM_HYPERV static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 __user *cpuid_arg) { @@ -4547,6 +4555,7 @@ static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, return 0; } +#endif static bool kvm_is_vm_type_supported(unsigned long type) { @@ -4580,9 +4589,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PIT_STATE2: case KVM_CAP_SET_IDENTITY_MAP_ADDR: case KVM_CAP_VCPU_EVENTS: +#ifdef CONFIG_KVM_HYPERV case KVM_CAP_HYPERV: case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: + case KVM_CAP_HYPERV_TIME: case KVM_CAP_HYPERV_SYNIC: case KVM_CAP_HYPERV_SYNIC2: case KVM_CAP_HYPERV_VP_INDEX: @@ -4592,6 +4603,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_CPUID: case KVM_CAP_HYPERV_ENFORCE_CPUID: case KVM_CAP_SYS_HYPERV_CPUID: +#endif case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -4601,7 +4613,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: - case KVM_CAP_HYPERV_TIME: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: case KVM_CAP_DISABLE_QUIRKS: @@ -4712,12 +4723,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = kvm_x86_ops.nested_ops->get_state ? 
kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; break; +#ifdef CONFIG_KVM_HYPERV case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: r = kvm_x86_ops.enable_l2_tlb_flush != NULL; break; case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; break; +#endif case KVM_CAP_SMALLER_MAXPHYADDR: r = (int) allow_smaller_maxphyaddr; break; @@ -4884,9 +4897,11 @@ long kvm_arch_dev_ioctl(struct file *filp, case KVM_GET_MSRS: r = msr_io(NULL, argp, do_get_msr_feature, 1); break; +#ifdef CONFIG_KVM_HYPERV case KVM_GET_SUPPORTED_HV_CPUID: r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp); break; +#endif case KVM_GET_DEVICE_ATTR: { struct kvm_device_attr attr; r = -EFAULT; @@ -5712,14 +5727,11 @@ static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, struct kvm_enable_cap *cap) { - int r; - uint16_t vmcs_version; - void __user *user_ptr; - if (cap->flags) return -EINVAL; switch (cap->cap) { +#ifdef CONFIG_KVM_HYPERV case KVM_CAP_HYPERV_SYNIC2: if (cap->args[0]) return -EINVAL; @@ -5731,16 +5743,22 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_hv_activate_synic(vcpu, cap->cap == KVM_CAP_HYPERV_SYNIC2); case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - if (!kvm_x86_ops.nested_ops->enable_evmcs) - return -ENOTTY; - r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); - if (!r) { - user_ptr = (void __user *)(uintptr_t)cap->args[0]; - if (copy_to_user(user_ptr, &vmcs_version, - sizeof(vmcs_version))) - r = -EFAULT; + { + int r; + uint16_t vmcs_version; + void __user *user_ptr; + + if (!kvm_x86_ops.nested_ops->enable_evmcs) + return -ENOTTY; + r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); + if (!r) { + user_ptr = (void __user *)(uintptr_t)cap->args[0]; + if (copy_to_user(user_ptr, &vmcs_version, + sizeof(vmcs_version))) + r = -EFAULT; + } + return r; } - return r; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: if (!kvm_x86_ops.enable_l2_tlb_flush) return -ENOTTY; @@ -5749,6 +5767,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, case KVM_CAP_HYPERV_ENFORCE_CPUID: return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]); +#endif case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: vcpu->arch.pv_cpuid.enforce = cap->args[0]; @@ -6141,9 +6160,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } +#ifdef CONFIG_KVM_HYPERV case KVM_GET_SUPPORTED_HV_CPUID: r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp); break; +#endif #ifdef CONFIG_KVM_XEN case KVM_XEN_VCPU_GET_ATTR: { struct kvm_xen_vcpu_attr xva; @@ -7201,6 +7222,7 @@ set_pit2_out: r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, ®ion); break; } +#ifdef CONFIG_KVM_HYPERV case KVM_HYPERV_EVENTFD: { struct kvm_hyperv_eventfd hvevfd; @@ -7210,6 +7232,7 @@ set_pit2_out: r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd); break; } +#endif case KVM_SET_PMU_EVENT_FILTER: r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); break; @@ -10588,19 +10611,20 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) { - u64 eoi_exit_bitmap[4]; - if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; +#ifdef CONFIG_KVM_HYPERV if (to_hv_vcpu(vcpu)) { + u64 eoi_exit_bitmap[4]; + bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, to_hv_synic(vcpu)->vec_bitmap, 256); static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); return; } - +#endif static_call_cond(kvm_x86_load_eoi_exitmap)( vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); } @@ 
-10691,9 +10715,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * the flushes are considered "remote" and not "local" because * the requests can be initiated from other vCPUs. */ +#ifdef CONFIG_KVM_HYPERV if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) && kvm_hv_vcpu_flush_tlb(vcpu)) kvm_vcpu_flush_tlb_guest(vcpu); +#endif if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; @@ -10746,6 +10772,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu_load_eoi_exitmap(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) kvm_vcpu_reload_apic_access_page(vcpu); +#ifdef CONFIG_KVM_HYPERV if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; @@ -10776,6 +10803,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu)) kvm_hv_process_stimers(vcpu); +#endif if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) kvm_vcpu_update_apicv(vcpu); if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) -- cgit v1.2.3 From 453e42b0557148cc7092c72eff6677a5436e3c7c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:27 +0100 Subject: KVM: nVMX: Introduce helpers to check if Hyper-V evmptr12 is valid/set In order to get rid of raw 'vmx->nested.hv_evmcs_vmptr' accesses when !CONFIG_KVM_HYPERV, introduce a pair of helpers: nested_vmx_is_evmptr12_valid() to check that eVMPTR points to a valid address. nested_vmx_is_evmptr12_set() to check that eVMPTR either points to a valid address or is in 'pending' port-migration state (meaning it is supposed to be valid but the exact address wasn't acquired from guest's memory yet). No functional change intended.
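For illustration, a minimal sketch of the call-site pattern the two helpers enable; the helper names come from the hunks below, while the caller shown here is hypothetical:

	/* Hypothetical caller, not part of the patch. "Set" is the weaker
	 * check: it also accepts EVMPTR_MAP_PENDING, i.e. a pointer announced
	 * by L1 whose page has not been (re)mapped yet.
	 */
	static void example_evmptr12_states(struct vcpu_vmx *vmx)
	{
		if (nested_vmx_is_evmptr12_valid(vmx)) {
			/* eVMCS is mapped and safe to dereference. */
		} else if (nested_vmx_is_evmptr12_set(vmx)) {
			/* Valid but pending, e.g. right after migration. */
		} else {
			/* No enlightened VMCS in use by L1. */
		}
	}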
Signed-off-by: Vitaly Kuznetsov Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Link: https://lore.kernel.org/r/20231205103630.1391318-14-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/hyperv.h | 30 ++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/nested.c | 38 +++++++++++++++++++------------------- arch/x86/kvm/vmx/nested.h | 2 +- 3 files changed, 50 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 0e90ef4efe34..71e90a16f183 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -22,6 +22,21 @@ static inline bool evmptr_is_valid(u64 evmptr) return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING; } +static inline bool nested_vmx_is_evmptr12_valid(struct vcpu_vmx *vmx) +{ + return evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); +} + +static inline bool evmptr_is_set(u64 evmptr) +{ + return evmptr != EVMPTR_INVALID; +} + +static inline bool nested_vmx_is_evmptr12_set(struct vcpu_vmx *vmx) +{ + return evmptr_is_set(vmx->nested.hv_evmcs_vmptr); +} + static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu) { /* @@ -45,6 +60,21 @@ static inline bool evmptr_is_valid(u64 evmptr) { return false; } + +static inline bool nested_vmx_is_evmptr12_valid(struct vcpu_vmx *vmx) +{ + return false; +} + +static inline bool evmptr_is_set(u64 evmptr) +{ + return false; +} + +static inline bool nested_vmx_is_evmptr12_set(struct vcpu_vmx *vmx) +{ + return false; +} #endif #endif /* __KVM_X86_VMX_HYPERV_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 01a94d290c12..0507174750e0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -179,7 +179,7 @@ static int nested_vmx_failValid(struct kvm_vcpu *vcpu, * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all * fields and thus must be synced. */ - if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID) + if (nested_vmx_is_evmptr12_set(to_vmx(vcpu))) to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; return kvm_skip_emulated_instruction(vcpu); @@ -194,7 +194,7 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) * can't be done if there isn't a current VMCS. 
*/ if (vmx->nested.current_vmptr == INVALID_GPA && - !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + !nested_vmx_is_evmptr12_valid(vmx)) return nested_vmx_failInvalid(vcpu); return nested_vmx_failValid(vcpu, vm_instruction_error); @@ -230,7 +230,7 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + if (nested_vmx_is_evmptr12_valid(vmx)) { kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); vmx->nested.hv_evmcs = NULL; } @@ -2123,7 +2123,7 @@ void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (nested_vmx_is_evmptr12_valid(vmx)) copy_vmcs12_to_enlightened(vmx); else copy_vmcs12_to_shadow(vmx); @@ -2277,7 +2277,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); - if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) prepare_vmcs02_early_rare(vmx, vmcs12); /* @@ -2572,11 +2572,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, struct vcpu_vmx *vmx = to_vmx(vcpu); bool load_guest_pdptrs_vmcs12 = false; - if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { prepare_vmcs02_rare(vmx, vmcs12); vmx->nested.dirty_vmcs12 = false; - load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) || + load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || !(vmx->nested.hv_evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); } @@ -2699,7 +2699,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * bits when it changes a field in eVMCS. Mark all fields as clean * here. 
*/ - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (nested_vmx_is_evmptr12_valid(vmx)) vmx->nested.hv_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; @@ -3579,7 +3579,7 @@ vmentry_fail_vmexit: load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = exit_reason.full; - if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) vmx->nested.need_vmcs12_to_shadow_sync = true; return NVMX_VMENTRY_VMEXIT; } @@ -3610,7 +3610,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) return nested_vmx_failInvalid(vcpu); - if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && + if (CC(!nested_vmx_is_evmptr12_valid(vmx) && vmx->nested.current_vmptr == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); @@ -3625,7 +3625,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (CC(vmcs12->hdr.shadow_vmcs)) return nested_vmx_failInvalid(vcpu); - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + if (nested_vmx_is_evmptr12_valid(vmx)) { copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); /* Enlightened VMCS doesn't have launch state */ vmcs12->launch_state = !launch; @@ -4370,11 +4370,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (nested_vmx_is_evmptr12_valid(vmx)) sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->nested.need_sync_vmcs02_to_vmcs12_rare = - !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); + !nested_vmx_is_evmptr12_valid(vmx); vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); @@ -4897,7 +4897,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, } if ((vm_exit_reason != -1) && - (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) + (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ @@ -5390,7 +5390,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* Decode instruction info and find the field to read */ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); - if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + if (!nested_vmx_is_evmptr12_valid(vmx)) { /* * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMREAD sets the ALU flags for VMfailInvalid. 
@@ -5616,7 +5616,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); /* Forbid normal VMPTRLD if Enlightened version was used */ - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (nested_vmx_is_evmptr12_valid(vmx)) return 1; if (vmx->nested.current_vmptr != vmptr) { @@ -5679,7 +5679,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) + if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) return 1; if (get_vmx_mem_address(vcpu, exit_qual, instr_info, @@ -6467,7 +6467,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ - if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID) + if (nested_vmx_is_evmptr12_set(vmx)) kvm_state.flags |= KVM_STATE_NESTED_EVMCS; if (is_guest_mode(vcpu) && @@ -6523,7 +6523,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, } else { copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); if (!vmx->nested.need_vmcs12_to_shadow_sync) { - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (nested_vmx_is_evmptr12_valid(vmx)) /* * L1 hypervisor is not obliged to keep eVMCS * clean fields data always up-to-date while diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index b0f2e26c1aea..cce4e2aa30fb 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -58,7 +58,7 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ return vmx->nested.current_vmptr != -1ull || - vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID; + nested_vmx_is_evmptr12_set(vmx); } static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From c98842b26c233318bb18c77cc6e25859fe76c80e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:28 +0100 Subject: KVM: nVMX: Introduce accessor to get Hyper-V eVMCS pointer There's a number of 'vmx->nested.hv_evmcs' accesses in nested.c, introduce 'nested_vmx_evmcs()' accessor to hide them all in !CONFIG_KVM_HYPERV case. No functional change intended. 
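For illustration, a short sketch of the usage pattern the accessor encourages; the accessor itself is defined in the hyperv.h hunk below, and the fragment here is a simplified variant of the nested_vmx_prepare_msr_bitmap() change:

	/* Fetch the pointer once and test it for NULL; in !CONFIG_KVM_HYPERV
	 * builds the accessor simply returns NULL, so the check is trivially
	 * false.
	 */
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

	if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap)
		return true;	/* MSR bitmap unchanged, reuse it */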
Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20231205103630.1391318-15-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/hyperv.h | 10 ++++++++++ arch/x86/kvm/vmx/nested.c | 33 ++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 71e90a16f183..a87407412615 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -37,6 +37,11 @@ static inline bool nested_vmx_is_evmptr12_set(struct vcpu_vmx *vmx) return evmptr_is_set(vmx->nested.hv_evmcs_vmptr); } +static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx) +{ + return vmx->nested.hv_evmcs; +} + static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu) { /* @@ -75,6 +80,11 @@ static inline bool nested_vmx_is_evmptr12_set(struct vcpu_vmx *vmx) { return false; } + +static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx) +{ + return NULL; +} #endif #endif /* __KVM_X86_VMX_HYPERV_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0507174750e0..4e872863a0c9 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -263,7 +263,7 @@ static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) !evmptr_is_valid(nested_get_evmptr(vcpu))) return false; - if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) + if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr) nested_release_evmcs(vcpu); return true; @@ -601,7 +601,6 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, int msr; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; /* Nothing to do if the MSR bitmap is not in use. */ @@ -617,10 +616,13 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature * and tells KVM (L0) there were no changes in MSR bitmap for L2. 
*/ - if (!vmx->nested.force_msr_bitmap_recalc && evmcs && - evmcs->hv_enlightenments_control.msr_bitmap && - evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) - return true; + if (!vmx->nested.force_msr_bitmap_recalc) { + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); + + if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap && + evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) + return true; + } if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) return false; @@ -1603,7 +1605,7 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields { #ifdef CONFIG_KVM_HYPERV struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ @@ -1851,7 +1853,7 @@ static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) { #ifdef CONFIG_KVM_HYPERV struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); /* * Should not be changed by KVM: @@ -2438,7 +2440,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { - struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; + struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { @@ -2570,6 +2572,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, enum vm_entry_failure_code *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); bool load_guest_pdptrs_vmcs12 = false; if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { @@ -2577,8 +2580,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx->nested.dirty_vmcs12 = false; load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || - !(vmx->nested.hv_evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); } if (vmx->nested.nested_run_pending && @@ -2700,8 +2702,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * here. 
*/ if (nested_vmx_is_evmptr12_valid(vmx)) - vmx->nested.hv_evmcs->hv_clean_fields |= - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; return 0; } @@ -3626,7 +3627,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return nested_vmx_failInvalid(vcpu); if (nested_vmx_is_evmptr12_valid(vmx)) { - copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); + + copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); /* Enlightened VMCS doesn't have launch state */ vmcs12->launch_state = !launch; } else if (enable_shadow_vmcs) { @@ -5428,7 +5431,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* Read the field, zero-extended to a u64 value */ - value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset); + value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); } /* -- cgit v1.2.3 From 5a30f97683af802c0fa24d1cfc339f87cdb6791b Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:29 +0100 Subject: KVM: nVMX: Hide more stuff under CONFIG_KVM_HYPERV 'hv_evmcs_vmptr'/'hv_evmcs_map'/'hv_evmcs' fields in 'struct nested_vmx' should not be used when !CONFIG_KVM_HYPERV, hide them when !CONFIG_KVM_HYPERV. No functional change intended. Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20231205103630.1391318-16-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 3 +++ arch/x86/kvm/vmx/vmx.h | 2 ++ 3 files changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 4e872863a0c9..cf47b8b7f40f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6667,6 +6667,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); +#ifdef CONFIG_KVM_HYPERV } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { /* * nested_vmx_handle_enlightened_vmptrld() cannot be called @@ -6676,6 +6677,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, */ vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); +#endif } else { return -EINVAL; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3ff5c44dff9d..a26603ddc968 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4828,7 +4828,10 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->nested.posted_intr_nv = -1; vmx->nested.vmxon_ptr = INVALID_GPA; vmx->nested.current_vmptr = INVALID_GPA; + +#ifdef CONFIG_KVM_HYPERV vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; +#endif vcpu->arch.microcode_version = 0x100000000ULL; vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 959c6d94287f..8fe6eb2b4a34 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -241,9 +241,11 @@ struct nested_vmx { bool guest_mode; } smm; +#ifdef CONFIG_KVM_HYPERV gpa_t hv_evmcs_vmptr; struct kvm_host_map hv_evmcs_map; struct hv_enlightened_vmcs *hv_evmcs; +#endif }; struct vcpu_vmx { -- cgit v1.2.3 From 017a99a966f1183e611f0b0fa6bec40160c81813 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 5 Dec 2023 11:36:30 +0100 Subject: KVM: nSVM: Hide more stuff under CONFIG_KVM_HYPERV/CONFIG_HYPERV 'struct 
hv_vmcb_enlightenments' in VMCB only make sense when either CONFIG_KVM_HYPERV or CONFIG_HYPERV is enabled. No functional change intended. Reviewed-by: Maxim Levitsky Tested-by: Jeremi Piotrowski Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20231205103630.1391318-17-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 20 ++++++++++++++------ arch/x86/kvm/svm/svm.h | 2 ++ 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 74c04102ef01..20212aac050b 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -187,7 +187,6 @@ void recalc_intercepts(struct vcpu_svm *svm) */ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { - struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; int i; /* @@ -198,11 +197,16 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) * - Nested hypervisor (L1) is using Hyper-V emulation interface and * tells KVM (L0) there were no changes in MSR bitmap for L2. */ - if (!svm->nested.force_msr_bitmap_recalc && - kvm_hv_hypercall_enabled(&svm->vcpu) && - hve->hv_enlightenments_control.msr_bitmap && - (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS))) - goto set_msrpm_base_pa; +#ifdef CONFIG_KVM_HYPERV + if (!svm->nested.force_msr_bitmap_recalc) { + struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; + + if (kvm_hv_hypercall_enabled(&svm->vcpu) && + hve->hv_enlightenments_control.msr_bitmap && + (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS))) + goto set_msrpm_base_pa; + } +#endif if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; @@ -230,7 +234,9 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) svm->nested.force_msr_bitmap_recalc = false; +#ifdef CONFIG_KVM_HYPERV set_msrpm_base_pa: +#endif svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); return true; @@ -378,12 +384,14 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, to->msrpm_base_pa &= ~0x0fffULL; to->iopm_base_pa &= ~0x0fffULL; +#ifdef CONFIG_KVM_HYPERV /* Hyper-V extensions (Enlightened VMCB) */ if (kvm_hv_hypercall_enabled(vcpu)) { to->clean = from->clean; memcpy(&to->hv_enlightenments, &from->hv_enlightenments, sizeof(to->hv_enlightenments)); } +#endif } void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index be67ab7fdd10..59adff7bbf55 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -148,7 +148,9 @@ struct vmcb_ctrl_area_cached { u64 virt_ext; u32 clean; union { +#if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV) struct hv_vmcb_enlightenments hv_enlightenments; +#endif u8 reserved_sw[32]; }; }; -- cgit v1.2.3 From 6d72283526090850274d065cd5d60af732cc5fc8 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 2 Nov 2023 16:21:28 +0000 Subject: KVM x86/xen: add an override for PVCLOCK_TSC_STABLE_BIT Unless explicitly told to do so (by passing 'clocksource=tsc' and 'tsc=stable:socket', and then jumping through some hoops concerning potential CPU hotplug) Xen will never use TSC as its clocksource. Hence, by default, a Xen guest will not see PVCLOCK_TSC_STABLE_BIT set in either the primary or secondary pvclock memory areas. This has led to bugs in some guest kernels which only become evident if PVCLOCK_TSC_STABLE_BIT *is* set in the pvclocks. 
Hence, to support such guests, give the VMM a new Xen HVM config flag to tell KVM to forcibly clear the bit in the Xen pvclocks. Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20231102162128.2353459-1-paul@xen.org Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 6 ++++++ arch/x86/kvm/x86.c | 28 +++++++++++++++++++++++----- arch/x86/kvm/xen.c | 9 ++++++++- include/uapi/linux/kvm.h | 1 + 4 files changed, 38 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 926241e23aeb..dca83c65d97f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8562,6 +8562,7 @@ PVHVM guests. Valid flags are:: #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) + #define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG ioctl is available, for the guest to set its hypercall page. @@ -8605,6 +8606,11 @@ behave more correctly, not using the XEN_RUNSTATE_UPDATE flag until/unless specifically enabled (by the guest making the hypercall, causing the VMM to enable the KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG attribute). +The KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE flag indicates that KVM supports +clearing the PVCLOCK_TSC_STABLE_BIT flag in Xen pvclock sources. This will be +done when the KVM_CAP_XEN_HVM ioctl sets the +KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE flag. + 8.31 KVM_CAP_PPC_MULTITCE ------------------------- diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d0772b47041..aa7cea9600b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3104,7 +3104,8 @@ u64 get_kvmclock_ns(struct kvm *kvm) static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, struct gfn_to_pfn_cache *gpc, - unsigned int offset) + unsigned int offset, + bool force_tsc_unstable) { struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info *guest_hv_clock; @@ -3141,6 +3142,10 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, } memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock)); + + if (force_tsc_unstable) + guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT; + smp_wmb(); guest_hv_clock->version = ++vcpu->hv_clock.version; @@ -3161,6 +3166,16 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) u64 tsc_timestamp, host_tsc; u8 pvclock_flags; bool use_master_clock; +#ifdef CONFIG_KVM_XEN + /* + * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless + * explicitly told to use TSC as its clocksource Xen will not set this bit. + * This default behaviour led to bugs in some guest kernels which cause + * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags. 
+ */ + bool xen_pvclock_tsc_unstable = + ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE; +#endif kernel_ns = 0; host_tsc = 0; @@ -3239,13 +3254,15 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.flags = pvclock_flags; if (vcpu->pv_time.active) - kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0); + kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false); #ifdef CONFIG_KVM_XEN if (vcpu->xen.vcpu_info_cache.active) kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache, - offsetof(struct compat_vcpu_info, time)); + offsetof(struct compat_vcpu_info, time), + xen_pvclock_tsc_unstable); if (vcpu->xen.vcpu_time_info_cache.active) - kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0); + kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0, + xen_pvclock_tsc_unstable); #endif kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; @@ -4646,7 +4663,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO | KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | - KVM_XEN_HVM_CONFIG_EVTCHN_SEND; + KVM_XEN_HVM_CONFIG_EVTCHN_SEND | + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE; if (sched_info_on()) r |= KVM_XEN_HVM_CONFIG_RUNSTATE | KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index e53fad915a62..e43948b87f94 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1162,7 +1162,9 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) { /* Only some feature flags need to be *enabled* by userspace */ u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | - KVM_XEN_HVM_CONFIG_EVTCHN_SEND; + KVM_XEN_HVM_CONFIG_EVTCHN_SEND | + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE; + u32 old_flags; if (xhc->flags & ~permitted_flags) return -EINVAL; @@ -1183,9 +1185,14 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) else if (!xhc->msr && kvm->arch.xen_hvm_config.msr) static_branch_slow_dec_deferred(&kvm_xen_enabled); + old_flags = kvm->arch.xen_hvm_config.flags; memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc)); mutex_unlock(&kvm->arch.xen.xen_lock); + + if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) + kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); + return 0; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e9cb2df67a1d..175420b26e36 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1318,6 +1318,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) +#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) struct kvm_xen_hvm_config { __u32 flags; -- cgit v1.2.3 From 8132d887a7023b212f242a51ae89281c69fde996 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 18 Oct 2023 12:11:56 -0400 Subject: KVM: remove CONFIG_HAVE_KVM_EVENTFD virt/kvm/eventfd.c is compiled unconditionally, meaning that the ioeventfds member of struct kvm is accessed unconditionally. CONFIG_HAVE_KVM_EVENTFD therefore must be defined for KVM common code to compile successfully, remove it. 
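For illustration, a rough sketch of why the option is effectively mandatory: the unconditionally-built virt/kvm/eventfd.c touches struct kvm members unconditionally, e.g. in its init path (paraphrased from kvm_eventfd_init(), visible in a later hunk in this series):

	void kvm_eventfd_init(struct kvm *kvm)
	{
		/* irqfd state stays guarded by the irqchip/irqfd option ... */
		INIT_LIST_HEAD(&kvm->ioeventfds);	/* must always exist */
	}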
Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/Kconfig | 1 - arch/loongarch/kvm/Kconfig | 1 - arch/mips/kvm/Kconfig | 1 - arch/powerpc/kvm/Kconfig | 1 - arch/riscv/kvm/Kconfig | 1 - arch/s390/kvm/Kconfig | 1 - arch/x86/kvm/Kconfig | 1 - include/linux/kvm_host.h | 30 +----------------------------- virt/kvm/Kconfig | 5 +---- 9 files changed, 2 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 1a777715199f..87bd96c1254f 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -29,7 +29,6 @@ menuconfig KVM select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_XFER_TO_GUEST_WORK select KVM_VFIO - select HAVE_KVM_EVENTFD select HAVE_KVM_IRQFD select HAVE_KVM_DIRTY_RING_ACQ_REL select NEED_KVM_DIRTY_RING_WITH_BITMAP diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index f22bae89b07d..daba4cd5e87d 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM depends on AS_HAS_LVZ_EXTENSION depends on HAVE_KVM select HAVE_KVM_DIRTY_RING_ACQ_REL - select HAVE_KVM_EVENTFD select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index c04987d2ed2e..428141b0b48f 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM select EXPORT_UASM select PREEMPT_NOTIFIERS select KVM_GENERIC_DIRTYLOG_READ_PROTECT - select HAVE_KVM_EVENTFD select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO select KVM_GENERIC_MMU_NOTIFIER diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index b33358ee6424..0f4e6e7ba35d 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM bool select PREEMPT_NOTIFIERS - select HAVE_KVM_EVENTFD select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_VFIO select IRQ_BYPASS_MANAGER diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index ae2e05f050ec..be264b78487d 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)" depends on RISCV_SBI && MMU - select HAVE_KVM_EVENTFD select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 45fdf2a9b2e3..ed567b858535 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -23,7 +23,6 @@ config KVM select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_VCPU_ASYNC_IOCTL - select HAVE_KVM_EVENTFD select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC select HAVE_KVM_IRQCHIP diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index c1716e83d176..088c66bee98f 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -33,7 +33,6 @@ config KVM select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING - select HAVE_KVM_EVENTFD select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ea1523a7b83a..3fe5a6be7768 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -782,7 +782,6 @@ struct kvm { struct list_head vm_list; struct mutex lock; struct kvm_io_bus __rcu *buses[KVM_NR_BUSES]; -#ifdef CONFIG_HAVE_KVM_EVENTFD struct { spinlock_t lock; struct list_head items; @@ -791,7 +790,6 @@ struct kvm { struct mutex resampler_lock; } irqfds; struct list_head ioeventfds; -#endif 
struct kvm_vm_stat stat; struct kvm_arch arch; refcount_t users_count; @@ -2056,8 +2054,6 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {} int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); -#ifdef CONFIG_HAVE_KVM_EVENTFD - void kvm_eventfd_init(struct kvm *kvm); int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); @@ -2082,31 +2078,7 @@ static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm, { return false; } -#endif - -#else - -static inline void kvm_eventfd_init(struct kvm *kvm) {} - -static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) -{ - return -EINVAL; -} - -static inline void kvm_irqfd_release(struct kvm *kvm) {} - -#ifdef CONFIG_HAVE_KVM_IRQCHIP -static inline void kvm_irq_routing_update(struct kvm *kvm) -{ -} -#endif - -static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) -{ - return -ENOSYS; -} - -#endif /* CONFIG_HAVE_KVM_EVENTFD */ +#endif /* CONFIG_HAVE_KVM_IRQFD */ void kvm_arch_irq_routing_update(struct kvm *kvm); diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 2c964586aa14..72e0cac864f9 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -3,6 +3,7 @@ config HAVE_KVM bool + select EVENTFD config HAVE_KVM_PFNCACHE bool @@ -39,10 +40,6 @@ config NEED_KVM_DIRTY_RING_WITH_BITMAP bool depends on HAVE_KVM_DIRTY_RING -config HAVE_KVM_EVENTFD - bool - select EVENTFD - config KVM_MMIO bool -- cgit v1.2.3 From c5b31cc2371728ddefe9baf1d036aeb630a25d96 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 18 Oct 2023 12:07:32 -0400 Subject: KVM: remove CONFIG_HAVE_KVM_IRQFD All platforms with a kernel irqchip have support for irqfd. Unify the two configuration items so that userspace can expect to use irqfd to inject interrupts into the irqchip. 
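For illustration, the resulting guard pattern, sketched from the kvm_host.h hunk below:

	/* What used to be #ifdef CONFIG_HAVE_KVM_IRQFD is now keyed on the
	 * irqchip option, since every in-kernel irqchip provides irqfd.
	 */
	#ifdef CONFIG_HAVE_KVM_IRQCHIP
	int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
	void kvm_irqfd_release(struct kvm *kvm);
	#endif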
Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/Kconfig | 1 - arch/powerpc/kvm/Kconfig | 2 -- arch/powerpc/kvm/powerpc.c | 2 +- arch/riscv/kvm/Kconfig | 1 - arch/s390/kvm/Kconfig | 1 - arch/x86/kvm/Kconfig | 1 - include/linux/kvm_host.h | 9 ++++----- include/trace/events/kvm.h | 8 ++++---- virt/kvm/Kconfig | 3 --- virt/kvm/eventfd.c | 6 +++--- virt/kvm/kvm_main.c | 4 ++-- 11 files changed, 14 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 87bd96c1254f..b07c60c9737d 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -29,7 +29,6 @@ menuconfig KVM select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_XFER_TO_GUEST_WORK select KVM_VFIO - select HAVE_KVM_IRQFD select HAVE_KVM_DIRTY_RING_ACQ_REL select NEED_KVM_DIRTY_RING_WITH_BITMAP select HAVE_KVM_MSI diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 0f4e6e7ba35d..b47196085a42 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -224,7 +224,6 @@ config KVM_MPIC bool "KVM in-kernel MPIC emulation" depends on KVM && PPC_E500 select HAVE_KVM_IRQCHIP - select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_MSI help @@ -237,7 +236,6 @@ config KVM_XICS bool "KVM in-kernel XICS emulation" depends on KVM_BOOK3S_64 && !KVM_MPIC select HAVE_KVM_IRQCHIP - select HAVE_KVM_IRQFD default y help Include support for the XICS (eXternal Interrupt Controller diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index c39945a7fce3..c3f82e238b70 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -578,7 +578,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif -#ifdef CONFIG_HAVE_KVM_IRQFD +#ifdef CONFIG_HAVE_KVM_IRQCHIP case KVM_CAP_IRQFD_RESAMPLE: r = !xive_enabled(); break; diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index be264b78487d..2b0b51035302 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -21,7 +21,6 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)" depends on RISCV_SBI && MMU select HAVE_KVM_IRQCHIP - select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_MSI select HAVE_KVM_VCPU_ASYNC_IOCTL diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index ed567b858535..bb6d90351119 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -26,7 +26,6 @@ config KVM select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC select HAVE_KVM_IRQCHIP - select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_INVALID_WAKEUPS select HAVE_KVM_NO_POLL diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 088c66bee98f..b07247b0b958 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -27,7 +27,6 @@ config KVM select KVM_GENERIC_MMU_NOTIFIER select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE - select HAVE_KVM_IRQFD select HAVE_KVM_DIRTY_RING_TSO select HAVE_KVM_DIRTY_RING_ACQ_REL select IRQ_BYPASS_MANAGER diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3fe5a6be7768..1bba24a13ec9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -805,8 +805,7 @@ struct kvm { * Update side is protected by irq_lock. 
*/ struct kvm_irq_routing_table __rcu *irq_routing; -#endif -#ifdef CONFIG_HAVE_KVM_IRQFD + struct hlist_head irq_ack_notifier_list; #endif @@ -996,7 +995,7 @@ static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) } #endif -#ifdef CONFIG_HAVE_KVM_IRQFD +#ifdef CONFIG_HAVE_KVM_IRQCHIP int kvm_irqfd_init(void); void kvm_irqfd_exit(void); #else @@ -2057,7 +2056,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); void kvm_eventfd_init(struct kvm *kvm); int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); -#ifdef CONFIG_HAVE_KVM_IRQFD +#ifdef CONFIG_HAVE_KVM_IRQCHIP int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); void kvm_irqfd_release(struct kvm *kvm); bool kvm_notify_irqfd_resampler(struct kvm *kvm, @@ -2078,7 +2077,7 @@ static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm, { return false; } -#endif /* CONFIG_HAVE_KVM_IRQFD */ +#endif /* CONFIG_HAVE_KVM_IRQCHIP */ void kvm_arch_irq_routing_update(struct kvm *kvm); diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 3bd31ea23fee..011fba6b5552 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -62,7 +62,7 @@ TRACE_EVENT(kvm_vcpu_wakeup, __entry->valid ? "valid" : "invalid") ); -#if defined(CONFIG_HAVE_KVM_IRQFD) +#if defined(CONFIG_HAVE_KVM_IRQCHIP) TRACE_EVENT(kvm_set_irq, TP_PROTO(unsigned int gsi, int level, int irq_source_id), TP_ARGS(gsi, level, irq_source_id), @@ -82,7 +82,7 @@ TRACE_EVENT(kvm_set_irq, TP_printk("gsi %u level %d source %d", __entry->gsi, __entry->level, __entry->irq_source_id) ); -#endif /* defined(CONFIG_HAVE_KVM_IRQFD) */ +#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ #if defined(__KVM_HAVE_IOAPIC) #define kvm_deliver_mode \ @@ -170,7 +170,7 @@ TRACE_EVENT(kvm_msi_set_irq, #endif /* defined(__KVM_HAVE_IOAPIC) */ -#if defined(CONFIG_HAVE_KVM_IRQFD) +#if defined(CONFIG_HAVE_KVM_IRQCHIP) #ifdef kvm_irqchips #define kvm_ack_irq_string "irqchip %s pin %u" @@ -197,7 +197,7 @@ TRACE_EVENT(kvm_ack_irq, TP_printk(kvm_ack_irq_string, kvm_ack_irq_parm) ); -#endif /* defined(CONFIG_HAVE_KVM_IRQFD) */ +#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 72e0cac864f9..6793211a0b64 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -11,9 +11,6 @@ config HAVE_KVM_PFNCACHE config HAVE_KVM_IRQCHIP bool -config HAVE_KVM_IRQFD - bool - config HAVE_KVM_IRQ_ROUTING bool diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 89912a17f5d5..19534156d48c 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -28,7 +28,7 @@ #include -#ifdef CONFIG_HAVE_KVM_IRQFD +#ifdef CONFIG_HAVE_KVM_IRQCHIP static struct workqueue_struct *irqfd_cleanup_wq; @@ -531,7 +531,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, void kvm_eventfd_init(struct kvm *kvm) { -#ifdef CONFIG_HAVE_KVM_IRQFD +#ifdef CONFIG_HAVE_KVM_IRQCHIP spin_lock_init(&kvm->irqfds.lock); INIT_LIST_HEAD(&kvm->irqfds.items); INIT_LIST_HEAD(&kvm->irqfds.resampler_list); @@ -540,7 +540,7 @@ kvm_eventfd_init(struct kvm *kvm) INIT_LIST_HEAD(&kvm->ioeventfds); } -#ifdef CONFIG_HAVE_KVM_IRQFD +#ifdef CONFIG_HAVE_KVM_IRQCHIP /* * shutdown any irqfd's that match fd+gsi */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8758cb799e18..a20cf1f9ad29 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1273,7 +1273,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) if (r) goto out_err_no_disable; -#ifdef CONFIG_HAVE_KVM_IRQFD +#ifdef CONFIG_HAVE_KVM_IRQCHIP 
INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); #endif @@ -4826,7 +4826,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #ifdef CONFIG_HAVE_KVM_MSI case KVM_CAP_SIGNAL_MSI: #endif -#ifdef CONFIG_HAVE_KVM_IRQFD +#ifdef CONFIG_HAVE_KVM_IRQCHIP case KVM_CAP_IRQFD: #endif case KVM_CAP_IOEVENTFD_ANY_LENGTH: -- cgit v1.2.3 From 1565c881c3df053447309ff69ec7fd5dee2085e4 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:45 +0000 Subject: KVM: arm64: Explicitly trap unsupported HFGxTR_EL2 features Do not rely on the value of __HFGRTR_EL2_nMASK to trap unsupported features, since the nMASK can (and will) change as new traps are added and as its value is updated. Instead, explicitly specify the trap bits. Suggested-by: Joey Gouly Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231214100158.2305400-6-tabba@google.com --- arch/arm64/kvm/hyp/include/hyp/switch.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index f99d8af0b9af..7b4909dfd1f5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -110,12 +110,15 @@ static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) compute_clr_set(vcpu, HFGWTR_EL2, w_clr, w_set); } - /* The default is not to trap anything but ACCDATA_EL1 */ - r_val = __HFGRTR_EL2_nMASK & ~HFGxTR_EL2_nACCDATA_EL1; + /* The default to trap everything not handled or supported in KVM. */ + tmp = HFGxTR_EL2_nAMAIR2_EL1 | HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nS2POR_EL1 | + HFGxTR_EL2_nPOR_EL1 | HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nACCDATA_EL1; + + r_val = __HFGRTR_EL2_nMASK & ~tmp; r_val |= r_set; r_val &= ~r_clr; - w_val = __HFGWTR_EL2_nMASK & ~HFGxTR_EL2_nACCDATA_EL1; + w_val = __HFGWTR_EL2_nMASK & ~tmp; w_val |= w_set; w_val &= ~w_clr; -- cgit v1.2.3 From 9d400eb722bd1be712b007149ff1d8fb2d6470db Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:46 +0000 Subject: KVM: arm64: Add missing HFGxTR_EL2 FGT entries to nested virt Add the missing nested virt FGT table entries HFGxTR_EL2. Based on DDI0601 2023-09. 
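As an illustrative note on the table format used below (the polarity semantics are inferred from the existing entries, not spelled out in this patch):

	/*
	 * SR_FGT(<system register>, <FGT register pair>, <trap bit>, <polarity>)
	 *
	 * Polarity 1: the access traps when the bit is set, e.g.
	 *	SR_FGT(SYS_ERXADDR_EL1, HFGxTR, ERXADDR_EL1, 1)
	 * Polarity 0: used for the negative-polarity "nXXX" bits, which trap
	 * the access when the bit is clear, e.g.
	 *	SR_FGT(SYS_AMAIR2_EL1, HFGxTR, nAMAIR2_EL1, 0)
	 */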
Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231214100158.2305400-7-tabba@google.com --- arch/arm64/kvm/emulate-nested.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 06185216a297..8b473a1bbc11 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -1042,10 +1042,20 @@ enum fg_filter_id { static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { /* HFGRTR_EL2, HFGWTR_EL2 */ + SR_FGT(SYS_AMAIR2_EL1, HFGxTR, nAMAIR2_EL1, 0), + SR_FGT(SYS_MAIR2_EL1, HFGxTR, nMAIR2_EL1, 0), + SR_FGT(SYS_S2POR_EL1, HFGxTR, nS2POR_EL1, 0), + SR_FGT(SYS_POR_EL1, HFGxTR, nPOR_EL1, 0), + SR_FGT(SYS_POR_EL0, HFGxTR, nPOR_EL0, 0), SR_FGT(SYS_PIR_EL1, HFGxTR, nPIR_EL1, 0), SR_FGT(SYS_PIRE0_EL1, HFGxTR, nPIRE0_EL1, 0), + SR_FGT(SYS_RCWMASK_EL1, HFGxTR, nRCWMASK_EL1, 0), SR_FGT(SYS_TPIDR2_EL0, HFGxTR, nTPIDR2_EL0, 0), SR_FGT(SYS_SMPRI_EL1, HFGxTR, nSMPRI_EL1, 0), + SR_FGT(SYS_GCSCR_EL1, HFGxTR, nGCS_EL1, 0), + SR_FGT(SYS_GCSPR_EL1, HFGxTR, nGCS_EL1, 0), + SR_FGT(SYS_GCSCRE0_EL1, HFGxTR, nGCS_EL0, 0), + SR_FGT(SYS_GCSPR_EL0, HFGxTR, nGCS_EL0, 0), SR_FGT(SYS_ACCDATA_EL1, HFGxTR, nACCDATA_EL1, 0), SR_FGT(SYS_ERXADDR_EL1, HFGxTR, ERXADDR_EL1, 1), SR_FGT(SYS_ERXPFGCDN_EL1, HFGxTR, ERXPFGCDN_EL1, 1), -- cgit v1.2.3 From 863ac38984a822ff9f4337d70853d771dcf7aae5 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:47 +0000 Subject: KVM: arm64: Add missing HFGITR_EL2 FGT entries to nested virt Add the missing nested virt FGT table entries HFGITR_EL2. Based on DDI0601 and DDI0602 2023-09. Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231214100158.2305400-8-tabba@google.com --- arch/arm64/kvm/emulate-nested.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 8b473a1bbc11..89901550db34 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -1117,6 +1117,11 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { SR_FGT(SYS_AFSR1_EL1, HFGxTR, AFSR1_EL1, 1), SR_FGT(SYS_AFSR0_EL1, HFGxTR, AFSR0_EL1, 1), /* HFGITR_EL2 */ + SR_FGT(OP_AT_S1E1A, HFGITR, ATS1E1A, 1), + SR_FGT(OP_COSP_RCTX, HFGITR, COSPRCTX, 1), + SR_FGT(OP_GCSPUSHX, HFGITR, nGCSEPP, 0), + SR_FGT(OP_GCSPOPX, HFGITR, nGCSEPP, 0), + SR_FGT(OP_GCSPUSHM, HFGITR, nGCSPUSHM_EL1, 0), SR_FGT(OP_BRB_IALL, HFGITR, nBRBIALL, 0), SR_FGT(OP_BRB_INJ, HFGITR, nBRBINJ, 0), SR_FGT(SYS_DC_CVAC, HFGITR, DCCVAC, 1), -- cgit v1.2.3 From f9d6ed0213021ea00af30efbfa33e9a06c0610f2 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:48 +0000 Subject: KVM: arm64: Add bit masks for HAFGRTR_EL2 To support HAFGRTR_EL2 supported in nested virt in the following patch, first add its bitmask definitions based on DDI0601 2023-09. 
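As a quick illustration of the mask convention (a sketch, not part of the patch): RES0, positive-polarity and negative-polarity bits are meant to partition the 64-bit register, which a hypothetical helper could assert in the same way as the validation added later in this series:

	/* Hypothetical compile-time check, mirroring the CHECK_FGT_MASKS()
	 * macro that a later patch in this series adds to hyp/switch.h.
	 */
	static inline void hafgrtr_masks_are_consistent(void)
	{
		BUILD_BUG_ON(__HAFGRTR_EL2_MASK & __HAFGRTR_EL2_nMASK);
		BUILD_BUG_ON(~(__HAFGRTR_EL2_RES0 ^ __HAFGRTR_EL2_MASK ^
			       __HAFGRTR_EL2_nMASK));
	}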
Reviewed-by: Mark Brown Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231214100158.2305400-9-tabba@google.com --- arch/arm64/include/asm/kvm_arm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index b85f46a73e21..7de0a7062625 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -370,6 +370,10 @@ #define __HDFGWTR_EL2_MASK ~__HDFGWTR_EL2_nMASK #define __HDFGWTR_EL2_nMASK GENMASK(62, 60) +#define __HAFGRTR_EL2_RES0 (GENMASK(63, 50) | GENMASK(16, 5)) +#define __HAFGRTR_EL2_MASK (GENMASK(49, 17) | GENMASK(4, 0)) +#define __HAFGRTR_EL2_nMASK 0UL + /* Similar definitions for HCRX_EL2 */ #define __HCRX_EL2_RES0 (GENMASK(63, 16) | GENMASK(13, 12)) #define __HCRX_EL2_MASK (0) -- cgit v1.2.3 From 676f482354886caa9b0cfa9236f5d20ac78f8c6a Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:49 +0000 Subject: KVM: arm64: Handle HAFGRTR_EL2 trapping in nested virt Add the encodings to fine grain trapping fields for HAFGRTR_EL2 and add the associated handling code in nested virt. Based on DDI0601 2023-09. Add the missing field definitions as well, both to generate the correct RES0 mask and to be able to toggle their FGT bits. Also add the code for handling FGT trapping, reading of the register, to nested virt. Reviewed-by: Mark Brown Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231214100158.2305400-10-tabba@google.com --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/emulate-nested.c | 48 +++++++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/include/hyp/switch.h | 25 +++++++++++++++++ arch/arm64/kvm/sys_regs.c | 1 + 4 files changed, 75 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 824f29f04916..ba14648e2de2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -443,6 +443,7 @@ enum vcpu_sysreg { HFGITR_EL2, HDFGRTR_EL2, HDFGWTR_EL2, + HAFGRTR_EL2, CNTHP_CTL_EL2, CNTHP_CVAL_EL2, CNTHV_CTL_EL2, diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 89901550db34..431fd429932d 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -1012,6 +1012,7 @@ enum fgt_group_id { HDFGRTR_GROUP, HDFGWTR_GROUP, HFGITR_GROUP, + HAFGRTR_GROUP, /* Must be last */ __NR_FGT_GROUP_IDS__ @@ -1689,6 +1690,49 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { SR_FGT(SYS_PMCR_EL0, HDFGWTR, PMCR_EL0, 1), SR_FGT(SYS_PMSWINC_EL0, HDFGWTR, PMSWINC_EL0, 1), SR_FGT(SYS_OSLAR_EL1, HDFGWTR, OSLAR_EL1, 1), + /* + * HAFGRTR_EL2 + */ + SR_FGT(SYS_AMEVTYPER1_EL0(15), HAFGRTR, AMEVTYPER115_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(14), HAFGRTR, AMEVTYPER114_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(13), HAFGRTR, AMEVTYPER113_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(12), HAFGRTR, AMEVTYPER112_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(11), HAFGRTR, AMEVTYPER111_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(10), HAFGRTR, AMEVTYPER110_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(9), HAFGRTR, AMEVTYPER19_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(8), HAFGRTR, AMEVTYPER18_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(7), HAFGRTR, AMEVTYPER17_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(6), HAFGRTR, AMEVTYPER16_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(5), HAFGRTR, AMEVTYPER15_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(4), HAFGRTR, AMEVTYPER14_EL0, 1), + 
SR_FGT(SYS_AMEVTYPER1_EL0(3), HAFGRTR, AMEVTYPER13_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(2), HAFGRTR, AMEVTYPER12_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(1), HAFGRTR, AMEVTYPER11_EL0, 1), + SR_FGT(SYS_AMEVTYPER1_EL0(0), HAFGRTR, AMEVTYPER10_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(15), HAFGRTR, AMEVCNTR115_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(14), HAFGRTR, AMEVCNTR114_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(13), HAFGRTR, AMEVCNTR113_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(12), HAFGRTR, AMEVCNTR112_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(11), HAFGRTR, AMEVCNTR111_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(10), HAFGRTR, AMEVCNTR110_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(9), HAFGRTR, AMEVCNTR19_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(8), HAFGRTR, AMEVCNTR18_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(7), HAFGRTR, AMEVCNTR17_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(6), HAFGRTR, AMEVCNTR16_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(5), HAFGRTR, AMEVCNTR15_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(4), HAFGRTR, AMEVCNTR14_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(3), HAFGRTR, AMEVCNTR13_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(2), HAFGRTR, AMEVCNTR12_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(1), HAFGRTR, AMEVCNTR11_EL0, 1), + SR_FGT(SYS_AMEVCNTR1_EL0(0), HAFGRTR, AMEVCNTR10_EL0, 1), + SR_FGT(SYS_AMCNTENCLR1_EL0, HAFGRTR, AMCNTEN1, 1), + SR_FGT(SYS_AMCNTENSET1_EL0, HAFGRTR, AMCNTEN1, 1), + SR_FGT(SYS_AMCNTENCLR0_EL0, HAFGRTR, AMCNTEN0, 1), + SR_FGT(SYS_AMCNTENSET0_EL0, HAFGRTR, AMCNTEN0, 1), + SR_FGT(SYS_AMEVCNTR0_EL0(3), HAFGRTR, AMEVCNTR03_EL0, 1), + SR_FGT(SYS_AMEVCNTR0_EL0(2), HAFGRTR, AMEVCNTR02_EL0, 1), + SR_FGT(SYS_AMEVCNTR0_EL0(1), HAFGRTR, AMEVCNTR01_EL0, 1), + SR_FGT(SYS_AMEVCNTR0_EL0(0), HAFGRTR, AMEVCNTR00_EL0, 1), }; static union trap_config get_trap_config(u32 sysreg) @@ -1909,6 +1953,10 @@ bool __check_nv_sr_forward(struct kvm_vcpu *vcpu) val = sanitised_sys_reg(vcpu, HDFGWTR_EL2); break; + case HAFGRTR_GROUP: + val = sanitised_sys_reg(vcpu, HAFGRTR_EL2); + break; + case HFGITR_GROUP: val = sanitised_sys_reg(vcpu, HFGITR_EL2); switch (tc.fgf) { diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 7b4909dfd1f5..0436f0da98a4 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -80,6 +80,14 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) } while(0) +static inline bool cpu_has_amu(void) +{ + u64 pfr0 = read_sysreg_s(SYS_ID_AA64PFR0_EL1); + + return cpuid_feature_extract_unsigned_field(pfr0, + ID_AA64PFR0_EL1_AMU_SHIFT); +} + static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; @@ -156,6 +164,20 @@ static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) write_sysreg_s(r_val, SYS_HDFGRTR_EL2); write_sysreg_s(w_val, SYS_HDFGWTR_EL2); + + if (!cpu_has_amu()) + return; + + ctxt_sys_reg(hctxt, HAFGRTR_EL2) = read_sysreg_s(SYS_HAFGRTR_EL2); + + r_clr = r_set = 0; + compute_clr_set(vcpu, HAFGRTR_EL2, r_clr, r_set); + + r_val = __HAFGRTR_EL2_nMASK; + r_val |= r_set; + r_val &= ~r_clr; + + write_sysreg_s(r_val, SYS_HAFGRTR_EL2); } static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) @@ -174,6 +196,9 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) write_sysreg_s(ctxt_sys_reg(hctxt, HFGITR_EL2), SYS_HFGITR_EL2); write_sysreg_s(ctxt_sys_reg(hctxt, HDFGRTR_EL2), SYS_HDFGRTR_EL2); write_sysreg_s(ctxt_sys_reg(hctxt, HDFGWTR_EL2), SYS_HDFGWTR_EL2); + + if (vcpu_has_amu()) + write_sysreg_s(ctxt_sys_reg(hctxt, HAFGRTR_EL2), 
SYS_HAFGRTR_EL2); } static inline void __activate_traps_common(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 4735e1b37fb3..8bb297a2df38 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2532,6 +2532,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_DACR32_EL2), trap_undef, reset_unknown, DACR32_EL2 }, EL2_REG(HDFGRTR_EL2, access_rw, reset_val, 0), EL2_REG(HDFGWTR_EL2, access_rw, reset_val, 0), + EL2_REG(HAFGRTR_EL2, access_rw, reset_val, 0), EL2_REG(SPSR_EL2, access_rw, reset_val, 0), EL2_REG(ELR_EL2, access_rw, reset_val, 0), { SYS_DESC(SYS_SP_EL1), access_sp_el1}, -- cgit v1.2.3 From fc04838f9c00fcbc90a8926bbd46928d6fb36477 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:50 +0000 Subject: KVM: arm64: Update and fix FGT register masks New trap bits have been defined since the latest update to this patch. Moreover, the existing definitions of some of the mask and the RES0 bits overlap, which could be wrong, confusing, or both. Update the bits based on DDI0601 2023-09, and ensure that the existing bits are consistent. Subsequent patches will use the generated RES0 fields instead of specifying them manually. This patch keeps the manual encoding of the bits to make it easier to review the series. Fixes: 0fd76865006d ("KVM: arm64: Add nPIR{E0}_EL1 to HFG traps") Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231214100158.2305400-11-tabba@google.com --- arch/arm64/include/asm/kvm_arm.h | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 7de0a7062625..b0dc3249d5cd 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -344,30 +344,39 @@ * Once we get to a point where the two describe the same thing, we'll * merge the definitions. One day. 
*/ -#define __HFGRTR_EL2_RES0 (GENMASK(63, 56) | GENMASK(53, 51)) +#define __HFGRTR_EL2_RES0 BIT(51) #define __HFGRTR_EL2_MASK GENMASK(49, 0) -#define __HFGRTR_EL2_nMASK (GENMASK(58, 57) | GENMASK(55, 54) | BIT(50)) +#define __HFGRTR_EL2_nMASK (GENMASK(63, 52) | BIT(50)) -#define __HFGWTR_EL2_RES0 (GENMASK(63, 56) | GENMASK(53, 51) | \ - BIT(46) | BIT(42) | BIT(40) | BIT(28) | \ - GENMASK(26, 25) | BIT(21) | BIT(18) | \ +#define __HFGWTR_EL2_RES0 (BIT(51) | BIT(46) | BIT(42) | BIT(40) | \ + BIT(28) | GENMASK(26, 25) | BIT(21) | BIT(18) | \ GENMASK(15, 14) | GENMASK(10, 9) | BIT(2)) -#define __HFGWTR_EL2_MASK GENMASK(49, 0) -#define __HFGWTR_EL2_nMASK (GENMASK(58, 57) | GENMASK(55, 54) | BIT(50)) +#define __HFGWTR_EL2_MASK (GENMASK(49, 47) | GENMASK(45, 43) | \ + BIT(41) | GENMASK(39, 29) | BIT(27) | \ + GENMASK(24, 22) | GENMASK(20, 19) | \ + GENMASK(17, 16) | GENMASK(13, 11) | \ + GENMASK(8, 3) | GENMASK(1, 0)) +#define __HFGWTR_EL2_nMASK (GENMASK(63, 52) | BIT(50)) -#define __HFGITR_EL2_RES0 GENMASK(63, 57) -#define __HFGITR_EL2_MASK GENMASK(54, 0) -#define __HFGITR_EL2_nMASK GENMASK(56, 55) +#define __HFGITR_EL2_RES0 (BIT(63) | BIT(61)) +#define __HFGITR_EL2_MASK (BIT(62) | BIT(60) | GENMASK(54, 0)) +#define __HFGITR_EL2_nMASK GENMASK(59, 55) #define __HDFGRTR_EL2_RES0 (BIT(49) | BIT(42) | GENMASK(39, 38) | \ GENMASK(21, 20) | BIT(8)) -#define __HDFGRTR_EL2_MASK ~__HDFGRTR_EL2_nMASK +#define __HDFGRTR_EL2_MASK (BIT(63) | GENMASK(58, 50) | GENMASK(48, 43) | \ + GENMASK(41, 40) | GENMASK(37, 22) | \ + GENMASK(19, 9) | GENMASK(7, 0)) #define __HDFGRTR_EL2_nMASK GENMASK(62, 59) #define __HDFGWTR_EL2_RES0 (BIT(63) | GENMASK(59, 58) | BIT(51) | BIT(47) | \ BIT(43) | GENMASK(40, 38) | BIT(34) | BIT(30) | \ BIT(22) | BIT(9) | BIT(6)) -#define __HDFGWTR_EL2_MASK ~__HDFGWTR_EL2_nMASK +#define __HDFGWTR_EL2_MASK (GENMASK(57, 52) | GENMASK(50, 48) | \ + GENMASK(46, 44) | GENMASK(42, 41) | \ + GENMASK(37, 35) | GENMASK(33, 31) | \ + GENMASK(29, 23) | GENMASK(21, 10) | \ + GENMASK(8, 7) | GENMASK(5, 0)) #define __HDFGWTR_EL2_nMASK GENMASK(62, 60) #define __HAFGRTR_EL2_RES0 (GENMASK(63, 50) | GENMASK(16, 5)) @@ -375,9 +384,9 @@ #define __HAFGRTR_EL2_nMASK 0UL /* Similar definitions for HCRX_EL2 */ -#define __HCRX_EL2_RES0 (GENMASK(63, 16) | GENMASK(13, 12)) -#define __HCRX_EL2_MASK (0) -#define __HCRX_EL2_nMASK (GENMASK(15, 14) | GENMASK(4, 0)) +#define __HCRX_EL2_RES0 (GENMASK(63, 25) | GENMASK(13, 12)) +#define __HCRX_EL2_MASK (BIT(6)) +#define __HCRX_EL2_nMASK (GENMASK(24, 14) | GENMASK(11, 7) | GENMASK(5, 0)) /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~UL(0xf)) -- cgit v1.2.3 From 6c4abbea6d9c09df448b43624074a208c38e68e0 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:51 +0000 Subject: KVM: arm64: Add build validation for FGT trap mask values These checks help ensure that all the bits are accounted for, that there hasn't been a transcribing error from the spec nor from the generated mask values, which will be used in subsequent patches. 
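For illustration, a worked example (with made-up values) of the kind of transcription error these checks catch:

	/*
	 * Suppose bit 7 were accidentally dropped from both masks:
	 *	RES0  = GENMASK(63, 8)
	 *	MASK  = GENMASK(6, 0)
	 *	nMASK = 0
	 * Then ~(RES0 ^ MASK ^ nMASK) == BIT(7), which is non-zero, so the
	 * coverage BUILD_BUG_ON() in CHECK_FGT_MASKS() (see the hunk below)
	 * fires at compile time.
	 */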
Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231214100158.2305400-12-tabba@google.com --- arch/arm64/kvm/hyp/include/hyp/switch.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 0436f0da98a4..ecccf99619e6 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -79,6 +79,16 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) clr |= ~hfg & __ ## reg ## _nMASK; \ } while(0) +/* + * Validate the fine grain trap masks. + * Check that the masks do not overlap and that all bits are accounted for. + */ +#define CHECK_FGT_MASKS(reg) \ + do { \ + BUILD_BUG_ON((__ ## reg ## _MASK) & (__ ## reg ## _nMASK)); \ + BUILD_BUG_ON(~((__ ## reg ## _RES0) ^ (__ ## reg ## _MASK) ^ \ + (__ ## reg ## _nMASK))); \ + } while(0) static inline bool cpu_has_amu(void) { @@ -94,6 +104,14 @@ static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp; u64 r_val, w_val; + CHECK_FGT_MASKS(HFGRTR_EL2); + CHECK_FGT_MASKS(HFGWTR_EL2); + CHECK_FGT_MASKS(HFGITR_EL2); + CHECK_FGT_MASKS(HDFGRTR_EL2); + CHECK_FGT_MASKS(HDFGWTR_EL2); + CHECK_FGT_MASKS(HAFGRTR_EL2); + CHECK_FGT_MASKS(HCRX_EL2); + if (!cpus_have_final_cap(ARM64_HAS_FGT)) return; -- cgit v1.2.3 From 9ff67dd26a9eed9d73dc23aa63e87b16b3382184 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:52 +0000 Subject: KVM: arm64: Use generated FGT RES0 bits instead of specifying them Now that all FGT fields are accounted for and represented, use the generated value instead of manually specifying them. For __HFGWTR_EL2_RES0, however, there is no generated value. Its fields are subset of HFGRTR_EL2, with the remaining being RES0. Therefore, add a mask that represents the HFGRTR_EL2 only bits and define __HFGWTR_EL2_* using those and the __HFGRTR_EL2_* fields. No functional change intended. Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231214100158.2305400-13-tabba@google.com --- arch/arm64/include/asm/kvm_arm.h | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index b0dc3249d5cd..bd20d27f1b33 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -344,34 +344,32 @@ * Once we get to a point where the two describe the same thing, we'll * merge the definitions. One day. */ -#define __HFGRTR_EL2_RES0 BIT(51) +#define __HFGRTR_EL2_RES0 HFGxTR_EL2_RES0 #define __HFGRTR_EL2_MASK GENMASK(49, 0) #define __HFGRTR_EL2_nMASK (GENMASK(63, 52) | BIT(50)) -#define __HFGWTR_EL2_RES0 (BIT(51) | BIT(46) | BIT(42) | BIT(40) | \ - BIT(28) | GENMASK(26, 25) | BIT(21) | BIT(18) | \ +/* + * The HFGWTR bits are a subset of HFGRTR bits. To ensure we don't miss any + * future additions, define __HFGWTR* macros relative to __HFGRTR* ones. 
+ */ +#define __HFGRTR_ONLY_MASK (BIT(46) | BIT(42) | BIT(40) | BIT(28) | \ + GENMASK(26, 25) | BIT(21) | BIT(18) | \ GENMASK(15, 14) | GENMASK(10, 9) | BIT(2)) -#define __HFGWTR_EL2_MASK (GENMASK(49, 47) | GENMASK(45, 43) | \ - BIT(41) | GENMASK(39, 29) | BIT(27) | \ - GENMASK(24, 22) | GENMASK(20, 19) | \ - GENMASK(17, 16) | GENMASK(13, 11) | \ - GENMASK(8, 3) | GENMASK(1, 0)) -#define __HFGWTR_EL2_nMASK (GENMASK(63, 52) | BIT(50)) - -#define __HFGITR_EL2_RES0 (BIT(63) | BIT(61)) +#define __HFGWTR_EL2_RES0 (__HFGRTR_EL2_RES0 | __HFGRTR_ONLY_MASK) +#define __HFGWTR_EL2_MASK (__HFGRTR_EL2_MASK & ~__HFGRTR_ONLY_MASK) +#define __HFGWTR_EL2_nMASK (__HFGRTR_EL2_nMASK & ~__HFGRTR_ONLY_MASK) + +#define __HFGITR_EL2_RES0 HFGITR_EL2_RES0 #define __HFGITR_EL2_MASK (BIT(62) | BIT(60) | GENMASK(54, 0)) #define __HFGITR_EL2_nMASK GENMASK(59, 55) -#define __HDFGRTR_EL2_RES0 (BIT(49) | BIT(42) | GENMASK(39, 38) | \ - GENMASK(21, 20) | BIT(8)) +#define __HDFGRTR_EL2_RES0 HDFGRTR_EL2_RES0 #define __HDFGRTR_EL2_MASK (BIT(63) | GENMASK(58, 50) | GENMASK(48, 43) | \ GENMASK(41, 40) | GENMASK(37, 22) | \ GENMASK(19, 9) | GENMASK(7, 0)) #define __HDFGRTR_EL2_nMASK GENMASK(62, 59) -#define __HDFGWTR_EL2_RES0 (BIT(63) | GENMASK(59, 58) | BIT(51) | BIT(47) | \ - BIT(43) | GENMASK(40, 38) | BIT(34) | BIT(30) | \ - BIT(22) | BIT(9) | BIT(6)) +#define __HDFGWTR_EL2_RES0 HDFGWTR_EL2_RES0 #define __HDFGWTR_EL2_MASK (GENMASK(57, 52) | GENMASK(50, 48) | \ GENMASK(46, 44) | GENMASK(42, 41) | \ GENMASK(37, 35) | GENMASK(33, 31) | \ @@ -379,12 +377,12 @@ GENMASK(8, 7) | GENMASK(5, 0)) #define __HDFGWTR_EL2_nMASK GENMASK(62, 60) -#define __HAFGRTR_EL2_RES0 (GENMASK(63, 50) | GENMASK(16, 5)) +#define __HAFGRTR_EL2_RES0 HAFGRTR_EL2_RES0 #define __HAFGRTR_EL2_MASK (GENMASK(49, 17) | GENMASK(4, 0)) #define __HAFGRTR_EL2_nMASK 0UL /* Similar definitions for HCRX_EL2 */ -#define __HCRX_EL2_RES0 (GENMASK(63, 25) | GENMASK(13, 12)) +#define __HCRX_EL2_RES0 HCRX_EL2_RES0 #define __HCRX_EL2_MASK (BIT(6)) #define __HCRX_EL2_nMASK (GENMASK(24, 14) | GENMASK(11, 7) | GENMASK(5, 0)) -- cgit v1.2.3 From 5f6bd3f3daaaab8559ad7d2266ba38345231b7ae Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:53 +0000 Subject: KVM: arm64: Define FGT nMASK bits relative to other fields Now that RES0 and MASK have full coverage, no need to manually encode nMASK. Calculate it relative to the other fields. No functional change intended. Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231214100158.2305400-14-tabba@google.com --- arch/arm64/include/asm/kvm_arm.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index bd20d27f1b33..b7a9fe36bb59 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -346,7 +346,7 @@ */ #define __HFGRTR_EL2_RES0 HFGxTR_EL2_RES0 #define __HFGRTR_EL2_MASK GENMASK(49, 0) -#define __HFGRTR_EL2_nMASK (GENMASK(63, 52) | BIT(50)) +#define __HFGRTR_EL2_nMASK ~(__HFGRTR_EL2_RES0 | __HFGRTR_EL2_MASK) /* * The HFGWTR bits are a subset of HFGRTR bits. 
To ensure we don't miss any @@ -357,17 +357,17 @@ GENMASK(15, 14) | GENMASK(10, 9) | BIT(2)) #define __HFGWTR_EL2_RES0 (__HFGRTR_EL2_RES0 | __HFGRTR_ONLY_MASK) #define __HFGWTR_EL2_MASK (__HFGRTR_EL2_MASK & ~__HFGRTR_ONLY_MASK) -#define __HFGWTR_EL2_nMASK (__HFGRTR_EL2_nMASK & ~__HFGRTR_ONLY_MASK) +#define __HFGWTR_EL2_nMASK ~(__HFGWTR_EL2_RES0 | __HFGWTR_EL2_MASK) #define __HFGITR_EL2_RES0 HFGITR_EL2_RES0 #define __HFGITR_EL2_MASK (BIT(62) | BIT(60) | GENMASK(54, 0)) -#define __HFGITR_EL2_nMASK GENMASK(59, 55) +#define __HFGITR_EL2_nMASK ~(__HFGITR_EL2_RES0 | __HFGITR_EL2_MASK) #define __HDFGRTR_EL2_RES0 HDFGRTR_EL2_RES0 #define __HDFGRTR_EL2_MASK (BIT(63) | GENMASK(58, 50) | GENMASK(48, 43) | \ GENMASK(41, 40) | GENMASK(37, 22) | \ GENMASK(19, 9) | GENMASK(7, 0)) -#define __HDFGRTR_EL2_nMASK GENMASK(62, 59) +#define __HDFGRTR_EL2_nMASK ~(__HDFGRTR_EL2_RES0 | __HDFGRTR_EL2_MASK) #define __HDFGWTR_EL2_RES0 HDFGWTR_EL2_RES0 #define __HDFGWTR_EL2_MASK (GENMASK(57, 52) | GENMASK(50, 48) | \ @@ -375,16 +375,16 @@ GENMASK(37, 35) | GENMASK(33, 31) | \ GENMASK(29, 23) | GENMASK(21, 10) | \ GENMASK(8, 7) | GENMASK(5, 0)) -#define __HDFGWTR_EL2_nMASK GENMASK(62, 60) +#define __HDFGWTR_EL2_nMASK ~(__HDFGWTR_EL2_RES0 | __HDFGWTR_EL2_MASK) #define __HAFGRTR_EL2_RES0 HAFGRTR_EL2_RES0 #define __HAFGRTR_EL2_MASK (GENMASK(49, 17) | GENMASK(4, 0)) -#define __HAFGRTR_EL2_nMASK 0UL +#define __HAFGRTR_EL2_nMASK ~(__HAFGRTR_EL2_RES0 | __HAFGRTR_EL2_MASK) /* Similar definitions for HCRX_EL2 */ #define __HCRX_EL2_RES0 HCRX_EL2_RES0 #define __HCRX_EL2_MASK (BIT(6)) -#define __HCRX_EL2_nMASK (GENMASK(24, 14) | GENMASK(11, 7) | GENMASK(5, 0)) +#define __HCRX_EL2_nMASK ~(__HCRX_EL2_RES0 | __HCRX_EL2_MASK) /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~UL(0xf)) -- cgit v1.2.3 From 0ccd901da1886cf9dc53ab36ad8f1160b65e41f1 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:54 +0000 Subject: KVM: arm64: Macros for setting/clearing FGT bits There's a lot of boilerplate code for setting and clearing FGT bits when activating guest traps. Refactor it into macros. These macros will also be used in future patch series. No functional change intended. Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231214100158.2305400-15-tabba@google.com --- arch/arm64/kvm/hyp/include/hyp/switch.h | 69 +++++++++++++-------------------- 1 file changed, 27 insertions(+), 42 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index ecccf99619e6..d56fef44dc31 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -79,6 +79,27 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) clr |= ~hfg & __ ## reg ## _nMASK; \ } while(0) +#define update_fgt_traps_cs(vcpu, reg, clr, set) \ + do { \ + struct kvm_cpu_context *hctxt = \ + &this_cpu_ptr(&kvm_host_data)->host_ctxt; \ + u64 c = 0, s = 0; \ + \ + ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \ + compute_clr_set(vcpu, reg, c, s); \ + s |= set; \ + c |= clr; \ + if (c || s) { \ + u64 val = __ ## reg ## _nMASK; \ + val |= s; \ + val &= ~c; \ + write_sysreg_s(val, SYS_ ## reg); \ + } \ + } while(0) + +#define update_fgt_traps(vcpu, reg) \ + update_fgt_traps_cs(vcpu, reg, 0, 0) + /* * Validate the fine grain trap masks. * Check that the masks do not overlap and that all bits are accounted for. 
@@ -154,48 +175,12 @@ static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) return; - ctxt_sys_reg(hctxt, HFGITR_EL2) = read_sysreg_s(SYS_HFGITR_EL2); - - r_set = r_clr = 0; - compute_clr_set(vcpu, HFGITR_EL2, r_clr, r_set); - r_val = __HFGITR_EL2_nMASK; - r_val |= r_set; - r_val &= ~r_clr; - - write_sysreg_s(r_val, SYS_HFGITR_EL2); - - ctxt_sys_reg(hctxt, HDFGRTR_EL2) = read_sysreg_s(SYS_HDFGRTR_EL2); - ctxt_sys_reg(hctxt, HDFGWTR_EL2) = read_sysreg_s(SYS_HDFGWTR_EL2); - - r_clr = r_set = w_clr = w_set = 0; - - compute_clr_set(vcpu, HDFGRTR_EL2, r_clr, r_set); - compute_clr_set(vcpu, HDFGWTR_EL2, w_clr, w_set); - - r_val = __HDFGRTR_EL2_nMASK; - r_val |= r_set; - r_val &= ~r_clr; - - w_val = __HDFGWTR_EL2_nMASK; - w_val |= w_set; - w_val &= ~w_clr; - - write_sysreg_s(r_val, SYS_HDFGRTR_EL2); - write_sysreg_s(w_val, SYS_HDFGWTR_EL2); - - if (!cpu_has_amu()) - return; - - ctxt_sys_reg(hctxt, HAFGRTR_EL2) = read_sysreg_s(SYS_HAFGRTR_EL2); - - r_clr = r_set = 0; - compute_clr_set(vcpu, HAFGRTR_EL2, r_clr, r_set); - - r_val = __HAFGRTR_EL2_nMASK; - r_val |= r_set; - r_val &= ~r_clr; + update_fgt_traps(vcpu, HFGITR_EL2); + update_fgt_traps(vcpu, HDFGRTR_EL2); + update_fgt_traps(vcpu, HDFGWTR_EL2); - write_sysreg_s(r_val, SYS_HAFGRTR_EL2); + if (cpu_has_amu()) + update_fgt_traps(vcpu, HAFGRTR_EL2); } static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) @@ -215,7 +200,7 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) write_sysreg_s(ctxt_sys_reg(hctxt, HDFGRTR_EL2), SYS_HDFGRTR_EL2); write_sysreg_s(ctxt_sys_reg(hctxt, HDFGWTR_EL2), SYS_HDFGWTR_EL2); - if (vcpu_has_amu()) + if (cpu_has_amu()) write_sysreg_s(ctxt_sys_reg(hctxt, HAFGRTR_EL2), SYS_HAFGRTR_EL2); } -- cgit v1.2.3 From 73e3ce3f4a0e561e24ca71b20de00f03b427981e Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:55 +0000 Subject: KVM: arm64: Fix which features are marked as allowed for protected VMs Cache maintenance operations are not trapped for protected VMs, and shouldn't be. Mark them as allowed. Moreover, features advertised by ID_AA64PFR2 and ID_AA64MMFR3 are (already) not allowed, mark them as such. 
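As a hedged illustration of how these masks are meant to be consumed (the helper below is hypothetical, not code from pKVM): each PVM_ID_*_ALLOW constant is a bitmask over ID-register fields, and a protected guest should only ever see the fields the mask covers, with everything else reading as "not implemented":

#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical helper, for illustration only: expose to a protected guest
 * just the ID-register fields covered by the corresponding *_ALLOW mask;
 * all other fields read as zero, i.e. the feature appears absent.
 */
static uint64_t pvm_sanitize_id_reg(uint64_t host_val, uint64_t allow_mask)
{
	return host_val & allow_mask;
}

int main(void)
{
	/* Pretend host value and allow mask; the real masks live in fixed_config.h. */
	printf("0x%llx\n",
	       (unsigned long long)pvm_sanitize_id_reg(0x1122334455667788ULL,
						       0x00000000ffff0000ULL));
	return 0;
}

Under that model, adding ID_AA64MMFR1_EL1_CMOW to PVM_ID_AA64MMFR1_ALLOW exposes the cache-maintenance permission control, while the new all-zero PVM_ID_AA64PFR2_ALLOW and PVM_ID_AA64MMFR3_ALLOW keep everything described by those registers hidden.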
Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231214100158.2305400-16-tabba@google.com --- arch/arm64/kvm/hyp/include/nvhe/fixed_config.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h index e91922daa8ca..8d97dff4bb7b 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h @@ -69,6 +69,8 @@ ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SSBS) \ ) +#define PVM_ID_AA64PFR2_ALLOW 0ULL + /* * Allow for protected VMs: * - Mixed-endian @@ -101,6 +103,7 @@ * - Privileged Access Never * - SError interrupt exceptions from speculative reads * - Enhanced Translation Synchronization + * - Control for cache maintenance permission */ #define PVM_ID_AA64MMFR1_ALLOW (\ ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS) | \ @@ -108,7 +111,8 @@ ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HPDS) | \ ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_PAN) | \ ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_SpecSEI) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_ETS) \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_ETS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_CMOW) \ ) /* @@ -133,6 +137,8 @@ ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_E0PD) \ ) +#define PVM_ID_AA64MMFR3_ALLOW (0ULL) + /* * No support for Scalable Vectors for protected VMs: * Requires additional support from KVM, e.g., context-switching and -- cgit v1.2.3 From 21de26dbc5170dde8e4dfbfa1ecb77804ed6a377 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:56 +0000 Subject: KVM: arm64: Mark PAuth as a restricted feature for protected VMs Protected VMs will only support basic PAuth (FEAT_PAuth). Mark it as restricted to ensure that later versions aren't supported for protected guests. Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231214100158.2305400-17-tabba@google.com --- arch/arm64/kvm/hyp/include/nvhe/fixed_config.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h index 8d97dff4bb7b..51f043649146 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h @@ -184,10 +184,18 @@ ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RNDR) \ ) +/* Restrict pointer authentication to the basic version. 
*/ +#define PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED (\ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), ID_AA64ISAR1_EL1_APA_PAuth) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), ID_AA64ISAR1_EL1_API_PAuth) \ + ) + +#define PVM_ID_AA64ISAR2_RESTRICT_UNSIGNED (\ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3), ID_AA64ISAR2_EL1_APA3_PAuth) \ + ) + #define PVM_ID_AA64ISAR1_ALLOW (\ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | \ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FCMA) | \ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_LRCPC) | \ @@ -202,8 +210,8 @@ ) #define PVM_ID_AA64ISAR2_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_ATS1A)| \ ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | \ ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_MOPS) \ ) -- cgit v1.2.3 From 9d52612690985fc0ee1ae1fbad61530a4f6bbb53 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:57 +0000 Subject: KVM: arm64: Trap external trace for protected VMs pKVM does not support external trace for protected VMs. Trap external trace, and add the ExtTrcBuff to make it possible to check for the feature. Signed-off-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231214100158.2305400-18-tabba@google.com --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 9d23a51d7f75..84b5c3f387d8 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -136,6 +136,10 @@ static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) cptr_set |= CPTR_EL2_TTA; } + /* Trap External Trace */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), feature_ids)) + mdcr_clear |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT; + vcpu->arch.mdcr_el2 |= mdcr_set; vcpu->arch.mdcr_el2 &= ~mdcr_clear; vcpu->arch.cptr_el2 |= cptr_set; -- cgit v1.2.3 From 7ab6fb505b2a7447c4a7237a12c59e3ad0c7298c Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 19 Dec 2023 10:48:27 +0800 Subject: LoongArch: KVM: Optimization for memslot hugepage checking During shadow mmu page fault, there is checking for huge page for specified memslot. Page fault is hot path, check logic can be done when memslot is created. Here two flags are added for huge page checking, KVM_MEM_HUGEPAGE_CAPABLE and KVM_MEM_HUGEPAGE_INCAPABLE. Indeed for an optimized qemu, memslot for DRAM is always huge page aligned. The flag is firstly checked during hot page fault path. Now only huge page flag is supported, there is a long way for super page support in LoongArch system. Since super page size is 64G for 16K pagesize and 1G for 4K pagesize, 64G physical address is rarely used and LoongArch kernel needs support super page for 4K. Also memory layout of LoongArch qemu VM should be 1G aligned. 
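The alignment rule is easier to see in isolation. A simplified, standalone sketch (assuming a 2 MiB PMD purely for the example; the real code below uses the arch PMD_SIZE and additionally marks slots too small to hold a whole PMD block as incapable):

#include <stdbool.h>
#include <stdint.h>

#define SKETCH_PMD_SIZE	(UINT64_C(1) << 21)	/* assumed 2 MiB block size */

/*
 * Capable: size, GPA and HVA are all PMD-aligned, so every PMD block in
 * the slot can be mapped as a huge page.
 */
bool slot_hugepage_capable(uint64_t gpa, uint64_t hva, uint64_t size)
{
	return !(size & (SKETCH_PMD_SIZE - 1)) &&
	       !(gpa  & (SKETCH_PMD_SIZE - 1)) &&
	       !(hva  & (SKETCH_PMD_SIZE - 1));
}

/*
 * Incapable: GPA and HVA disagree on their offset within a PMD, so a
 * stage-2 block mapping would point at the wrong host pages.
 */
bool slot_hugepage_incapable(uint64_t gpa, uint64_t hva)
{
	return (gpa & (SKETCH_PMD_SIZE - 1)) != (hva & (SKETCH_PMD_SIZE - 1));
}

With the result cached as per-slot flags, the fault path only tests those flags instead of redoing this arithmetic on every fault; slots that are neither capable nor incapable still get the per-fault boundary check.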
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_host.h | 3 + arch/loongarch/kvm/mmu.c | 124 +++++++++++++++++++++++----------- 2 files changed, 86 insertions(+), 41 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index 11328700d4fa..0e89db020481 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -45,7 +45,10 @@ struct kvm_vcpu_stat { u64 signal_exits; }; +#define KVM_MEM_HUGEPAGE_CAPABLE (1UL << 0) +#define KVM_MEM_HUGEPAGE_INCAPABLE (1UL << 1) struct kvm_arch_memory_slot { + unsigned long flags; }; struct kvm_context { diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index 80480df5f550..915f17527893 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -13,6 +13,16 @@ #include #include +static inline bool kvm_hugepage_capable(struct kvm_memory_slot *slot) +{ + return slot->arch.flags & KVM_MEM_HUGEPAGE_CAPABLE; +} + +static inline bool kvm_hugepage_incapable(struct kvm_memory_slot *slot) +{ + return slot->arch.flags & KVM_MEM_HUGEPAGE_INCAPABLE; +} + static inline void kvm_ptw_prepare(struct kvm *kvm, kvm_ptw_ctx *ctx) { ctx->level = kvm->arch.root_level; @@ -365,6 +375,69 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_ptw_top(kvm->arch.pgd, start << PAGE_SHIFT, end << PAGE_SHIFT, &ctx); } +int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, enum kvm_mr_change change) +{ + gpa_t gpa_start; + hva_t hva_start; + size_t size, gpa_offset, hva_offset; + + if ((change != KVM_MR_MOVE) && (change != KVM_MR_CREATE)) + return 0; + /* + * Prevent userspace from creating a memory region outside of the + * VM GPA address space + */ + if ((new->base_gfn + new->npages) > (kvm->arch.gpa_size >> PAGE_SHIFT)) + return -ENOMEM; + + new->arch.flags = 0; + size = new->npages * PAGE_SIZE; + gpa_start = new->base_gfn << PAGE_SHIFT; + hva_start = new->userspace_addr; + if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(gpa_start, PMD_SIZE) + && IS_ALIGNED(hva_start, PMD_SIZE)) + new->arch.flags |= KVM_MEM_HUGEPAGE_CAPABLE; + else { + /* + * Pages belonging to memslots that don't have the same + * alignment within a PMD for userspace and GPA cannot be + * mapped with PMD entries, because we'll end up mapping + * the wrong pages. 
+ * + * Consider a layout like the following: + * + * memslot->userspace_addr: + * +-----+--------------------+--------------------+---+ + * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| + * +-----+--------------------+--------------------+---+ + * + * memslot->base_gfn << PAGE_SIZE: + * +---+--------------------+--------------------+-----+ + * |abc|def Stage-2 block | Stage-2 block |tvxyz| + * +---+--------------------+--------------------+-----+ + * + * If we create those stage-2 blocks, we'll end up with this + * incorrect mapping: + * d -> f + * e -> g + * f -> h + */ + gpa_offset = gpa_start & (PMD_SIZE - 1); + hva_offset = hva_start & (PMD_SIZE - 1); + if (gpa_offset != hva_offset) { + new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE; + } else { + if (gpa_offset == 0) + gpa_offset = PMD_SIZE; + if ((size + gpa_offset) < (PMD_SIZE * 2)) + new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE; + } + } + + return 0; +} + void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, @@ -562,47 +635,23 @@ out: } static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot, - unsigned long hva, unsigned long map_size, bool write) + unsigned long hva, bool write) { - size_t size; - gpa_t gpa_start; - hva_t uaddr_start, uaddr_end; + hva_t start, end; /* Disable dirty logging on HugePages */ if (kvm_slot_dirty_track_enabled(memslot) && write) return false; - size = memslot->npages * PAGE_SIZE; - gpa_start = memslot->base_gfn << PAGE_SHIFT; - uaddr_start = memslot->userspace_addr; - uaddr_end = uaddr_start + size; + if (kvm_hugepage_capable(memslot)) + return true; - /* - * Pages belonging to memslots that don't have the same alignment - * within a PMD for userspace and GPA cannot be mapped with stage-2 - * PMD entries, because we'll end up mapping the wrong pages. - * - * Consider a layout like the following: - * - * memslot->userspace_addr: - * +-----+--------------------+--------------------+---+ - * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| - * +-----+--------------------+--------------------+---+ - * - * memslot->base_gfn << PAGE_SIZE: - * +---+--------------------+--------------------+-----+ - * |abc|def Stage-2 block | Stage-2 block |tvxyz| - * +---+--------------------+--------------------+-----+ - * - * If we create those stage-2 blocks, we'll end up with this incorrect - * mapping: - * d -> f - * e -> g - * f -> h - */ - if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) + if (kvm_hugepage_incapable(memslot)) return false; + start = memslot->userspace_addr; + end = start + memslot->npages * PAGE_SIZE; + /* * Next, let's make sure we're not trying to map anything not covered * by the memslot. This means we have to prohibit block size mappings @@ -615,8 +664,7 @@ static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot, * userspace_addr or the base_gfn, as both are equally aligned (per * the check above) and equally sized. 
*/ - return (hva & ~(map_size - 1)) >= uaddr_start && - (hva & ~(map_size - 1)) + map_size <= uaddr_end; + return (hva >= ALIGN(start, PMD_SIZE)) && (hva < ALIGN_DOWN(end, PMD_SIZE)); } /* @@ -842,7 +890,7 @@ retry: /* Disable dirty logging on HugePages */ level = 0; - if (!fault_supports_huge_mapping(memslot, hva, PMD_SIZE, write)) { + if (!fault_supports_huge_mapping(memslot, hva, write)) { level = 0; } else { level = host_pfn_mapping_level(kvm, gfn, memslot); @@ -901,12 +949,6 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { } -int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, - struct kvm_memory_slot *new, enum kvm_mr_change change) -{ - return 0; -} - void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, const struct kvm_memory_slot *memslot) { -- cgit v1.2.3 From 161267320158920a601e40d83fdac60bcaa2acb5 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 19 Dec 2023 10:48:27 +0800 Subject: LoongArch: KVM: Remove SW timer switch when vcpu is halt polling With halt-polling supported, there is checking for pending events or interrupts when vcpu executes idle instruction. Pending interrupts include injected SW interrupts and passthrough HW interrupts, such as HW timer interrupts, since HW timer works still even if vcpu exists from VM mode. Since HW timer pending interrupt can be set directly with CSR status register, and pending HW timer interrupt checking is used in vcpu block checking function, it is not necessary to switch to SW timer during halt-polling. This patch adds preemption disabling in function kvm_cpu_has_pending_timer(), and removes SW timer switching in idle instruction emulation function. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/exit.c | 13 ++----------- arch/loongarch/kvm/timer.c | 12 +++++++++--- arch/loongarch/kvm/vcpu.c | 9 ++++++++- 3 files changed, 19 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index ce8de3fa472c..e708a1786d6b 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -200,17 +200,8 @@ int kvm_emu_idle(struct kvm_vcpu *vcpu) ++vcpu->stat.idle_exits; trace_kvm_exit_idle(vcpu, KVM_TRACE_EXIT_IDLE); - if (!kvm_arch_vcpu_runnable(vcpu)) { - /* - * Switch to the software timer before halt-polling/blocking as - * the guest's timer may be a break event for the vCPU, and the - * hypervisor timer runs only when the CPU is in guest mode. - * Switch before halt-polling so that KVM recognizes an expired - * timer before blocking. - */ - kvm_save_timer(vcpu); - kvm_vcpu_block(vcpu); - } + if (!kvm_arch_vcpu_runnable(vcpu)) + kvm_vcpu_halt(vcpu); return EMULATE_DONE; } diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c index 284bf553fefe..12d58040122d 100644 --- a/arch/loongarch/kvm/timer.c +++ b/arch/loongarch/kvm/timer.c @@ -155,11 +155,17 @@ static void _kvm_save_timer(struct kvm_vcpu *vcpu) */ hrtimer_cancel(&vcpu->arch.swtimer); hrtimer_start(&vcpu->arch.swtimer, expire, HRTIMER_MODE_ABS_PINNED); - } else + } else if (vcpu->stat.generic.blocking) { /* - * Inject timer interrupt so that hall polling can dectect and exit + * Inject timer interrupt so that halt polling can dectect and exit. + * VCPU is scheduled out already and sleeps in rcuwait queue and + * will not poll pending events again. kvm_queue_irq() is not enough, + * hrtimer swtimer should be used here. 
*/ - kvm_queue_irq(vcpu, INT_TI); + expire = ktime_add_ns(ktime_get(), 10); + vcpu->arch.expire = expire; + hrtimer_start(&vcpu->arch.swtimer, expire, HRTIMER_MODE_ABS_PINNED); + } } /* diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 73d0c2b9c1a5..54f544b30f32 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -187,8 +187,15 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - return kvm_pending_timer(vcpu) || + int ret; + + /* Protect from TOD sync and vcpu_load/put() */ + preempt_disable(); + ret = kvm_pending_timer(vcpu) || kvm_read_hw_gcsr(LOONGARCH_CSR_ESTAT) & (1 << INT_TI); + preempt_enable(); + + return ret; } int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 0d2abe67029644741bf7400b0d00c2faa3e1c455 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 19 Dec 2023 10:48:27 +0800 Subject: LoongArch: KVM: Allow to access HW timer CSR registers always Currently HW timer CSR registers are allowed to access before entering to vm and disabled if switch to SW timer in host mode, instead it is not necessary to do so. HW timer CSR registers can be accessed always, it is nothing to do with whether it is in vm mode or host mode. This patch removes the limitation. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/main.c | 1 - arch/loongarch/kvm/timer.c | 27 ++++++--------------------- 2 files changed, 6 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index 1c1d5199500e..86a2f2d0cb27 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -287,7 +287,6 @@ int kvm_arch_hardware_enable(void) if (env & CSR_GCFG_MATC_ROOT) gcfg |= CSR_GCFG_MATC_ROOT; - gcfg |= CSR_GCFG_TIT; write_csr_gcfg(gcfg); kvm_flush_tlb_all(); diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c index 12d58040122d..d6d5bcea349b 100644 --- a/arch/loongarch/kvm/timer.c +++ b/arch/loongarch/kvm/timer.c @@ -70,15 +70,6 @@ void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long timer_hz) */ void kvm_acquire_timer(struct kvm_vcpu *vcpu) { - unsigned long cfg; - - cfg = read_csr_gcfg(); - if (!(cfg & CSR_GCFG_TIT)) - return; - - /* Enable guest access to hard timer */ - write_csr_gcfg(cfg & ~CSR_GCFG_TIT); - /* * Freeze the soft-timer and sync the guest stable timer with it. We do * this with interrupts disabled to avoid latency. 
@@ -174,21 +165,15 @@ static void _kvm_save_timer(struct kvm_vcpu *vcpu) */ void kvm_save_timer(struct kvm_vcpu *vcpu) { - unsigned long cfg; struct loongarch_csrs *csr = vcpu->arch.csr; preempt_disable(); - cfg = read_csr_gcfg(); - if (!(cfg & CSR_GCFG_TIT)) { - /* Disable guest use of hard timer */ - write_csr_gcfg(cfg | CSR_GCFG_TIT); - - /* Save hard timer state */ - kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TCFG); - kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TVAL); - if (kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG) & CSR_TCFG_EN) - _kvm_save_timer(vcpu); - } + + /* Save hard timer state */ + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TCFG); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TVAL); + if (kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG) & CSR_TCFG_EN) + _kvm_save_timer(vcpu); /* Save timer-related state to vCPU context */ kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ESTAT); -- cgit v1.2.3 From 1ab9c6099495f79bfbcd6058d02d7556034a89b0 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 19 Dec 2023 10:48:28 +0800 Subject: LoongArch: KVM: Remove kvm_acquire_timer() before entering guest Timer emulation method in VM is switch to SW timer, there are two places where timer emulation is needed. One is during vcpu thread context switch, the other is halt-polling with idle instruction emulation. SW timer switching is removed during halt-polling mode, so it is not necessary to disable SW timer before entering to guest. This patch removes SW timer handling before entering guest mode, and put it in HW timer restoring flow when vcpu thread is sched-in. With this patch, vm timer emulation is simpler, there is SW/HW timer switch only in vcpu thread context switch scenario. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_vcpu.h | 1 - arch/loongarch/kvm/timer.c | 22 ++++++---------------- arch/loongarch/kvm/vcpu.c | 29 ----------------------------- 3 files changed, 6 insertions(+), 46 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h index 553cfa2b2b1c..0e87652f780a 100644 --- a/arch/loongarch/include/asm/kvm_vcpu.h +++ b/arch/loongarch/include/asm/kvm_vcpu.h @@ -55,7 +55,6 @@ void kvm_save_fpu(struct loongarch_fpu *fpu); void kvm_restore_fpu(struct loongarch_fpu *fpu); void kvm_restore_fcsr(struct loongarch_fpu *fpu); -void kvm_acquire_timer(struct kvm_vcpu *vcpu); void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long hz); void kvm_reset_timer(struct kvm_vcpu *vcpu); void kvm_save_timer(struct kvm_vcpu *vcpu); diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c index d6d5bcea349b..d362d87a54aa 100644 --- a/arch/loongarch/kvm/timer.c +++ b/arch/loongarch/kvm/timer.c @@ -64,19 +64,6 @@ void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long timer_hz) kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_TVAL, 0); } -/* - * Restore hard timer state and enable guest to access timer registers - * without trap, should be called with irq disabled - */ -void kvm_acquire_timer(struct kvm_vcpu *vcpu) -{ - /* - * Freeze the soft-timer and sync the guest stable timer with it. We do - * this with interrupts disabled to avoid latency. - */ - hrtimer_cancel(&vcpu->arch.swtimer); -} - /* * Restore soft timer state from saved context. */ @@ -98,6 +85,11 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu) return; } + /* + * Freeze the soft-timer and sync the guest stable timer with it. 
+ */ + hrtimer_cancel(&vcpu->arch.swtimer); + /* * Set remainder tick value if not expired */ @@ -115,7 +107,7 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu) /* * Inject timer here though sw timer should inject timer * interrupt async already, since sw timer may be cancelled - * during injecting intr async in function kvm_acquire_timer + * during injecting intr async */ kvm_queue_irq(vcpu, INT_TI); } @@ -140,11 +132,9 @@ static void _kvm_save_timer(struct kvm_vcpu *vcpu) vcpu->arch.expire = expire; if (ticks) { /* - * Update hrtimer to use new timeout * HRTIMER_MODE_PINNED is suggested since vcpu may run in * the same physical cpu in next time */ - hrtimer_cancel(&vcpu->arch.swtimer); hrtimer_start(&vcpu->arch.swtimer, expire, HRTIMER_MODE_ABS_PINNED); } else if (vcpu->stat.generic.blocking) { /* diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 54f544b30f32..53fcef8b24a1 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -95,7 +95,6 @@ static int kvm_pre_enter_guest(struct kvm_vcpu *vcpu) * check vmid before vcpu enter guest */ local_irq_disable(); - kvm_acquire_timer(vcpu); kvm_deliver_intr(vcpu); kvm_deliver_exception(vcpu); /* Make sure the vcpu mode has been written */ @@ -251,23 +250,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -EINVAL; } -/** - * kvm_migrate_count() - Migrate timer. - * @vcpu: Virtual CPU. - * - * Migrate hrtimer to the current CPU by cancelling and restarting it - * if the hrtimer is active. - * - * Must be called when the vCPU is migrated to a different CPU, so that - * the timer can interrupt the guest at the new CPU, and the timer irq can - * be delivered to the vCPU. - */ -static void kvm_migrate_count(struct kvm_vcpu *vcpu) -{ - if (hrtimer_cancel(&vcpu->arch.swtimer)) - hrtimer_restart(&vcpu->arch.swtimer); -} - static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 *val) { unsigned long gintc; @@ -796,17 +778,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) unsigned long flags; local_irq_save(flags); - if (vcpu->arch.last_sched_cpu != cpu) { - kvm_debug("[%d->%d]KVM vCPU[%d] switch\n", - vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); - /* - * Migrate the timer interrupt to the current CPU so that it - * always interrupts the guest and synchronously triggers a - * guest timer interrupt. - */ - kvm_migrate_count(vcpu); - } - /* Restore guest state to registers */ _kvm_vcpu_load(vcpu, cpu); local_irq_restore(flags); -- cgit v1.2.3 From 5b3d524993ff1fb36089be850ccb121ac3296bcf Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 19 Dec 2023 10:48:28 +0800 Subject: LoongArch: KVM: Fix timer emulation with oneshot mode When timer is fired in oneshot mode, CSR TVAL will be -1 rather than 0. There needs special handing for this situation. There are two scenarios when oneshot timer is fired. One scenario is that time is fired after exiting to host, CSR TVAL is set with 0 in order to inject hw interrupt, and -1 will assigned to CSR TVAL soon. The other situation is that timer is fired in VM and guest kernel is hanlding timer IRQ, IRQ is acked and is ready to set next expired timer value, then vm exits to host. Timer interrupt should not be inject at this point, else there will be spurious timer interrupt. Here hw timer irq status in CSR ESTAT is used to judge these two scenarios. 
If CSR TVAL is -1, the oneshot timer is fired; and if timer hw irq is on in CSR ESTAT register, it happens after exiting to host; else if timer hw irq is off, we think that it happens in vm and timer IRQ handler has already acked IRQ. With this patch, runltp with version ltp20230516 passes to run in vm. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/timer.c | 68 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c index d362d87a54aa..111328f60872 100644 --- a/arch/loongarch/kvm/timer.c +++ b/arch/loongarch/kvm/timer.c @@ -69,14 +69,19 @@ void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long timer_hz) */ void kvm_restore_timer(struct kvm_vcpu *vcpu) { - unsigned long cfg, delta, period; + unsigned long cfg, estat; + unsigned long ticks, delta, period; ktime_t expire, now; struct loongarch_csrs *csr = vcpu->arch.csr; /* * Set guest stable timer cfg csr + * Disable timer before restore estat CSR register, avoid to + * get invalid timer interrupt for old timer cfg */ cfg = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG); + + write_gcsr_timercfg(0); kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ESTAT); kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TCFG); if (!(cfg & CSR_TCFG_EN)) { @@ -90,20 +95,47 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu) */ hrtimer_cancel(&vcpu->arch.swtimer); + /* + * From LoongArch Reference Manual Volume 1 Chapter 7.6.2 + * If oneshot timer is fired, CSR TVAL will be -1, there are two + * conditions: + * 1) timer is fired during exiting to host + * 2) timer is fired and vm is doing timer irq, and then exiting to + * host. Host should not inject timer irq to avoid spurious + * timer interrupt again + */ + ticks = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TVAL); + estat = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT); + if (!(cfg & CSR_TCFG_PERIOD) && (ticks > cfg)) { + /* + * Writing 0 to LOONGARCH_CSR_TVAL will inject timer irq + * and set CSR TVAL with -1 + */ + write_gcsr_timertick(0); + + /* + * Writing CSR_TINTCLR_TI to LOONGARCH_CSR_TINTCLR will clear + * timer interrupt, and CSR TVAL keeps unchanged with -1, it + * avoids spurious timer interrupt + */ + if (!(estat & CPU_TIMER)) + gcsr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); + return; + } + /* * Set remainder tick value if not expired */ + delta = 0; now = ktime_get(); expire = vcpu->arch.expire; if (ktime_before(now, expire)) delta = ktime_to_tick(vcpu, ktime_sub(expire, now)); - else { - if (cfg & CSR_TCFG_PERIOD) { - period = cfg & CSR_TCFG_VAL; - delta = ktime_to_tick(vcpu, ktime_sub(now, expire)); - delta = period - (delta % period); - } else - delta = 0; + else if (cfg & CSR_TCFG_PERIOD) { + period = cfg & CSR_TCFG_VAL; + delta = ktime_to_tick(vcpu, ktime_sub(now, expire)); + delta = period - (delta % period); + /* * Inject timer here though sw timer should inject timer * interrupt async already, since sw timer may be cancelled @@ -122,15 +154,25 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu) */ static void _kvm_save_timer(struct kvm_vcpu *vcpu) { - unsigned long ticks, delta; + unsigned long ticks, delta, cfg; ktime_t expire; struct loongarch_csrs *csr = vcpu->arch.csr; + cfg = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG); ticks = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TVAL); - delta = tick_to_ns(vcpu, ticks); - expire = ktime_add_ns(ktime_get(), delta); - vcpu->arch.expire = expire; - if (ticks) { + + /* + * From LoongArch Reference Manual Volume 1 Chapter 7.6.2 + * If 
period timer is fired, CSR TVAL will be reloaded from CSR TCFG + * If oneshot timer is fired, CSR TVAL will be -1 + * Here judge one-shot timer fired by checking whether TVAL is larger + * than TCFG + */ + if (ticks < cfg) { + delta = tick_to_ns(vcpu, ticks); + expire = ktime_add_ns(ktime_get(), delta); + vcpu->arch.expire = expire; + /* * HRTIMER_MODE_PINNED is suggested since vcpu may run in * the same physical cpu in next time -- cgit v1.2.3 From db1ecca22edf27c5a3dd66af406c88b5b5ac7cc1 Mon Sep 17 00:00:00 2001 From: Tianrui Zhao Date: Tue, 19 Dec 2023 10:48:28 +0800 Subject: LoongArch: KVM: Add LSX (128bit SIMD) support This patch adds LSX (128bit SIMD) support for LoongArch KVM. There will be LSX exception in KVM when guest use the LSX instructions. KVM will enable LSX and restore the vector registers for guest and then return to guest to continue running. Signed-off-by: Tianrui Zhao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_host.h | 15 ++- arch/loongarch/include/asm/kvm_vcpu.h | 10 ++ arch/loongarch/include/uapi/asm/kvm.h | 1 + arch/loongarch/kernel/fpu.S | 1 + arch/loongarch/kvm/exit.c | 21 ++++ arch/loongarch/kvm/switch.S | 16 +++ arch/loongarch/kvm/trace.h | 4 +- arch/loongarch/kvm/vcpu.c | 220 +++++++++++++++++++++++++++++++++- 8 files changed, 280 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index 0e89db020481..b0c5cdd8014c 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -95,8 +95,9 @@ enum emulation_result { }; #define KVM_LARCH_FPU (0x1 << 0) -#define KVM_LARCH_SWCSR_LATEST (0x1 << 1) -#define KVM_LARCH_HWCSR_USABLE (0x1 << 2) +#define KVM_LARCH_LSX (0x1 << 1) +#define KVM_LARCH_SWCSR_LATEST (0x1 << 2) +#define KVM_LARCH_HWCSR_USABLE (0x1 << 3) struct kvm_vcpu_arch { /* @@ -178,6 +179,16 @@ static inline void writel_sw_gcsr(struct loongarch_csrs *csr, int reg, unsigned csr->csrs[reg] = val; } +static inline bool kvm_guest_has_fpu(struct kvm_vcpu_arch *arch) +{ + return arch->cpucfg[2] & CPUCFG2_FP; +} + +static inline bool kvm_guest_has_lsx(struct kvm_vcpu_arch *arch) +{ + return arch->cpucfg[2] & CPUCFG2_LSX; +} + /* Debug: dump vcpu state */ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h index 0e87652f780a..db08dd46b525 100644 --- a/arch/loongarch/include/asm/kvm_vcpu.h +++ b/arch/loongarch/include/asm/kvm_vcpu.h @@ -55,6 +55,16 @@ void kvm_save_fpu(struct loongarch_fpu *fpu); void kvm_restore_fpu(struct loongarch_fpu *fpu); void kvm_restore_fcsr(struct loongarch_fpu *fpu); +#ifdef CONFIG_CPU_HAS_LSX +int kvm_own_lsx(struct kvm_vcpu *vcpu); +void kvm_save_lsx(struct loongarch_fpu *fpu); +void kvm_restore_lsx(struct loongarch_fpu *fpu); +#else +static inline int kvm_own_lsx(struct kvm_vcpu *vcpu) { } +static inline void kvm_save_lsx(struct loongarch_fpu *fpu) { } +static inline void kvm_restore_lsx(struct loongarch_fpu *fpu) { } +#endif + void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long hz); void kvm_reset_timer(struct kvm_vcpu *vcpu); void kvm_save_timer(struct kvm_vcpu *vcpu); diff --git a/arch/loongarch/include/uapi/asm/kvm.h b/arch/loongarch/include/uapi/asm/kvm.h index c6ad2ee6106c..923d0bd38294 100644 --- a/arch/loongarch/include/uapi/asm/kvm.h +++ b/arch/loongarch/include/uapi/asm/kvm.h @@ -79,6 +79,7 @@ struct kvm_fpu { #define LOONGARCH_REG_64(TYPE, REG) (TYPE | KVM_REG_SIZE_U64 | (REG << 
LOONGARCH_REG_SHIFT)) #define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG) #define KVM_IOC_CPUCFG(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG) +#define KVM_LOONGARCH_VCPU_CPUCFG 0 struct kvm_debug_exit_arch { }; diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S index d53ab10f4644..a400924c0348 100644 --- a/arch/loongarch/kernel/fpu.S +++ b/arch/loongarch/kernel/fpu.S @@ -349,6 +349,7 @@ SYM_FUNC_START(_restore_lsx_upper) lsx_restore_all_upper a0 t0 t1 jr ra SYM_FUNC_END(_restore_lsx_upper) +EXPORT_SYMBOL(_restore_lsx_upper) SYM_FUNC_START(_init_lsx_upper) lsx_init_all_upper t1 diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index e708a1786d6b..676f7a3a335c 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -634,6 +634,11 @@ static int kvm_handle_fpu_disabled(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; + if (!kvm_guest_has_fpu(&vcpu->arch)) { + kvm_queue_exception(vcpu, EXCCODE_INE, 0); + return RESUME_GUEST; + } + /* * If guest FPU not present, the FPU operation should have been * treated as a reserved instruction! @@ -650,6 +655,21 @@ static int kvm_handle_fpu_disabled(struct kvm_vcpu *vcpu) return RESUME_GUEST; } +/* + * kvm_handle_lsx_disabled() - Guest used LSX while disabled in root. + * @vcpu: Virtual CPU context. + * + * Handle when the guest attempts to use LSX when it is disabled in the root + * context. + */ +static int kvm_handle_lsx_disabled(struct kvm_vcpu *vcpu) +{ + if (kvm_own_lsx(vcpu)) + kvm_queue_exception(vcpu, EXCCODE_INE, 0); + + return RESUME_GUEST; +} + /* * LoongArch KVM callback handling for unimplemented guest exiting */ @@ -678,6 +698,7 @@ static exit_handle_fn kvm_fault_tables[EXCCODE_INT_START] = { [EXCCODE_TLBS] = kvm_handle_write_fault, [EXCCODE_TLBM] = kvm_handle_write_fault, [EXCCODE_FPDIS] = kvm_handle_fpu_disabled, + [EXCCODE_LSXDIS] = kvm_handle_lsx_disabled, [EXCCODE_GSPR] = kvm_handle_gspr, }; diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S index 0ed9040307b7..00fbf772d16f 100644 --- a/arch/loongarch/kvm/switch.S +++ b/arch/loongarch/kvm/switch.S @@ -245,6 +245,22 @@ SYM_FUNC_START(kvm_restore_fpu) jr ra SYM_FUNC_END(kvm_restore_fpu) +#ifdef CONFIG_CPU_HAS_LSX +SYM_FUNC_START(kvm_save_lsx) + fpu_save_csr a0 t1 + fpu_save_cc a0 t1 t2 + lsx_save_data a0 t1 + jr ra +SYM_FUNC_END(kvm_save_lsx) + +SYM_FUNC_START(kvm_restore_lsx) + lsx_restore_data a0 t1 + fpu_restore_cc a0 t1 t2 + fpu_restore_csr a0 t1 t2 + jr ra +SYM_FUNC_END(kvm_restore_lsx) +#endif + .section ".rodata" SYM_DATA(kvm_exception_size, .quad kvm_exc_entry_end - kvm_exc_entry) SYM_DATA(kvm_enter_guest_size, .quad kvm_enter_guest_end - kvm_enter_guest) diff --git a/arch/loongarch/kvm/trace.h b/arch/loongarch/kvm/trace.h index a1e35d655418..7da4e230e896 100644 --- a/arch/loongarch/kvm/trace.h +++ b/arch/loongarch/kvm/trace.h @@ -102,6 +102,7 @@ TRACE_EVENT(kvm_exit_gspr, #define KVM_TRACE_AUX_DISCARD 4 #define KVM_TRACE_AUX_FPU 1 +#define KVM_TRACE_AUX_LSX 2 #define kvm_trace_symbol_aux_op \ { KVM_TRACE_AUX_SAVE, "save" }, \ @@ -111,7 +112,8 @@ TRACE_EVENT(kvm_exit_gspr, { KVM_TRACE_AUX_DISCARD, "discard" } #define kvm_trace_symbol_aux_state \ - { KVM_TRACE_AUX_FPU, "FPU" } + { KVM_TRACE_AUX_FPU, "FPU" }, \ + { KVM_TRACE_AUX_LSX, "LSX" } TRACE_EVENT(kvm_aux, TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op, diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 53fcef8b24a1..80487d177ca4 100644 --- a/arch/loongarch/kvm/vcpu.c +++ 
b/arch/loongarch/kvm/vcpu.c @@ -298,6 +298,69 @@ static int _kvm_setcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 val) return ret; } +static int _kvm_get_cpucfg(int id, u64 *v) +{ + int ret = 0; + + if (id < 0 && id >= KVM_MAX_CPUCFG_REGS) + return -EINVAL; + + switch (id) { + case 2: + /* Return CPUCFG2 features which have been supported by KVM */ + *v = CPUCFG2_FP | CPUCFG2_FPSP | CPUCFG2_FPDP | + CPUCFG2_FPVERS | CPUCFG2_LLFTP | CPUCFG2_LLFTPREV | + CPUCFG2_LAM; + /* + * If LSX is supported by CPU, it is also supported by KVM, + * as we implement it. + */ + if (cpu_has_lsx) + *v |= CPUCFG2_LSX; + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static int kvm_check_cpucfg(int id, u64 val) +{ + u64 mask; + int ret = 0; + + if (id < 0 && id >= KVM_MAX_CPUCFG_REGS) + return -EINVAL; + + if (_kvm_get_cpucfg(id, &mask)) + return ret; + + switch (id) { + case 2: + /* CPUCFG2 features checking */ + if (val & ~mask) + /* The unsupported features should not be set */ + ret = -EINVAL; + else if (!(val & CPUCFG2_LLFTP)) + /* The LLFTP must be set, as guest must has a constant timer */ + ret = -EINVAL; + else if ((val & CPUCFG2_FP) && (!(val & CPUCFG2_FPSP) || !(val & CPUCFG2_FPDP))) + /* Single and double float point must both be set when enable FP */ + ret = -EINVAL; + else if ((val & CPUCFG2_LSX) && !(val & CPUCFG2_FP)) + /* FP should be set when enable LSX */ + ret = -EINVAL; + else if ((val & CPUCFG2_LASX) && !(val & CPUCFG2_LSX)) + /* LSX, FP should be set when enable LASX, and FP has been checked before. */ + ret = -EINVAL; + break; + default: + break; + } + return ret; +} + static int kvm_get_one_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, u64 *v) { @@ -367,10 +430,10 @@ static int kvm_set_one_reg(struct kvm_vcpu *vcpu, break; case KVM_REG_LOONGARCH_CPUCFG: id = KVM_GET_IOC_CPUCFG_IDX(reg->id); - if (id >= 0 && id < KVM_MAX_CPUCFG_REGS) - vcpu->arch.cpucfg[id] = (u32)v; - else - ret = -EINVAL; + ret = kvm_check_cpucfg(id, v); + if (ret) + break; + vcpu->arch.cpucfg[id] = (u32)v; break; case KVM_REG_LOONGARCH_KVM: switch (reg->id) { @@ -460,10 +523,94 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return -EINVAL; } +static int kvm_loongarch_cpucfg_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case 2: + return 0; + default: + return -ENXIO; + } + + return -ENXIO; +} + +static int kvm_loongarch_vcpu_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->group) { + case KVM_LOONGARCH_VCPU_CPUCFG: + ret = kvm_loongarch_cpucfg_has_attr(vcpu, attr); + break; + default: + break; + } + + return ret; +} + +static int kvm_loongarch_get_cpucfg_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret = 0; + uint64_t val; + uint64_t __user *uaddr = (uint64_t __user *)attr->addr; + + ret = _kvm_get_cpucfg(attr->attr, &val); + if (ret) + return ret; + + put_user(val, uaddr); + + return ret; +} + +static int kvm_loongarch_vcpu_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->group) { + case KVM_LOONGARCH_VCPU_CPUCFG: + ret = kvm_loongarch_get_cpucfg_attr(vcpu, attr); + break; + default: + break; + } + + return ret; +} + +static int kvm_loongarch_cpucfg_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + return -ENXIO; +} + +static int kvm_loongarch_vcpu_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->group) { + case 
KVM_LOONGARCH_VCPU_CPUCFG: + ret = kvm_loongarch_cpucfg_set_attr(vcpu, attr); + break; + default: + break; + } + + return ret; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { long r; + struct kvm_device_attr attr; void __user *argp = (void __user *)arg; struct kvm_vcpu *vcpu = filp->private_data; @@ -503,6 +650,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } + case KVM_HAS_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, argp, sizeof(attr))) + break; + r = kvm_loongarch_vcpu_has_attr(vcpu, &attr); + break; + } + case KVM_GET_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, argp, sizeof(attr))) + break; + r = kvm_loongarch_vcpu_get_attr(vcpu, &attr); + break; + } + case KVM_SET_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, argp, sizeof(attr))) + break; + r = kvm_loongarch_vcpu_set_attr(vcpu, &attr); + break; + } default: r = -ENOIOCTLCMD; break; @@ -550,12 +718,54 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu) preempt_enable(); } +#ifdef CONFIG_CPU_HAS_LSX +/* Enable LSX and restore context */ +int kvm_own_lsx(struct kvm_vcpu *vcpu) +{ + if (!kvm_guest_has_fpu(&vcpu->arch) || !kvm_guest_has_lsx(&vcpu->arch)) + return -EINVAL; + + preempt_disable(); + + /* Enable LSX for guest */ + set_csr_euen(CSR_EUEN_LSXEN | CSR_EUEN_FPEN); + switch (vcpu->arch.aux_inuse & KVM_LARCH_FPU) { + case KVM_LARCH_FPU: + /* + * Guest FPU state already loaded, + * only restore upper LSX state + */ + _restore_lsx_upper(&vcpu->arch.fpu); + break; + default: + /* Neither FP or LSX already active, + * restore full LSX state + */ + kvm_restore_lsx(&vcpu->arch.fpu); + break; + } + + trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_LSX); + vcpu->arch.aux_inuse |= KVM_LARCH_LSX | KVM_LARCH_FPU; + preempt_enable(); + + return 0; +} +#endif + /* Save context and disable FPU */ void kvm_lose_fpu(struct kvm_vcpu *vcpu) { preempt_disable(); - if (vcpu->arch.aux_inuse & KVM_LARCH_FPU) { + if (vcpu->arch.aux_inuse & KVM_LARCH_LSX) { + kvm_save_lsx(&vcpu->arch.fpu); + vcpu->arch.aux_inuse &= ~(KVM_LARCH_LSX | KVM_LARCH_FPU); + trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_LSX); + + /* Disable LSX & FPU */ + clear_csr_euen(CSR_EUEN_FPEN | CSR_EUEN_LSXEN); + } else if (vcpu->arch.aux_inuse & KVM_LARCH_FPU) { kvm_save_fpu(&vcpu->arch.fpu); vcpu->arch.aux_inuse &= ~KVM_LARCH_FPU; trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU); -- cgit v1.2.3 From 118e10cd893d57df55b3302dfd188a981b6e6d1c Mon Sep 17 00:00:00 2001 From: Tianrui Zhao Date: Tue, 19 Dec 2023 10:48:28 +0800 Subject: LoongArch: KVM: Add LASX (256bit SIMD) support This patch adds LASX (256bit SIMD) support for LoongArch KVM. There will be LASX exception in KVM when guest use the LASX instructions. KVM will enable LASX and restore the vector registers for guest and then return to guest to continue running. 
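The save/restore decision is the interesting part: only the vector state that is not already live needs to be reloaded before re-entering the guest. A toy, hedged model of that decision (names and output are illustrative only; the real logic is kvm_own_lasx()/kvm_lose_fpu() in the diff below):

#include <stdio.h>

#define ST_FPU	(1u << 0)	/* scalar FP regs live */
#define ST_LSX	(1u << 1)	/* 128-bit regs live   */
#define ST_LASX	(1u << 2)	/* 256-bit regs live   */

/* On an "LASX disabled" exit: widen whatever is already loaded. */
static unsigned int toy_own_lasx(unsigned int live)
{
	if (live & ST_LSX)
		puts("restore only the upper 128 bits of each 256-bit reg");
	else if (live & ST_FPU)
		puts("restore upper LSX halves, then upper LASX halves");
	else
		puts("restore the full LASX register file");

	return live | ST_FPU | ST_LSX | ST_LASX;
}

int main(void)
{
	toy_own_lasx(ST_FPU);	/* e.g. guest FPU state already loaded */
	return 0;
}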
Reviewed-by: Bibo Mao Signed-off-by: Tianrui Zhao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_host.h | 10 +++++-- arch/loongarch/include/asm/kvm_vcpu.h | 10 +++++++ arch/loongarch/kernel/fpu.S | 1 + arch/loongarch/kvm/exit.c | 16 +++++++++++ arch/loongarch/kvm/switch.S | 15 +++++++++++ arch/loongarch/kvm/trace.h | 4 ++- arch/loongarch/kvm/vcpu.c | 51 ++++++++++++++++++++++++++++++++++- 7 files changed, 103 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index b0c5cdd8014c..5bdb34b2c5d6 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -96,8 +96,9 @@ enum emulation_result { #define KVM_LARCH_FPU (0x1 << 0) #define KVM_LARCH_LSX (0x1 << 1) -#define KVM_LARCH_SWCSR_LATEST (0x1 << 2) -#define KVM_LARCH_HWCSR_USABLE (0x1 << 3) +#define KVM_LARCH_LASX (0x1 << 2) +#define KVM_LARCH_SWCSR_LATEST (0x1 << 3) +#define KVM_LARCH_HWCSR_USABLE (0x1 << 4) struct kvm_vcpu_arch { /* @@ -189,6 +190,11 @@ static inline bool kvm_guest_has_lsx(struct kvm_vcpu_arch *arch) return arch->cpucfg[2] & CPUCFG2_LSX; } +static inline bool kvm_guest_has_lasx(struct kvm_vcpu_arch *arch) +{ + return arch->cpucfg[2] & CPUCFG2_LASX; +} + /* Debug: dump vcpu state */ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h index db08dd46b525..e71ceb88f29e 100644 --- a/arch/loongarch/include/asm/kvm_vcpu.h +++ b/arch/loongarch/include/asm/kvm_vcpu.h @@ -65,6 +65,16 @@ static inline void kvm_save_lsx(struct loongarch_fpu *fpu) { } static inline void kvm_restore_lsx(struct loongarch_fpu *fpu) { } #endif +#ifdef CONFIG_CPU_HAS_LASX +int kvm_own_lasx(struct kvm_vcpu *vcpu); +void kvm_save_lasx(struct loongarch_fpu *fpu); +void kvm_restore_lasx(struct loongarch_fpu *fpu); +#else +static inline int kvm_own_lasx(struct kvm_vcpu *vcpu) { } +static inline void kvm_save_lasx(struct loongarch_fpu *fpu) { } +static inline void kvm_restore_lasx(struct loongarch_fpu *fpu) { } +#endif + void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long hz); void kvm_reset_timer(struct kvm_vcpu *vcpu); void kvm_save_timer(struct kvm_vcpu *vcpu); diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S index a400924c0348..4382e36ae3d4 100644 --- a/arch/loongarch/kernel/fpu.S +++ b/arch/loongarch/kernel/fpu.S @@ -385,6 +385,7 @@ SYM_FUNC_START(_restore_lasx_upper) lasx_restore_all_upper a0 t0 t1 jr ra SYM_FUNC_END(_restore_lasx_upper) +EXPORT_SYMBOL(_restore_lasx_upper) SYM_FUNC_START(_init_lasx_upper) lasx_init_all_upper t1 diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index 676f7a3a335c..ed1d89d53e2e 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -670,6 +670,21 @@ static int kvm_handle_lsx_disabled(struct kvm_vcpu *vcpu) return RESUME_GUEST; } +/* + * kvm_handle_lasx_disabled() - Guest used LASX while disabled in root. + * @vcpu: Virtual CPU context. + * + * Handle when the guest attempts to use LASX when it is disabled in the root + * context. 
+ */ +static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu) +{ + if (kvm_own_lasx(vcpu)) + kvm_queue_exception(vcpu, EXCCODE_INE, 0); + + return RESUME_GUEST; +} + /* * LoongArch KVM callback handling for unimplemented guest exiting */ @@ -699,6 +714,7 @@ static exit_handle_fn kvm_fault_tables[EXCCODE_INT_START] = { [EXCCODE_TLBM] = kvm_handle_write_fault, [EXCCODE_FPDIS] = kvm_handle_fpu_disabled, [EXCCODE_LSXDIS] = kvm_handle_lsx_disabled, + [EXCCODE_LASXDIS] = kvm_handle_lasx_disabled, [EXCCODE_GSPR] = kvm_handle_gspr, }; diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S index 00fbf772d16f..ba976509bfe8 100644 --- a/arch/loongarch/kvm/switch.S +++ b/arch/loongarch/kvm/switch.S @@ -261,6 +261,21 @@ SYM_FUNC_START(kvm_restore_lsx) SYM_FUNC_END(kvm_restore_lsx) #endif +#ifdef CONFIG_CPU_HAS_LASX +SYM_FUNC_START(kvm_save_lasx) + fpu_save_csr a0 t1 + fpu_save_cc a0 t1 t2 + lasx_save_data a0 t1 + jr ra +SYM_FUNC_END(kvm_save_lasx) + +SYM_FUNC_START(kvm_restore_lasx) + lasx_restore_data a0 t1 + fpu_restore_cc a0 t1 t2 + fpu_restore_csr a0 t1 t2 + jr ra +SYM_FUNC_END(kvm_restore_lasx) +#endif .section ".rodata" SYM_DATA(kvm_exception_size, .quad kvm_exc_entry_end - kvm_exc_entry) SYM_DATA(kvm_enter_guest_size, .quad kvm_enter_guest_end - kvm_enter_guest) diff --git a/arch/loongarch/kvm/trace.h b/arch/loongarch/kvm/trace.h index 7da4e230e896..c2484ad4cffa 100644 --- a/arch/loongarch/kvm/trace.h +++ b/arch/loongarch/kvm/trace.h @@ -103,6 +103,7 @@ TRACE_EVENT(kvm_exit_gspr, #define KVM_TRACE_AUX_FPU 1 #define KVM_TRACE_AUX_LSX 2 +#define KVM_TRACE_AUX_LASX 3 #define kvm_trace_symbol_aux_op \ { KVM_TRACE_AUX_SAVE, "save" }, \ @@ -113,7 +114,8 @@ TRACE_EVENT(kvm_exit_gspr, #define kvm_trace_symbol_aux_state \ { KVM_TRACE_AUX_FPU, "FPU" }, \ - { KVM_TRACE_AUX_LSX, "LSX" } + { KVM_TRACE_AUX_LSX, "LSX" }, \ + { KVM_TRACE_AUX_LASX, "LASX" } TRACE_EVENT(kvm_aux, TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op, diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 80487d177ca4..27701991886d 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -317,6 +317,13 @@ static int _kvm_get_cpucfg(int id, u64 *v) */ if (cpu_has_lsx) *v |= CPUCFG2_LSX; + /* + * if LASX is supported by CPU, it is also supported by KVM, + * as we implement it. 
+ */ + if (cpu_has_lasx) + *v |= CPUCFG2_LASX; + break; default: ret = -EINVAL; @@ -753,12 +760,54 @@ int kvm_own_lsx(struct kvm_vcpu *vcpu) } #endif +#ifdef CONFIG_CPU_HAS_LASX +/* Enable LASX and restore context */ +int kvm_own_lasx(struct kvm_vcpu *vcpu) +{ + if (!kvm_guest_has_fpu(&vcpu->arch) || !kvm_guest_has_lsx(&vcpu->arch) || !kvm_guest_has_lasx(&vcpu->arch)) + return -EINVAL; + + preempt_disable(); + + set_csr_euen(CSR_EUEN_FPEN | CSR_EUEN_LSXEN | CSR_EUEN_LASXEN); + switch (vcpu->arch.aux_inuse & (KVM_LARCH_FPU | KVM_LARCH_LSX)) { + case KVM_LARCH_LSX: + case KVM_LARCH_LSX | KVM_LARCH_FPU: + /* Guest LSX state already loaded, only restore upper LASX state */ + _restore_lasx_upper(&vcpu->arch.fpu); + break; + case KVM_LARCH_FPU: + /* Guest FP state already loaded, only restore upper LSX & LASX state */ + _restore_lsx_upper(&vcpu->arch.fpu); + _restore_lasx_upper(&vcpu->arch.fpu); + break; + default: + /* Neither FP or LSX already active, restore full LASX state */ + kvm_restore_lasx(&vcpu->arch.fpu); + break; + } + + trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_LASX); + vcpu->arch.aux_inuse |= KVM_LARCH_LASX | KVM_LARCH_LSX | KVM_LARCH_FPU; + preempt_enable(); + + return 0; +} +#endif + /* Save context and disable FPU */ void kvm_lose_fpu(struct kvm_vcpu *vcpu) { preempt_disable(); - if (vcpu->arch.aux_inuse & KVM_LARCH_LSX) { + if (vcpu->arch.aux_inuse & KVM_LARCH_LASX) { + kvm_save_lasx(&vcpu->arch.fpu); + vcpu->arch.aux_inuse &= ~(KVM_LARCH_LSX | KVM_LARCH_FPU | KVM_LARCH_LASX); + trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_LASX); + + /* Disable LASX & LSX & FPU */ + clear_csr_euen(CSR_EUEN_FPEN | CSR_EUEN_LSXEN | CSR_EUEN_LASXEN); + } else if (vcpu->arch.aux_inuse & KVM_LARCH_LSX) { kvm_save_lsx(&vcpu->arch.fpu); vcpu->arch.aux_inuse &= ~(KVM_LARCH_LSX | KVM_LARCH_FPU); trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_LSX); -- cgit v1.2.3 From 2bfc654b89c4dd1c372bb2cbba6b5a0eb578d214 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 9 Nov 2023 15:47:49 +0000 Subject: arm64: cpufeatures: Restrict NV support to FEAT_NV2 To anyone who has played with FEAT_NV, it is obvious that the level of performance is rather low due to the trap amplification that it imposes on the host hypervisor. FEAT_NV2 solves a number of the problems that FEAT_NV had. It also turns out that all the existing hardware that has FEAT_NV also has FEAT_NV2. Finally, it is now allowed by the architecture to build FEAT_NV2 *only* (as denoted by ID_AA64MMFR4_EL1.NV_frac), which effectively seals the fate of FEAT_NV. Restrict the NV support to NV2, and be done with it. Nobody will cry over the old crap. NV_frac will eventually be supported once the intrastructure is ready. 
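For illustration only, the tightened capability check amounts to roughly the following; the helper name is invented and the accessors are the usual arm64 cpufeature ones, so treat this as a sketch rather than the code the patch adds:

static bool cpu_supports_nv2(void)
{
	u64 mmfr2 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
	unsigned int nv;

	/* Extract ID_AA64MMFR2_EL1.NV and require at least the NV2 value */
	nv = cpuid_feature_extract_unsigned_field(mmfr2, ID_AA64MMFR2_EL1_NV_SHIFT);

	return nv >= ID_AA64MMFR2_EL1_NV_NV2;
}

With ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2) in the hunk below, CPUs that only implement plain FEAT_NV (field value 1) no longer match the capability.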
Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/kernel/cpufeature.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 646591c67e7a..1329e974d187 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2339,7 +2339,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_NESTED_VIRT, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_nested_virt_support, - ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, IMP) + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2) }, { .capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE, -- cgit v1.2.3 From 111903d1f5b9334d1100e1c6ee08e740fa374d91 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 13 Nov 2023 14:16:02 +0000 Subject: KVM: arm64: nv: Hoist vcpu_has_nv() into is_hyp_ctxt() A rather common idiom when writing NV code as part of KVM is to have things such has: if (vcpu_has_nv(vcpu) && is_hyp_ctxt(vcpu)) { [...] } to check that we are in a hyp-related context. The second part of the conjunction would be enough, but the first one contains a static key that allows the rest of the checkis to be elided when in a non-NV environment. Rewrite is_hyp_ctxt() to directly use vcpu_has_nv(). The result is the same, and the code easier to read. The one occurence of this that is already merged is rewritten in the process. In order to avoid nasty cirtular dependencies between kvm_emulate.h and kvm_nested.h, vcpu_has_feature() is itself hoisted into kvm_host.h, at the cost of some #deferry... Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 8 ++------ arch/arm64/include/asm/kvm_host.h | 7 +++++++ arch/arm64/kvm/arch_timer.c | 3 +-- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 78a550537b67..84829b7e6f1d 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -54,11 +55,6 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu); int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2); int kvm_inject_nested_irq(struct kvm_vcpu *vcpu); -static inline bool vcpu_has_feature(const struct kvm_vcpu *vcpu, int feature) -{ - return test_bit(feature, vcpu->kvm->arch.vcpu_features); -} - #if defined(__KVM_VHE_HYPERVISOR__) || defined(__KVM_NVHE_HYPERVISOR__) static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) { @@ -248,7 +244,7 @@ static inline bool __is_hyp_ctxt(const struct kvm_cpu_context *ctxt) static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu) { - return __is_hyp_ctxt(&vcpu->arch.ctxt); + return vcpu_has_nv(vcpu) && __is_hyp_ctxt(&vcpu->arch.ctxt); } /* diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 824f29f04916..4103a12ecaaf 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1177,6 +1177,13 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); #define kvm_vm_has_ran_once(kvm) \ (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &(kvm)->arch.flags)) +static inline bool __vcpu_has_feature(const struct kvm_arch *ka, int feature) +{ + return test_bit(feature, ka->vcpu_features); +} + +#define vcpu_has_feature(v, f) __vcpu_has_feature(&(v)->kvm->arch, (f)) + int kvm_trng_call(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM 
extern phys_addr_t hyp_mem_base; diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 13ba691b848f..9dec8c419bf4 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -295,8 +295,7 @@ static u64 wfit_delay_ns(struct kvm_vcpu *vcpu) u64 val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); struct arch_timer_context *ctx; - ctx = (vcpu_has_nv(vcpu) && is_hyp_ctxt(vcpu)) ? vcpu_hvtimer(vcpu) - : vcpu_vtimer(vcpu); + ctx = is_hyp_ctxt(vcpu) ? vcpu_hvtimer(vcpu) : vcpu_vtimer(vcpu); return kvm_counter_compute_delta(ctx, val); } -- cgit v1.2.3 From 3ed0b5123cd5a2a4f1fe4e594e7bf319e9eaf1da Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 12 Nov 2023 21:05:14 +0000 Subject: KVM: arm64: nv: Compute NV view of idregs as a one-off Now that we have a full copy of the idregs for each VM, there is no point in repainting the sysregs on each access. Instead, we can simply perform the transmation as a one-off and be done with it. Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/kvm_nested.h | 6 +----- arch/arm64/kvm/arm.c | 6 ++++++ arch/arm64/kvm/nested.c | 22 +++++++++++++++------- arch/arm64/kvm/sys_regs.c | 2 -- 5 files changed, 23 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 4103a12ecaaf..fce2e5f583a7 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -306,6 +306,7 @@ struct kvm_arch { * Atomic access to multiple idregs are guarded by kvm_arch.config_lock. */ #define IDREG_IDX(id) (((sys_reg_CRm(id) - 1) << 3) | sys_reg_Op2(id)) +#define IDX_IDREG(idx) sys_reg(3, 0, 0, ((idx) >> 3) + 1, (idx) & Op2_mask) #define IDREG(kvm, id) ((kvm)->arch.id_regs[IDREG_IDX(id)]) #define KVM_ARM_ID_REG_NUM (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1) u64 id_regs[KVM_ARM_ID_REG_NUM]; diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 6cec8e9c6c91..249b03fc2cce 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -14,10 +14,6 @@ static inline bool vcpu_has_nv(const struct kvm_vcpu *vcpu) extern bool __check_nv_sr_forward(struct kvm_vcpu *vcpu); -struct sys_reg_params; -struct sys_reg_desc; - -void access_nested_id_reg(struct kvm_vcpu *v, struct sys_reg_params *p, - const struct sys_reg_desc *r); +int kvm_init_nv_sysregs(struct kvm *kvm); #endif /* __ARM64_KVM_NESTED_H */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e5f75f1f1085..b65df612b41b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -669,6 +669,12 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; } + if (vcpu_has_nv(vcpu)) { + ret = kvm_init_nv_sysregs(vcpu->kvm); + if (ret) + return ret; + } + ret = kvm_timer_enable(vcpu); if (ret) return ret; diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 042695a210ce..ba95d044bc98 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -23,13 +23,9 @@ * This list should get updated as new features get added to the NV * support, and new extension to the architecture. 
*/ -void access_nested_id_reg(struct kvm_vcpu *v, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static u64 limit_nv_id_reg(u32 id, u64 val) { - u32 id = reg_to_encoding(r); - u64 val, tmp; - - val = p->regval; + u64 tmp; switch (id) { case SYS_ID_AA64ISAR0_EL1: @@ -158,5 +154,17 @@ void access_nested_id_reg(struct kvm_vcpu *v, struct sys_reg_params *p, break; } - p->regval = val; + return val; +} +int kvm_init_nv_sysregs(struct kvm *kvm) +{ + mutex_lock(&kvm->arch.config_lock); + + for (int i = 0; i < KVM_ARM_ID_REG_NUM; i++) + kvm->arch.id_regs[i] = limit_nv_id_reg(IDX_IDREG(i), + kvm->arch.id_regs[i]); + + mutex_unlock(&kvm->arch.config_lock); + + return 0; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 4735e1b37fb3..3eae84195d48 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1505,8 +1505,6 @@ static bool access_id_reg(struct kvm_vcpu *vcpu, return write_to_read_only(vcpu, p, r); p->regval = read_id_reg(vcpu, r); - if (vcpu_has_nv(vcpu)) - access_nested_id_reg(vcpu, p, r); return true; } -- cgit v1.2.3 From 4d4f52052ba8357f1591cb9bc3086541070711af Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 8 Nov 2023 19:10:12 +0000 Subject: KVM: arm64: nv: Drop EL12 register traps that are redirected to VNCR With FEAT_NV2, a bunch of system register writes are turned into memory writes. This is specially the fate of the EL12 registers that the guest hypervisor manipulates out of context. Remove the trap descriptors for those, as they are never going to be used again. Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 3eae84195d48..cdfc79ccc7a1 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2566,21 +2566,6 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(CNTVOFF_EL2, access_rw, reset_val, 0), EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), - EL12_REG(SCTLR, access_vm_reg, reset_val, 0x00C50078), - EL12_REG(CPACR, access_rw, reset_val, 0), - EL12_REG(TTBR0, access_vm_reg, reset_unknown, 0), - EL12_REG(TTBR1, access_vm_reg, reset_unknown, 0), - EL12_REG(TCR, access_vm_reg, reset_val, 0), - { SYS_DESC(SYS_SPSR_EL12), access_spsr}, - { SYS_DESC(SYS_ELR_EL12), access_elr}, - EL12_REG(AFSR0, access_vm_reg, reset_unknown, 0), - EL12_REG(AFSR1, access_vm_reg, reset_unknown, 0), - EL12_REG(ESR, access_vm_reg, reset_unknown, 0), - EL12_REG(FAR, access_vm_reg, reset_unknown, 0), - EL12_REG(MAIR, access_vm_reg, reset_unknown, 0), - EL12_REG(AMAIR, access_vm_reg, reset_amair_el1, 0), - EL12_REG(VBAR, access_rw, reset_val, 0), - EL12_REG(CONTEXTIDR, access_vm_reg, reset_val, 0), EL12_REG(CNTKCTL, access_rw, reset_val, 0), EL2_REG(SP_EL2, NULL, reset_unknown, 0), -- cgit v1.2.3 From 3606e0b2e462164bced151dbb54ccfe42ac6c35b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 25 Dec 2016 10:49:48 -0500 Subject: KVM: arm64: nv: Add non-VHE-EL2->EL1 translation helpers Some EL2 system registers immediately affect the current execution of the system, so we need to use their respective EL1 counterparts. For this we need to define a mapping between the two. In general, this only affects non-VHE guest hypervisors, as VHE system registers are compatible with the EL1 counterparts. These helpers will get used in subsequent patches. 
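As a hypothetical usage sketch (the in-tree callers only arrive in later patches, and the variable name here is invented), a non-VHE guest hypervisor's EL2 value would be converted into the EL1 layout before being written to the hardware:

	/* Hypothetical example, not taken from the series itself */
	u64 v_sctlr_el2 = __vcpu_sys_reg(vcpu, SCTLR_EL2);

	/* Massage the EL2 encoding into its EL1 equivalent before loading it */
	write_sysreg_el1(translate_sctlr_el2_to_sctlr_el1(v_sctlr_el2), SYS_SCTLR);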
Reviewed-by: Oliver Upton Co-developed-by: Andre Przywara Signed-off-by: Andre Przywara Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_nested.h | 50 ++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 249b03fc2cce..4882905357f4 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -2,8 +2,9 @@ #ifndef __ARM64_KVM_NESTED_H #define __ARM64_KVM_NESTED_H -#include +#include #include +#include static inline bool vcpu_has_nv(const struct kvm_vcpu *vcpu) { @@ -12,6 +13,53 @@ static inline bool vcpu_has_nv(const struct kvm_vcpu *vcpu) vcpu_has_feature(vcpu, KVM_ARM_VCPU_HAS_EL2)); } +/* Translation helpers from non-VHE EL2 to EL1 */ +static inline u64 tcr_el2_ps_to_tcr_el1_ips(u64 tcr_el2) +{ + return (u64)FIELD_GET(TCR_EL2_PS_MASK, tcr_el2) << TCR_IPS_SHIFT; +} + +static inline u64 translate_tcr_el2_to_tcr_el1(u64 tcr) +{ + return TCR_EPD1_MASK | /* disable TTBR1_EL1 */ + ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) | + tcr_el2_ps_to_tcr_el1_ips(tcr) | + (tcr & TCR_EL2_TG0_MASK) | + (tcr & TCR_EL2_ORGN0_MASK) | + (tcr & TCR_EL2_IRGN0_MASK) | + (tcr & TCR_EL2_T0SZ_MASK); +} + +static inline u64 translate_cptr_el2_to_cpacr_el1(u64 cptr_el2) +{ + u64 cpacr_el1 = 0; + + if (cptr_el2 & CPTR_EL2_TTA) + cpacr_el1 |= CPACR_ELx_TTA; + if (!(cptr_el2 & CPTR_EL2_TFP)) + cpacr_el1 |= CPACR_ELx_FPEN; + if (!(cptr_el2 & CPTR_EL2_TZ)) + cpacr_el1 |= CPACR_ELx_ZEN; + + return cpacr_el1; +} + +static inline u64 translate_sctlr_el2_to_sctlr_el1(u64 val) +{ + /* Only preserve the minimal set of bits we support */ + val &= (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | SCTLR_ELx_SA | + SCTLR_ELx_I | SCTLR_ELx_IESB | SCTLR_ELx_WXN | SCTLR_ELx_EE); + val |= SCTLR_EL1_RES1; + + return val; +} + +static inline u64 translate_ttbr0_el2_to_ttbr0_el1(u64 ttbr0) +{ + /* Clear the ASID field */ + return ttbr0 & ~GENMASK_ULL(63, 48); +} + extern bool __check_nv_sr_forward(struct kvm_vcpu *vcpu); int kvm_init_nv_sysregs(struct kvm *kvm); -- cgit v1.2.3 From 60ce16cc122aad999129d23061fa35f63d5b1e9b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 21 Jun 2019 13:54:37 +0100 Subject: KVM: arm64: nv: Add include containing the VNCR_EL2 offsets VNCR_EL2 points to a page containing a number of system registers accessed by a guest hypervisor when ARMv8.4-NV is enabled. Let's document the offsets in that page, as we are going to use this layout. Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/vncr_mapping.h | 103 ++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 arch/arm64/include/asm/vncr_mapping.h (limited to 'arch') diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h new file mode 100644 index 000000000000..df2c47c55972 --- /dev/null +++ b/arch/arm64/include/asm/vncr_mapping.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * System register offsets in the VNCR page + * All offsets are *byte* displacements! 
+ */ + +#ifndef __ARM64_VNCR_MAPPING_H__ +#define __ARM64_VNCR_MAPPING_H__ + +#define VNCR_VTTBR_EL2 0x020 +#define VNCR_VTCR_EL2 0x040 +#define VNCR_VMPIDR_EL2 0x050 +#define VNCR_CNTVOFF_EL2 0x060 +#define VNCR_HCR_EL2 0x078 +#define VNCR_HSTR_EL2 0x080 +#define VNCR_VPIDR_EL2 0x088 +#define VNCR_TPIDR_EL2 0x090 +#define VNCR_HCRX_EL2 0x0A0 +#define VNCR_VNCR_EL2 0x0B0 +#define VNCR_CPACR_EL1 0x100 +#define VNCR_CONTEXTIDR_EL1 0x108 +#define VNCR_SCTLR_EL1 0x110 +#define VNCR_ACTLR_EL1 0x118 +#define VNCR_TCR_EL1 0x120 +#define VNCR_AFSR0_EL1 0x128 +#define VNCR_AFSR1_EL1 0x130 +#define VNCR_ESR_EL1 0x138 +#define VNCR_MAIR_EL1 0x140 +#define VNCR_AMAIR_EL1 0x148 +#define VNCR_MDSCR_EL1 0x158 +#define VNCR_SPSR_EL1 0x160 +#define VNCR_CNTV_CVAL_EL0 0x168 +#define VNCR_CNTV_CTL_EL0 0x170 +#define VNCR_CNTP_CVAL_EL0 0x178 +#define VNCR_CNTP_CTL_EL0 0x180 +#define VNCR_SCXTNUM_EL1 0x188 +#define VNCR_TFSR_EL1 0x190 +#define VNCR_HFGRTR_EL2 0x1B8 +#define VNCR_HFGWTR_EL2 0x1C0 +#define VNCR_HFGITR_EL2 0x1C8 +#define VNCR_HDFGRTR_EL2 0x1D0 +#define VNCR_HDFGWTR_EL2 0x1D8 +#define VNCR_ZCR_EL1 0x1E0 +#define VNCR_HAFGRTR_EL2 0x1E8 +#define VNCR_TTBR0_EL1 0x200 +#define VNCR_TTBR1_EL1 0x210 +#define VNCR_FAR_EL1 0x220 +#define VNCR_ELR_EL1 0x230 +#define VNCR_SP_EL1 0x240 +#define VNCR_VBAR_EL1 0x250 +#define VNCR_TCR2_EL1 0x270 +#define VNCR_PIRE0_EL1 0x290 +#define VNCR_PIRE0_EL2 0x298 +#define VNCR_PIR_EL1 0x2A0 +#define VNCR_ICH_LR0_EL2 0x400 +#define VNCR_ICH_LR1_EL2 0x408 +#define VNCR_ICH_LR2_EL2 0x410 +#define VNCR_ICH_LR3_EL2 0x418 +#define VNCR_ICH_LR4_EL2 0x420 +#define VNCR_ICH_LR5_EL2 0x428 +#define VNCR_ICH_LR6_EL2 0x430 +#define VNCR_ICH_LR7_EL2 0x438 +#define VNCR_ICH_LR8_EL2 0x440 +#define VNCR_ICH_LR9_EL2 0x448 +#define VNCR_ICH_LR10_EL2 0x450 +#define VNCR_ICH_LR11_EL2 0x458 +#define VNCR_ICH_LR12_EL2 0x460 +#define VNCR_ICH_LR13_EL2 0x468 +#define VNCR_ICH_LR14_EL2 0x470 +#define VNCR_ICH_LR15_EL2 0x478 +#define VNCR_ICH_AP0R0_EL2 0x480 +#define VNCR_ICH_AP0R1_EL2 0x488 +#define VNCR_ICH_AP0R2_EL2 0x490 +#define VNCR_ICH_AP0R3_EL2 0x498 +#define VNCR_ICH_AP1R0_EL2 0x4A0 +#define VNCR_ICH_AP1R1_EL2 0x4A8 +#define VNCR_ICH_AP1R2_EL2 0x4B0 +#define VNCR_ICH_AP1R3_EL2 0x4B8 +#define VNCR_ICH_HCR_EL2 0x4C0 +#define VNCR_ICH_VMCR_EL2 0x4C8 +#define VNCR_VDISR_EL2 0x500 +#define VNCR_PMBLIMITR_EL1 0x800 +#define VNCR_PMBPTR_EL1 0x810 +#define VNCR_PMBSR_EL1 0x820 +#define VNCR_PMSCR_EL1 0x828 +#define VNCR_PMSEVFR_EL1 0x830 +#define VNCR_PMSICR_EL1 0x838 +#define VNCR_PMSIRR_EL1 0x840 +#define VNCR_PMSLATFR_EL1 0x848 +#define VNCR_TRFCR_EL1 0x880 +#define VNCR_MPAM1_EL1 0x900 +#define VNCR_MPAMHCR_EL2 0x930 +#define VNCR_MPAMVPMV_EL2 0x938 +#define VNCR_MPAMVPM0_EL2 0x940 +#define VNCR_MPAMVPM1_EL2 0x948 +#define VNCR_MPAMVPM2_EL2 0x950 +#define VNCR_MPAMVPM3_EL2 0x958 +#define VNCR_MPAMVPM4_EL2 0x960 +#define VNCR_MPAMVPM5_EL2 0x968 +#define VNCR_MPAMVPM6_EL2 0x970 +#define VNCR_MPAMVPM7_EL2 0x978 + +#endif /* __ARM64_VNCR_MAPPING_H__ */ -- cgit v1.2.3 From 2733dd10701abc6ab23d65a732f58fbeb80bd203 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 6 Nov 2023 16:42:13 +0000 Subject: KVM: arm64: Introduce a bad_trap() primitive for unexpected trap handling In order to ease the debugging of NV, it is helpful to have the kernel shout at you when an unexpected trap is handled. We already have this in a couple of cases. Make this a more generic infrastructure that we will make use of very shortly. 
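For illustration, a handler that is never expected to fire can then be reduced to a single call (hypothetical example; the real callers are introduced in the following patch):

static bool bad_example_trap(struct kvm_vcpu *vcpu,
			     struct sys_reg_params *p,
			     const struct sys_reg_desc *r)
{
	/* WARN once, dump the decoded access and inject an UNDEF into the guest */
	return bad_trap(vcpu, p, r, "example of an unexpected trap");
}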
Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index cdfc79ccc7a1..3709c35666a2 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -45,24 +45,31 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg); static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val); -static bool read_from_write_only(struct kvm_vcpu *vcpu, - struct sys_reg_params *params, - const struct sys_reg_desc *r) +static bool bad_trap(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *r, + const char *msg) { - WARN_ONCE(1, "Unexpected sys_reg read to write-only register\n"); + WARN_ONCE(1, "Unexpected %s\n", msg); print_sys_reg_instr(params); kvm_inject_undefined(vcpu); return false; } +static bool read_from_write_only(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *r) +{ + return bad_trap(vcpu, params, r, + "sys_reg read to write-only register"); +} + static bool write_to_read_only(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) { - WARN_ONCE(1, "Unexpected sys_reg write to read-only register\n"); - print_sys_reg_instr(params); - kvm_inject_undefined(vcpu); - return false; + return bad_trap(vcpu, params, r, + "sys_reg write to read-only register"); } u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) -- cgit v1.2.3 From 9b9cce60be85e6807bdb0eaa2f520e78dbab0659 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 7 Nov 2023 09:02:10 +0000 Subject: KVM: arm64: nv: Add EL2_REG_VNCR()/EL2_REG_REDIR() sysreg helpers Add two helpers to deal with EL2 registers are are either redirected to the VNCR page, or that are redirected to their EL1 counterpart. In either cases, no trap is expected. THe relevant register descriptors are repainted accordingly. Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 65 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 3709c35666a2..7fef170ec67b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1890,6 +1890,32 @@ static unsigned int el2_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } +static bool bad_vncr_trap(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + /* + * We really shouldn't be here, and this is likely the result + * of a misconfigured trap, as this register should target the + * VNCR page, and nothing else. + */ + return bad_trap(vcpu, p, r, + "trap of VNCR-backed register"); +} + +static bool bad_redir_trap(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + /* + * We really shouldn't be here, and this is likely the result + * of a misconfigured trap, as this register should target the + * corresponding EL1, and nothing else. 
+ */ + return bad_trap(vcpu, p, r, + "trap of EL2 register redirected to EL1"); +} + #define EL2_REG(name, acc, rst, v) { \ SYS_DESC(SYS_##name), \ .access = acc, \ @@ -1899,6 +1925,9 @@ static unsigned int el2_visibility(const struct kvm_vcpu *vcpu, .val = v, \ } +#define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v) +#define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) + /* * EL{0,1}2 registers are the EL2 view on an EL0 or EL1 register when * HCR_EL2.E2H==1, and only in the sysreg table for convenience of @@ -2513,32 +2542,32 @@ static const struct sys_reg_desc sys_reg_descs[] = { { PMU_SYS_REG(PMCCFILTR_EL0), .access = access_pmu_evtyper, .reset = reset_val, .reg = PMCCFILTR_EL0, .val = 0 }, - EL2_REG(VPIDR_EL2, access_rw, reset_unknown, 0), - EL2_REG(VMPIDR_EL2, access_rw, reset_unknown, 0), + EL2_REG_VNCR(VPIDR_EL2, reset_unknown, 0), + EL2_REG_VNCR(VMPIDR_EL2, reset_unknown, 0), EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1), EL2_REG(ACTLR_EL2, access_rw, reset_val, 0), - EL2_REG(HCR_EL2, access_rw, reset_val, 0), + EL2_REG_VNCR(HCR_EL2, reset_val, 0), EL2_REG(MDCR_EL2, access_rw, reset_val, 0), EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1), - EL2_REG(HSTR_EL2, access_rw, reset_val, 0), - EL2_REG(HFGRTR_EL2, access_rw, reset_val, 0), - EL2_REG(HFGWTR_EL2, access_rw, reset_val, 0), - EL2_REG(HFGITR_EL2, access_rw, reset_val, 0), - EL2_REG(HACR_EL2, access_rw, reset_val, 0), + EL2_REG_VNCR(HSTR_EL2, reset_val, 0), + EL2_REG_VNCR(HFGRTR_EL2, reset_val, 0), + EL2_REG_VNCR(HFGWTR_EL2, reset_val, 0), + EL2_REG_VNCR(HFGITR_EL2, reset_val, 0), + EL2_REG_VNCR(HACR_EL2, reset_val, 0), - EL2_REG(HCRX_EL2, access_rw, reset_val, 0), + EL2_REG_VNCR(HCRX_EL2, reset_val, 0), EL2_REG(TTBR0_EL2, access_rw, reset_val, 0), EL2_REG(TTBR1_EL2, access_rw, reset_val, 0), EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1), - EL2_REG(VTTBR_EL2, access_rw, reset_val, 0), - EL2_REG(VTCR_EL2, access_rw, reset_val, 0), + EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), + EL2_REG_VNCR(VTCR_EL2, reset_val, 0), { SYS_DESC(SYS_DACR32_EL2), trap_undef, reset_unknown, DACR32_EL2 }, - EL2_REG(HDFGRTR_EL2, access_rw, reset_val, 0), - EL2_REG(HDFGWTR_EL2, access_rw, reset_val, 0), - EL2_REG(SPSR_EL2, access_rw, reset_val, 0), - EL2_REG(ELR_EL2, access_rw, reset_val, 0), + EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0), + EL2_REG_VNCR(HDFGWTR_EL2, reset_val, 0), + EL2_REG_REDIR(SPSR_EL2, reset_val, 0), + EL2_REG_REDIR(ELR_EL2, reset_val, 0), { SYS_DESC(SYS_SP_EL1), access_sp_el1}, /* AArch32 SPSR_* are RES0 if trapped from a NV guest */ @@ -2554,10 +2583,10 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_IFSR32_EL2), trap_undef, reset_unknown, IFSR32_EL2 }, EL2_REG(AFSR0_EL2, access_rw, reset_val, 0), EL2_REG(AFSR1_EL2, access_rw, reset_val, 0), - EL2_REG(ESR_EL2, access_rw, reset_val, 0), + EL2_REG_REDIR(ESR_EL2, reset_val, 0), { SYS_DESC(SYS_FPEXC32_EL2), trap_undef, reset_val, FPEXC32_EL2, 0x700 }, - EL2_REG(FAR_EL2, access_rw, reset_val, 0), + EL2_REG_REDIR(FAR_EL2, reset_val, 0), EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), EL2_REG(MAIR_EL2, access_rw, reset_val, 0), @@ -2570,7 +2599,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0), EL2_REG(TPIDR_EL2, access_rw, reset_val, 0), - EL2_REG(CNTVOFF_EL2, access_rw, reset_val, 0), + EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0), EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), EL12_REG(CNTKCTL, access_rw, reset_val, 0), -- cgit v1.2.3 From 
d8bd48e3f0ee9e1fdba2a2e453155a5354e48a8d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 26 Jun 2019 19:59:56 +0100 Subject: KVM: arm64: nv: Map VNCR-capable registers to a separate page With ARMv8.4-NV, registers that can be directly accessed in memory by the guest have to live at architected offsets in a special page. Let's annotate the sysreg enum to reflect the offset at which they are in this page, whith a little twist: If running on HW that doesn't have the ARMv8.4-NV feature, or even a VM that doesn't use NV, we store all the system registers in the usual sys_regs array. The only difference with the pre-8.4 situation is that VNCR-capable registers are at a "similar" offset as in the VNCR page (we can compute the actual offset at compile time), and that the sys_regs array is both bigger and sparse. Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 127 ++++++++++++++++++++++++-------------- 1 file changed, 81 insertions(+), 46 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index fce2e5f583a7..9e8cd2bb95c3 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -27,6 +27,7 @@ #include #include #include +#include #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -325,33 +326,33 @@ struct kvm_vcpu_fault_info { u64 disr_el1; /* Deferred [SError] Status Register */ }; +/* + * VNCR() just places the VNCR_capable registers in the enum after + * __VNCR_START__, and the value (after correction) to be an 8-byte offset + * from the VNCR base. As we don't require the enum to be otherwise ordered, + * we need the terrible hack below to ensure that we correctly size the + * sys_regs array, no matter what. + * + * The __MAX__ macro has been lifted from Sean Eron Anderson's wonderful + * treasure trove of bit hacks: + * https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + */ +#define __MAX__(x,y) ((x) ^ (((x) ^ (y)) & -((x) < (y)))) +#define VNCR(r) \ + __before_##r, \ + r = __VNCR_START__ + ((VNCR_ ## r) / 8), \ + __after_##r = __MAX__(__before_##r - 1, r) + enum vcpu_sysreg { __INVALID_SYSREG__, /* 0 is reserved as an invalid value */ MPIDR_EL1, /* MultiProcessor Affinity Register */ CLIDR_EL1, /* Cache Level ID Register */ CSSELR_EL1, /* Cache Size Selection Register */ - SCTLR_EL1, /* System Control Register */ - ACTLR_EL1, /* Auxiliary Control Register */ - CPACR_EL1, /* Coprocessor Access Control */ - ZCR_EL1, /* SVE Control */ - TTBR0_EL1, /* Translation Table Base Register 0 */ - TTBR1_EL1, /* Translation Table Base Register 1 */ - TCR_EL1, /* Translation Control Register */ - TCR2_EL1, /* Extended Translation Control Register */ - ESR_EL1, /* Exception Syndrome Register */ - AFSR0_EL1, /* Auxiliary Fault Status Register 0 */ - AFSR1_EL1, /* Auxiliary Fault Status Register 1 */ - FAR_EL1, /* Fault Address Register */ - MAIR_EL1, /* Memory Attribute Indirection Register */ - VBAR_EL1, /* Vector Base Address Register */ - CONTEXTIDR_EL1, /* Context ID Register */ TPIDR_EL0, /* Thread ID, User R/W */ TPIDRRO_EL0, /* Thread ID, User R/O */ TPIDR_EL1, /* Thread ID, Privileged */ - AMAIR_EL1, /* Aux Memory Attribute Indirection Register */ CNTKCTL_EL1, /* Timer Control Register (EL1) */ PAR_EL1, /* Physical Address Register */ - MDSCR_EL1, /* Monitor Debug System Control Register */ MDCCINT_EL1, /* Monitor Debug Comms Channel Interrupt Enable Reg */ OSLSR_EL1, /* OS Lock Status Register */ DISR_EL1, /* Deferred Interrupt Status 
Register */ @@ -382,26 +383,11 @@ enum vcpu_sysreg { APGAKEYLO_EL1, APGAKEYHI_EL1, - ELR_EL1, - SP_EL1, - SPSR_EL1, - - CNTVOFF_EL2, - CNTV_CVAL_EL0, - CNTV_CTL_EL0, - CNTP_CVAL_EL0, - CNTP_CTL_EL0, - /* Memory Tagging Extension registers */ RGSR_EL1, /* Random Allocation Tag Seed Register */ GCR_EL1, /* Tag Control Register */ - TFSR_EL1, /* Tag Fault Status Register (EL1) */ TFSRE0_EL1, /* Tag Fault Status Register (EL0) */ - /* Permission Indirection Extension registers */ - PIR_EL1, /* Permission Indirection Register 1 (EL1) */ - PIRE0_EL1, /* Permission Indirection Register 0 (EL1) */ - /* 32bit specific registers. */ DACR32_EL2, /* Domain Access Control Register */ IFSR32_EL2, /* Instruction Fault Status Register */ @@ -409,21 +395,14 @@ enum vcpu_sysreg { DBGVCR32_EL2, /* Debug Vector Catch Register */ /* EL2 registers */ - VPIDR_EL2, /* Virtualization Processor ID Register */ - VMPIDR_EL2, /* Virtualization Multiprocessor ID Register */ SCTLR_EL2, /* System Control Register (EL2) */ ACTLR_EL2, /* Auxiliary Control Register (EL2) */ - HCR_EL2, /* Hypervisor Configuration Register */ MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */ CPTR_EL2, /* Architectural Feature Trap Register (EL2) */ - HSTR_EL2, /* Hypervisor System Trap Register */ HACR_EL2, /* Hypervisor Auxiliary Control Register */ - HCRX_EL2, /* Extended Hypervisor Configuration Register */ TTBR0_EL2, /* Translation Table Base Register 0 (EL2) */ TTBR1_EL2, /* Translation Table Base Register 1 (EL2) */ TCR_EL2, /* Translation Control Register (EL2) */ - VTTBR_EL2, /* Virtualization Translation Table Base Register */ - VTCR_EL2, /* Virtualization Translation Control Register */ SPSR_EL2, /* EL2 saved program status register */ ELR_EL2, /* EL2 exception link register */ AFSR0_EL2, /* Auxiliary Fault Status Register 0 (EL2) */ @@ -436,19 +415,61 @@ enum vcpu_sysreg { VBAR_EL2, /* Vector Base Address Register (EL2) */ RVBAR_EL2, /* Reset Vector Base Address Register */ CONTEXTIDR_EL2, /* Context ID Register (EL2) */ - TPIDR_EL2, /* EL2 Software Thread ID Register */ CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */ SP_EL2, /* EL2 Stack Pointer */ - HFGRTR_EL2, - HFGWTR_EL2, - HFGITR_EL2, - HDFGRTR_EL2, - HDFGWTR_EL2, CNTHP_CTL_EL2, CNTHP_CVAL_EL2, CNTHV_CTL_EL2, CNTHV_CVAL_EL2, + __VNCR_START__, /* Any VNCR-capable reg goes after this point */ + + VNCR(SCTLR_EL1),/* System Control Register */ + VNCR(ACTLR_EL1),/* Auxiliary Control Register */ + VNCR(CPACR_EL1),/* Coprocessor Access Control */ + VNCR(ZCR_EL1), /* SVE Control */ + VNCR(TTBR0_EL1),/* Translation Table Base Register 0 */ + VNCR(TTBR1_EL1),/* Translation Table Base Register 1 */ + VNCR(TCR_EL1), /* Translation Control Register */ + VNCR(TCR2_EL1), /* Extended Translation Control Register */ + VNCR(ESR_EL1), /* Exception Syndrome Register */ + VNCR(AFSR0_EL1),/* Auxiliary Fault Status Register 0 */ + VNCR(AFSR1_EL1),/* Auxiliary Fault Status Register 1 */ + VNCR(FAR_EL1), /* Fault Address Register */ + VNCR(MAIR_EL1), /* Memory Attribute Indirection Register */ + VNCR(VBAR_EL1), /* Vector Base Address Register */ + VNCR(CONTEXTIDR_EL1), /* Context ID Register */ + VNCR(AMAIR_EL1),/* Aux Memory Attribute Indirection Register */ + VNCR(MDSCR_EL1),/* Monitor Debug System Control Register */ + VNCR(ELR_EL1), + VNCR(SP_EL1), + VNCR(SPSR_EL1), + VNCR(TFSR_EL1), /* Tag Fault Status Register (EL1) */ + VNCR(VPIDR_EL2),/* Virtualization Processor ID Register */ + VNCR(VMPIDR_EL2),/* Virtualization Multiprocessor ID Register */ + VNCR(HCR_EL2), /* 
Hypervisor Configuration Register */ + VNCR(HSTR_EL2), /* Hypervisor System Trap Register */ + VNCR(VTTBR_EL2),/* Virtualization Translation Table Base Register */ + VNCR(VTCR_EL2), /* Virtualization Translation Control Register */ + VNCR(TPIDR_EL2),/* EL2 Software Thread ID Register */ + VNCR(HCRX_EL2), /* Extended Hypervisor Configuration Register */ + + /* Permission Indirection Extension registers */ + VNCR(PIR_EL1), /* Permission Indirection Register 1 (EL1) */ + VNCR(PIRE0_EL1), /* Permission Indirection Register 0 (EL1) */ + + VNCR(HFGRTR_EL2), + VNCR(HFGWTR_EL2), + VNCR(HFGITR_EL2), + VNCR(HDFGRTR_EL2), + VNCR(HDFGWTR_EL2), + + VNCR(CNTVOFF_EL2), + VNCR(CNTV_CVAL_EL0), + VNCR(CNTV_CTL_EL0), + VNCR(CNTP_CVAL_EL0), + VNCR(CNTP_CTL_EL0), + NR_SYS_REGS /* Nothing after this line! */ }; @@ -465,6 +486,9 @@ struct kvm_cpu_context { u64 sys_regs[NR_SYS_REGS]; struct kvm_vcpu *__hyp_running_vcpu; + + /* This pointer has to be 4kB aligned. */ + u64 *vncr_array; }; struct kvm_host_data { @@ -827,8 +851,19 @@ struct kvm_vcpu_arch { * accessed by a running VCPU. For example, for userspace access or * for system registers that are never context switched, but only * emulated. + * + * Don't bother with VNCR-based accesses in the nVHE code, it has no + * business dealing with NV. */ -#define __ctxt_sys_reg(c,r) (&(c)->sys_regs[(r)]) +static inline u64 *__ctxt_sys_reg(const struct kvm_cpu_context *ctxt, int r) +{ +#if !defined (__KVM_NVHE_HYPERVISOR__) + if (unlikely(cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) && + r >= __VNCR_START__ && ctxt->vncr_array)) + return &ctxt->vncr_array[r - __VNCR_START__]; +#endif + return (u64 *)&ctxt->sys_regs[r]; +} #define ctxt_sys_reg(c,r) (*__ctxt_sys_reg(c,r)) -- cgit v1.2.3 From fedc612314acfebf506e071bf3a941076aa56d10 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 17 Dec 2022 13:28:40 +0000 Subject: KVM: arm64: nv: Handle virtual EL2 registers in vcpu_read/write_sys_reg() KVM internally uses accessor functions when reading or writing the guest's system registers. This takes care of accessing either the stored copy or using the "live" EL1 system registers when the host uses VHE. With the introduction of virtual EL2 we add a bunch of EL2 system registers, which now must also be taken care of: - If the guest is running in vEL2, and we access an EL1 sysreg, we must revert to the stored version of that, and not use the CPU's copy. - If the guest is running in vEL1, and we access an EL2 sysreg, we must also use the stored version, since the CPU carries the EL1 copy. - Some EL2 system registers are supposed to affect the current execution of the system, so we need to put them into their respective EL1 counterparts. For this we need to define a mapping between the two. - Some EL2 system registers have a different format than their EL1 counterpart, so we need to translate them before writing them to the CPU. This is done using an (optional) translate function in the map. All of these cases are now wrapped into the existing accessor functions, so KVM users wouldn't need to care whether they access EL2 or EL1 registers and also which state the guest is in. 
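A minimal usage sketch (hypothetical, not part of the patch): a caller can now manipulate an EL2 register without knowing whether the guest is at vEL2 with the value live in the CPU's EL1 counterpart, or at vEL1 with only the in-memory copy valid:

	/* Hypothetical example: mask IRQs in the guest hypervisor's saved PSTATE */
	u64 spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2);

	vcpu_write_sys_reg(vcpu, spsr | PSR_I_BIT, SPSR_EL2);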
Reviewed-by: Ganapatrao Kulkarni Reviewed-by: Alexandru Elisei Reviewed-by: Russell King (Oracle) Reviewed-by: Oliver Upton Co-developed-by: Andre Przywara Signed-off-by: Andre Przywara Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 + arch/arm64/kvm/sys_regs.c | 129 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 126 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 9e8cd2bb95c3..f17fb7c42973 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -907,6 +907,7 @@ static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) case AMAIR_EL1: *val = read_sysreg_s(SYS_AMAIR_EL12); break; case CNTKCTL_EL1: *val = read_sysreg_s(SYS_CNTKCTL_EL12); break; case ELR_EL1: *val = read_sysreg_s(SYS_ELR_EL12); break; + case SPSR_EL1: *val = read_sysreg_s(SYS_SPSR_EL12); break; case PAR_EL1: *val = read_sysreg_par(); break; case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break; case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break; @@ -951,6 +952,7 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break; case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break; case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break; + case SPSR_EL1: write_sysreg_s(val, SYS_SPSR_EL12); break; case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break; case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break; case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 7fef170ec67b..3198542dcded 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -72,24 +72,143 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu, "sys_reg write to read-only register"); } +#define PURE_EL2_SYSREG(el2) \ + case el2: { \ + *el1r = el2; \ + return true; \ + } + +#define MAPPED_EL2_SYSREG(el2, el1, fn) \ + case el2: { \ + *xlate = fn; \ + *el1r = el1; \ + return true; \ + } + +static bool get_el2_to_el1_mapping(unsigned int reg, + unsigned int *el1r, u64 (**xlate)(u64)) +{ + switch (reg) { + PURE_EL2_SYSREG( VPIDR_EL2 ); + PURE_EL2_SYSREG( VMPIDR_EL2 ); + PURE_EL2_SYSREG( ACTLR_EL2 ); + PURE_EL2_SYSREG( HCR_EL2 ); + PURE_EL2_SYSREG( MDCR_EL2 ); + PURE_EL2_SYSREG( HSTR_EL2 ); + PURE_EL2_SYSREG( HACR_EL2 ); + PURE_EL2_SYSREG( VTTBR_EL2 ); + PURE_EL2_SYSREG( VTCR_EL2 ); + PURE_EL2_SYSREG( RVBAR_EL2 ); + PURE_EL2_SYSREG( TPIDR_EL2 ); + PURE_EL2_SYSREG( HPFAR_EL2 ); + PURE_EL2_SYSREG( CNTHCTL_EL2 ); + MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1, + translate_sctlr_el2_to_sctlr_el1 ); + MAPPED_EL2_SYSREG(CPTR_EL2, CPACR_EL1, + translate_cptr_el2_to_cpacr_el1 ); + MAPPED_EL2_SYSREG(TTBR0_EL2, TTBR0_EL1, + translate_ttbr0_el2_to_ttbr0_el1 ); + MAPPED_EL2_SYSREG(TTBR1_EL2, TTBR1_EL1, NULL ); + MAPPED_EL2_SYSREG(TCR_EL2, TCR_EL1, + translate_tcr_el2_to_tcr_el1 ); + MAPPED_EL2_SYSREG(VBAR_EL2, VBAR_EL1, NULL ); + MAPPED_EL2_SYSREG(AFSR0_EL2, AFSR0_EL1, NULL ); + MAPPED_EL2_SYSREG(AFSR1_EL2, AFSR1_EL1, NULL ); + MAPPED_EL2_SYSREG(ESR_EL2, ESR_EL1, NULL ); + MAPPED_EL2_SYSREG(FAR_EL2, FAR_EL1, NULL ); + MAPPED_EL2_SYSREG(MAIR_EL2, MAIR_EL1, NULL ); + MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL ); + MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL ); + MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL ); + default: + return false; + } +} + u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) { u64 val = 0x8badf00d8badf00d; + u64 
(*xlate)(u64) = NULL; + unsigned int el1r; + + if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) + goto memory_read; - if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) && - __vcpu_read_sys_reg_from_cpu(reg, &val)) + if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) { + if (!is_hyp_ctxt(vcpu)) + goto memory_read; + + /* + * If this register does not have an EL1 counterpart, + * then read the stored EL2 version. + */ + if (reg == el1r) + goto memory_read; + + /* + * If we have a non-VHE guest and that the sysreg + * requires translation to be used at EL1, use the + * in-memory copy instead. + */ + if (!vcpu_el2_e2h_is_set(vcpu) && xlate) + goto memory_read; + + /* Get the current version of the EL1 counterpart. */ + WARN_ON(!__vcpu_read_sys_reg_from_cpu(el1r, &val)); + return val; + } + + /* EL1 register can't be on the CPU if the guest is in vEL2. */ + if (unlikely(is_hyp_ctxt(vcpu))) + goto memory_read; + + if (__vcpu_read_sys_reg_from_cpu(reg, &val)) return val; +memory_read: return __vcpu_sys_reg(vcpu, reg); } void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) { - if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) && - __vcpu_write_sys_reg_to_cpu(val, reg)) + u64 (*xlate)(u64) = NULL; + unsigned int el1r; + + if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) + goto memory_write; + + if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) { + if (!is_hyp_ctxt(vcpu)) + goto memory_write; + + /* + * Always store a copy of the write to memory to avoid having + * to reverse-translate virtual EL2 system registers for a + * non-VHE guest hypervisor. + */ + __vcpu_sys_reg(vcpu, reg) = val; + + /* No EL1 counterpart? We're done here.? */ + if (reg == el1r) + return; + + if (!vcpu_el2_e2h_is_set(vcpu) && xlate) + val = xlate(val); + + /* Redirect this to the EL1 version of the register. */ + WARN_ON(!__vcpu_write_sys_reg_to_cpu(val, el1r)); + return; + } + + /* EL1 register can't be on the CPU if the guest is in vEL2. */ + if (unlikely(is_hyp_ctxt(vcpu))) + goto memory_write; + + if (__vcpu_write_sys_reg_to_cpu(val, reg)) return; - __vcpu_sys_reg(vcpu, reg) = val; +memory_write: + __vcpu_sys_reg(vcpu, reg) = val; } /* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */ -- cgit v1.2.3 From 7b95382f965133ef61ce44aaabc518c16eb46909 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 17 Dec 2023 11:15:09 +0000 Subject: KVM: arm64: vgic-v4: Restore pending state on host userspace write When the VMM writes to ISPENDR0 to set the state pending state of an SGI, we fail to convey this to the HW if this SGI is already backed by a GICv4.1 vSGI. This is a bit of a corner case, as this would only occur if the vgic state is changed on an already running VM, but this can apparently happen across a guest reset driven by the VMM. Fix this by always writing out the pending_latch value to the HW, and reseting it to false. 
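Stripped of the surrounding refactoring, the core of the fix in the hunk below is that a HW-backed vSGI now has its pending latch pushed into the GIC rather than kept purely in software (excerpt shown for illustration):

	if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
		/* Forward the requested pending state to the GICv4.1 vSGI... */
		irq_set_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING,
				      irq->pending_latch);
		/* ...and let the HW own it from here on */
		irq->pending_latch = false;
	}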
Reported-by: Kunkun Jiang Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Cc: stable@vger.kernel.org # 5.10+ Link: https://lore.kernel.org/r/7e7f2c0c-448b-10a9-8929-4b8f4f6e2a32@huawei.com --- arch/arm64/kvm/vgic/vgic-mmio-v3.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index 89117ba2528a..111bd7f42729 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -365,19 +365,26 @@ static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); - if (test_bit(i, &val)) { - /* - * pending_latch is set irrespective of irq type - * (level or edge) to avoid dependency that VM should - * restore irq config before pending info. - */ - irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - } else { + + /* + * pending_latch is set irrespective of irq type + * (level or edge) to avoid dependency that VM should + * restore irq config before pending info. + */ + irq->pending_latch = test_bit(i, &val); + + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + irq->pending_latch); irq->pending_latch = false; - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); } + if (irq->pending_latch) + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + else + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); } -- cgit v1.2.3 From 13886f34444596e6eca124677cd8362a941b585b Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 19 Dec 2023 06:58:53 +0000 Subject: KVM: arm64: vgic: Use common accessor for writes to ISPENDR Perhaps unsurprisingly, there is a considerable amount of duplicate code between the MMIO and user accessors for ISPENDR. At the same time there are some important differences between user and guest MMIO, like how SGIs can only be made pending from userspace. Fold user and MMIO accessors into a common helper, maintaining the distinction between the two. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231219065855.1019608-2-oliver.upton@linux.dev --- arch/arm64/kvm/vgic/vgic-mmio.c | 50 +++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index ff558c05e990..273912083056 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -301,9 +301,8 @@ static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq) vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2); } -void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) +static void __set_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, + unsigned long val, bool is_user) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; @@ -312,14 +311,22 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - /* GICD_ISPENDR0 SGI bits are WI */ - if (is_vgic_v2_sgi(vcpu, irq)) { + /* GICD_ISPENDR0 SGI bits are WI when written from the guest. 
*/ + if (is_vgic_v2_sgi(vcpu, irq) && !is_user) { vgic_put_irq(vcpu->kvm, irq); continue; } raw_spin_lock_irqsave(&irq->irq_lock, flags); + /* + * GICv2 SGIs are terribly broken. We can't restore + * the source of the interrupt, so just pick the vcpu + * itself as the source... + */ + if (is_vgic_v2_sgi(vcpu, irq)) + irq->source |= BIT(vcpu->vcpu_id); + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { /* HW SGI? Ask the GIC to inject it */ int err; @@ -335,7 +342,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, } irq->pending_latch = true; - if (irq->hw) + if (irq->hw && !is_user) vgic_irq_set_phys_active(irq, true); vgic_queue_irq_unlock(vcpu->kvm, irq, flags); @@ -343,33 +350,18 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, } } +void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + __set_pending(vcpu, addr, len, val, false); +} + int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = true; - - /* - * GICv2 SGIs are terribly broken. We can't restore - * the source of the interrupt, so just pick the vcpu - * itself as the source... - */ - if (is_vgic_v2_sgi(vcpu, irq)) - irq->source |= BIT(vcpu->vcpu_id); - - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - - vgic_put_irq(vcpu->kvm, irq); - } - + __set_pending(vcpu, addr, len, val, true); return 0; } -- cgit v1.2.3 From 561851424d93e91083df4071781b68dc4ba1fc5a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 19 Dec 2023 06:58:54 +0000 Subject: KVM: arm64: vgic: Use common accessor for writes to ICPENDR Fold MMIO and user accessors into a common helper while maintaining the distinction between the two. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231219065855.1019608-3-oliver.upton@linux.dev --- arch/arm64/kvm/vgic/vgic-mmio.c | 51 ++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index 273912083056..cf76523a2194 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -386,9 +386,9 @@ static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq) vgic_irq_set_phys_active(irq, false); } -void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) +static void __clear_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val, bool is_user) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; @@ -397,14 +397,22 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - /* GICD_ICPENDR0 SGI bits are WI */ - if (is_vgic_v2_sgi(vcpu, irq)) { + /* GICD_ICPENDR0 SGI bits are WI when written from the guest. */ + if (is_vgic_v2_sgi(vcpu, irq) && !is_user) { vgic_put_irq(vcpu->kvm, irq); continue; } raw_spin_lock_irqsave(&irq->irq_lock, flags); + /* + * More fun with GICv2 SGIs! If we're clearing one of them + * from userspace, which source vcpu to clear? Let's not + * even think of it, and blow the whole set. 
+ */ + if (is_vgic_v2_sgi(vcpu, irq)) + irq->source = 0; + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { /* HW SGI? Ask the GIC to clear its pending bit */ int err; @@ -419,7 +427,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, continue; } - if (irq->hw) + if (irq->hw && !is_user) vgic_hw_irq_cpending(vcpu, irq); else irq->pending_latch = false; @@ -429,33 +437,18 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, } } +void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + __clear_pending(vcpu, addr, len, val, false); +} + int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for_each_set_bit(i, &val, len * 8) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - /* - * More fun with GICv2 SGIs! If we're clearing one of them - * from userspace, which source vcpu to clear? Let's not - * even think of it, and blow the whole set. - */ - if (is_vgic_v2_sgi(vcpu, irq)) - irq->source = 0; - - irq->pending_latch = false; - - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - - vgic_put_irq(vcpu->kvm, irq); - } - + __clear_pending(vcpu, addr, len, val, true); return 0; } -- cgit v1.2.3 From 39084ba8d0fceb477a264e2bb8dfd3553876b84c Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 19 Dec 2023 06:58:55 +0000 Subject: KVM: arm64: vgic-v3: Reinterpret user ISPENDR writes as I{C,S}PENDR User writes to ISPENDR for GICv3 are treated specially, as zeroes actually clear the pending state for interrupts (unlike HW). Reimplement it using the ISPENDR and ICPENDR user accessors. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231219065855.1019608-4-oliver.upton@linux.dev --- arch/arm64/kvm/vgic/vgic-mmio-v3.c | 35 +++++------------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index 111bd7f42729..2962ccd8013a 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -357,38 +357,13 @@ static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { - u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - int i; - unsigned long flags; - - for (i = 0; i < len * 8; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - /* - * pending_latch is set irrespective of irq type - * (level or edge) to avoid dependency that VM should - * restore irq config before pending info. - */ - irq->pending_latch = test_bit(i, &val); - - if (irq->hw && vgic_irq_is_sgi(irq->intid)) { - irq_set_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - irq->pending_latch); - irq->pending_latch = false; - } - - if (irq->pending_latch) - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - else - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + int ret; - vgic_put_irq(vcpu->kvm, irq); - } + ret = vgic_uaccess_write_spending(vcpu, addr, len, val); + if (ret) + return ret; - return 0; + return vgic_uaccess_write_cpending(vcpu, addr, len, ~val); } /* We want to avoid outer shareable. 
*/ -- cgit v1.2.3 From 2731d605d5478052a10ac5a7c80f7aa7e1788cc5 Mon Sep 17 00:00:00 2001 From: Nina Schoetterl-Glausch Date: Tue, 19 Dec 2023 15:08:50 +0100 Subject: KVM: s390: vsie: Fix STFLE interpretive execution identification STFLE can be interpretively executed. This occurs when the facility list designation is unequal to zero. Perform the check before applying the address mask instead of after. Fixes: 66b630d5b7f2 ("KVM: s390: vsie: support STFLE interpretation") Reviewed-by: Claudio Imbrenda Acked-by: David Hildenbrand Signed-off-by: Nina Schoetterl-Glausch Reviewed-by: Christian Borntraeger Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20231219140854.1042599-2-nsg@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20231219140854.1042599-2-nsg@linux.ibm.com> --- arch/s390/kvm/vsie.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 02dcbe82a8e5..3cf95bc0401d 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -988,10 +988,15 @@ static void retry_vsie_icpt(struct vsie_page *vsie_page) static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; - __u32 fac = READ_ONCE(vsie_page->scb_o->fac) & 0x7ffffff8U; + __u32 fac = READ_ONCE(vsie_page->scb_o->fac); if (fac && test_kvm_facility(vcpu->kvm, 7)) { retry_vsie_icpt(vsie_page); + /* + * The facility list origin (FLO) is in bits 1 - 28 of the FLD + * so we need to mask here before reading. + */ + fac = fac & 0x7ffffff8U; if (read_guest_real(vcpu, fac, &vsie_page->fac, sizeof(vsie_page->fac))) return set_validity_icpt(scb_s, 0x1090U); -- cgit v1.2.3 From 682dbf430d27bc0e23d8d6921116b4f77f5dc9c6 Mon Sep 17 00:00:00 2001 From: Nina Schoetterl-Glausch Date: Tue, 19 Dec 2023 15:08:51 +0100 Subject: KVM: s390: vsie: Fix length of facility list shadowed The length of the facility list accessed when interpretively executing STFLE is the same as the hosts facility list (in case of format-0) The memory following the facility list doesn't need to be accessible. The current VSIE implementation accesses a fixed length that exceeds the guest/host facility list length and can therefore wrongly inject a validity intercept. Instead, find out the host facility list length by running STFLE and copy only as much as necessary when shadowing. 
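As a rough usage sketch (every name other than stfle_size() is invented), the helper introduced below lets callers size their copies by the machine's real facility-list length instead of a fixed buffer size:

	/* Length of the facility list in double words, as reported by STFLE */
	unsigned int dwords = stfle_size();

	memcpy(shadow_fac_list, host_fac_list, dwords * sizeof(u64));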
Acked-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Acked-by: Heiko Carstens Signed-off-by: Nina Schoetterl-Glausch Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20231219140854.1042599-3-nsg@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20231219140854.1042599-3-nsg@linux.ibm.com> --- arch/s390/include/asm/facility.h | 6 ++++++ arch/s390/kernel/Makefile | 2 +- arch/s390/kernel/facility.c | 21 +++++++++++++++++++++ arch/s390/kvm/vsie.c | 12 +++++++++++- 4 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 arch/s390/kernel/facility.c (limited to 'arch') diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h index 94b6919026df..796007125dff 100644 --- a/arch/s390/include/asm/facility.h +++ b/arch/s390/include/asm/facility.h @@ -111,4 +111,10 @@ static inline void stfle(u64 *stfle_fac_list, int size) preempt_enable(); } +/** + * stfle_size - Actual size of the facility list as specified by stfle + * (number of double words) + */ +unsigned int stfle_size(void); + #endif /* __ASM_FACILITY_H */ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 353def93973b..7a562b4199c8 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -41,7 +41,7 @@ obj-y += sysinfo.o lgr.o os_info.o ctlreg.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o kdebugfs.o alternative.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o -obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o +obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o extra-y += vmlinux.lds diff --git a/arch/s390/kernel/facility.c b/arch/s390/kernel/facility.c new file mode 100644 index 000000000000..f02127219a27 --- /dev/null +++ b/arch/s390/kernel/facility.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2023 + */ + +#include + +unsigned int stfle_size(void) +{ + static unsigned int size; + unsigned int r; + u64 dummy; + + r = READ_ONCE(size); + if (!r) { + r = __stfle_asm(&dummy, 1) + 1; + WRITE_ONCE(size, r); + } + return r; +} +EXPORT_SYMBOL(stfle_size); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 3cf95bc0401d..aa8f4ab11e33 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" @@ -990,6 +991,10 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; __u32 fac = READ_ONCE(vsie_page->scb_o->fac); + /* + * Alternate-STFLE-Interpretive-Execution facilities are not supported + * -> format-0 flcb + */ if (fac && test_kvm_facility(vcpu->kvm, 7)) { retry_vsie_icpt(vsie_page); /* @@ -997,8 +1002,13 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * so we need to mask here before reading. */ fac = fac & 0x7ffffff8U; + /* + * format-0 -> size of nested guest's facility list == guest's size + * guest's size == host's size, since STFLE is interpretatively executed + * using a format-0 for the guest, too. 
+ */ if (read_guest_real(vcpu, fac, &vsie_page->fac, - sizeof(vsie_page->fac))) + stfle_size() * sizeof(u64))) return set_validity_icpt(scb_s, 0x1090U); scb_s->fac = (__u32)(__u64) &vsie_page->fac; } -- cgit v1.2.3 From 10f7b1dcdfe05efcd26e90e337daf1bfd8f4a6da Mon Sep 17 00:00:00 2001 From: Nina Schoetterl-Glausch Date: Tue, 19 Dec 2023 15:08:52 +0100 Subject: KVM: s390: cpu model: Use proper define for facility mask size Use the previously unused S390_ARCH_FAC_MASK_SIZE_U64 instead of S390_ARCH_FAC_LIST_SIZE_U64 for defining the fac_mask array. Note that both values are the same, there is no functional change. Reviewed-by: Claudio Imbrenda Reviewed-by: David Hildenbrand Reviewed-by: Janosch Frank Signed-off-by: Nina Schoetterl-Glausch Link: https://lore.kernel.org/r/20231219140854.1042599-4-nsg@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20231219140854.1042599-4-nsg@linux.ibm.com> --- arch/s390/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 67a298b6cf6e..52664105a473 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -818,7 +818,7 @@ struct s390_io_adapter { struct kvm_s390_cpu_model { /* facility mask supported by kvm & hosting machine */ - __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64]; + __u64 fac_mask[S390_ARCH_FAC_MASK_SIZE_U64]; struct kvm_s390_vm_cpu_subfunc subfuncs; /* facility list requested by guest (in dma page) */ __u64 *fac_list; -- cgit v1.2.3 From 683c5bbbf6aea247bc95a7eb9fdfba4fcc8c909a Mon Sep 17 00:00:00 2001 From: Clément Léger Date: Tue, 24 Oct 2023 15:26:54 +0200 Subject: riscv: kvm: Use SYM_*() assembly macros instead of deprecated ones MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ENTRY()/END()/WEAK() macros are deprecated and we should make use of the new SYM_*() macros [1] for better annotation of symbols. Replace the deprecated ones with the new ones and fix wrong usage of END()/ENDPROC() to correctly describe the symbols. [1] https://docs.kernel.org/core-api/asm-annotations.html Signed-off-by: Clément Léger Reviewed-by: Andrew Jones Acked-by: Palmer Dabbelt Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_switch.S | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_switch.S b/arch/riscv/kvm/vcpu_switch.S index d74df8eb4d71..8b18473780ac 100644 --- a/arch/riscv/kvm/vcpu_switch.S +++ b/arch/riscv/kvm/vcpu_switch.S @@ -15,7 +15,7 @@ .altmacro .option norelax -ENTRY(__kvm_riscv_switch_to) +SYM_FUNC_START(__kvm_riscv_switch_to) /* Save Host GPRs (except A0 and T0-T6) */ REG_S ra, (KVM_ARCH_HOST_RA)(a0) REG_S sp, (KVM_ARCH_HOST_SP)(a0) @@ -208,9 +208,9 @@ __kvm_switch_return: /* Return to C code */ ret -ENDPROC(__kvm_riscv_switch_to) +SYM_FUNC_END(__kvm_riscv_switch_to) -ENTRY(__kvm_riscv_unpriv_trap) +SYM_CODE_START(__kvm_riscv_unpriv_trap) /* * We assume that faulting unpriv load/store instruction is * 4-byte long and blindly increment SEPC by 4. 
@@ -231,12 +231,10 @@ ENTRY(__kvm_riscv_unpriv_trap) csrr a1, CSR_HTINST REG_S a1, (KVM_ARCH_TRAP_HTINST)(a0) sret -ENDPROC(__kvm_riscv_unpriv_trap) +SYM_CODE_END(__kvm_riscv_unpriv_trap) #ifdef CONFIG_FPU - .align 3 - .global __kvm_riscv_fp_f_save -__kvm_riscv_fp_f_save: +SYM_FUNC_START(__kvm_riscv_fp_f_save) csrr t2, CSR_SSTATUS li t1, SR_FS csrs CSR_SSTATUS, t1 @@ -276,10 +274,9 @@ __kvm_riscv_fp_f_save: sw t0, KVM_ARCH_FP_F_FCSR(a0) csrw CSR_SSTATUS, t2 ret +SYM_FUNC_END(__kvm_riscv_fp_f_save) - .align 3 - .global __kvm_riscv_fp_d_save -__kvm_riscv_fp_d_save: +SYM_FUNC_START(__kvm_riscv_fp_d_save) csrr t2, CSR_SSTATUS li t1, SR_FS csrs CSR_SSTATUS, t1 @@ -319,10 +316,9 @@ __kvm_riscv_fp_d_save: sw t0, KVM_ARCH_FP_D_FCSR(a0) csrw CSR_SSTATUS, t2 ret +SYM_FUNC_END(__kvm_riscv_fp_d_save) - .align 3 - .global __kvm_riscv_fp_f_restore -__kvm_riscv_fp_f_restore: +SYM_FUNC_START(__kvm_riscv_fp_f_restore) csrr t2, CSR_SSTATUS li t1, SR_FS lw t0, KVM_ARCH_FP_F_FCSR(a0) @@ -362,10 +358,9 @@ __kvm_riscv_fp_f_restore: fscsr t0 csrw CSR_SSTATUS, t2 ret +SYM_FUNC_END(__kvm_riscv_fp_f_restore) - .align 3 - .global __kvm_riscv_fp_d_restore -__kvm_riscv_fp_d_restore: +SYM_FUNC_START(__kvm_riscv_fp_d_restore) csrr t2, CSR_SSTATUS li t1, SR_FS lw t0, KVM_ARCH_FP_D_FCSR(a0) @@ -405,4 +400,5 @@ __kvm_riscv_fp_d_restore: fscsr t0 csrw CSR_SSTATUS, t2 ret +SYM_FUNC_END(__kvm_riscv_fp_d_restore) #endif -- cgit v1.2.3 From e5ff012743cbc3cf13d2243aaaf032a2ca4d0791 Mon Sep 17 00:00:00 2001 From: Clément Léger Date: Tue, 24 Oct 2023 15:26:55 +0200 Subject: riscv: kvm: use ".L" local labels in assembly when applicable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For the sake of coherency, use local labels in assembly when applicable. This also avoid kprobes being confused when applying a kprobe since the size of function is computed by checking where the next visible symbol is located. This might end up in computing some function size to be way shorter than expected and thus failing to apply kprobes to the specified offset. Signed-off-by: Clément Léger Reviewed-by: Andrew Jones Acked-by: Palmer Dabbelt Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_switch.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_switch.S b/arch/riscv/kvm/vcpu_switch.S index 8b18473780ac..0c26189aa01c 100644 --- a/arch/riscv/kvm/vcpu_switch.S +++ b/arch/riscv/kvm/vcpu_switch.S @@ -45,7 +45,7 @@ SYM_FUNC_START(__kvm_riscv_switch_to) REG_L t0, (KVM_ARCH_GUEST_SSTATUS)(a0) REG_L t1, (KVM_ARCH_GUEST_HSTATUS)(a0) REG_L t2, (KVM_ARCH_GUEST_SCOUNTEREN)(a0) - la t4, __kvm_switch_return + la t4, .Lkvm_switch_return REG_L t5, (KVM_ARCH_GUEST_SEPC)(a0) /* Save Host and Restore Guest SSTATUS */ @@ -113,7 +113,7 @@ SYM_FUNC_START(__kvm_riscv_switch_to) /* Back to Host */ .align 2 -__kvm_switch_return: +.Lkvm_switch_return: /* Swap Guest A0 with SSCRATCH */ csrrw a0, CSR_SSCRATCH, a0 -- cgit v1.2.3 From bcd08e9bae57b5585e438b7fa58aba4b145a59cf Mon Sep 17 00:00:00 2001 From: Chao Du Date: Mon, 11 Dec 2023 09:40:14 +0000 Subject: RISC-V: KVM: remove a redundant condition in kvm_arch_vcpu_ioctl_run() The latest ret value is updated by kvm_riscv_vcpu_aia_update(), the loop will continue if the ret is less than or equal to zero. So the later condition will never hit. Thus remove it. 
Signed-off-by: Chao Du Reviewed-by: Daniel Henrique Barboza Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index e087c809073c..bf3952d1a621 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -757,8 +757,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* Update HVIP CSR for current CPU */ kvm_riscv_update_hvip(vcpu); - if (ret <= 0 || - kvm_riscv_gstage_vmid_ver_changed(&vcpu->kvm->arch.vmid) || + if (kvm_riscv_gstage_vmid_ver_changed(&vcpu->kvm->arch.vmid) || kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending()) { vcpu->mode = OUTSIDE_GUEST_MODE; -- cgit v1.2.3 From 7f58de96aa5e871dd553499e2c84fc801658eab6 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 13 Dec 2023 18:09:53 +0100 Subject: RISC-V: KVM: Don't add SBI multi regs in get-reg-list The multi regs are derived from the single registers. Only list the single registers in get-reg-list. This also makes the SBI extension register listing consistent with the ISA extension register listing. Signed-off-by: Andrew Jones Reviewed-by: Haibo Xu Reviewed-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_onereg.c | 36 ++---------------------------------- 1 file changed, 2 insertions(+), 34 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index f8c9fa0c03c5..f9bfa8a5db21 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -933,20 +933,12 @@ static inline unsigned long num_isa_ext_regs(const struct kvm_vcpu *vcpu) static inline unsigned long num_sbi_ext_regs(void) { - /* - * number of KVM_REG_RISCV_SBI_SINGLE + - * 2 x (number of KVM_REG_RISCV_SBI_MULTI) - */ - return KVM_RISCV_SBI_EXT_MAX + 2*(KVM_REG_RISCV_SBI_MULTI_REG_LAST+1); + return KVM_RISCV_SBI_EXT_MAX; } static int copy_sbi_ext_reg_indices(u64 __user *uindices) { - int n; - - /* copy KVM_REG_RISCV_SBI_SINGLE */ - n = KVM_RISCV_SBI_EXT_MAX; - for (int i = 0; i < n; i++) { + for (int i = 0; i < KVM_RISCV_SBI_EXT_MAX; i++) { u64 size = IS_ENABLED(CONFIG_32BIT) ? KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_SBI_EXT | @@ -959,30 +951,6 @@ static int copy_sbi_ext_reg_indices(u64 __user *uindices) } } - /* copy KVM_REG_RISCV_SBI_MULTI */ - n = KVM_REG_RISCV_SBI_MULTI_REG_LAST + 1; - for (int i = 0; i < n; i++) { - u64 size = IS_ENABLED(CONFIG_32BIT) ? - KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; - u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_SBI_EXT | - KVM_REG_RISCV_SBI_MULTI_EN | i; - - if (uindices) { - if (put_user(reg, uindices)) - return -EFAULT; - uindices++; - } - - reg = KVM_REG_RISCV | size | KVM_REG_RISCV_SBI_EXT | - KVM_REG_RISCV_SBI_MULTI_DIS | i; - - if (uindices) { - if (put_user(reg, uindices)) - return -EFAULT; - uindices++; - } - } - return num_sbi_ext_regs(); } -- cgit v1.2.3 From 23e1dc45022eb65529aa30b1851a8d21a639c8f5 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 13 Dec 2023 18:09:55 +0100 Subject: RISC-V: KVM: Make SBI uapi consistent with ISA uapi When an SBI extension cannot be enabled, that's a distinct state vs. enabled and disabled. Modify enum kvm_riscv_sbi_ext_status to accommodate it, which allows KVM userspace to tell the difference in state too, as the SBI extension register will disappear when it cannot be enabled, i.e. accesses to it return ENOENT. 
get-reg-list is updated as well to only add SBI extension registers to the list which may be enabled. Returning ENOENT for SBI extension registers which cannot be enabled makes them consistent with ISA extension registers. Any SBI extensions which were enabled by default are still enabled by default, if they can be enabled at all. Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_vcpu_sbi.h | 10 +++-- arch/riscv/kvm/vcpu_onereg.c | 23 +++++++---- arch/riscv/kvm/vcpu_sbi.c | 75 ++++++++++++++++++++--------------- arch/riscv/kvm/vcpu_sbi_replace.c | 2 +- 4 files changed, 65 insertions(+), 45 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h index 6a453f7f8b56..bffda0ac59b6 100644 --- a/arch/riscv/include/asm/kvm_vcpu_sbi.h +++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h @@ -15,9 +15,10 @@ #define KVM_SBI_VERSION_MINOR 0 enum kvm_riscv_sbi_ext_status { - KVM_RISCV_SBI_EXT_UNINITIALIZED, - KVM_RISCV_SBI_EXT_AVAILABLE, - KVM_RISCV_SBI_EXT_UNAVAILABLE, + KVM_RISCV_SBI_EXT_STATUS_UNINITIALIZED, + KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE, + KVM_RISCV_SBI_EXT_STATUS_ENABLED, + KVM_RISCV_SBI_EXT_STATUS_DISABLED, }; struct kvm_vcpu_sbi_context { @@ -36,7 +37,7 @@ struct kvm_vcpu_sbi_extension { unsigned long extid_start; unsigned long extid_end; - bool default_unavail; + bool default_disabled; /** * SBI extension handler. It can be defined for a given extension or group of @@ -61,6 +62,7 @@ int kvm_riscv_vcpu_get_reg_sbi_ext(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext( struct kvm_vcpu *vcpu, unsigned long extid); +bool riscv_vcpu_supports_sbi_ext(struct kvm_vcpu *vcpu, int idx); int kvm_riscv_vcpu_sbi_ecall(struct kvm_vcpu *vcpu, struct kvm_run *run); void kvm_riscv_vcpu_sbi_init(struct kvm_vcpu *vcpu); diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index f9bfa8a5db21..48262be73aa0 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -931,27 +931,34 @@ static inline unsigned long num_isa_ext_regs(const struct kvm_vcpu *vcpu) return copy_isa_ext_reg_indices(vcpu, NULL);; } -static inline unsigned long num_sbi_ext_regs(void) +static int copy_sbi_ext_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { - return KVM_RISCV_SBI_EXT_MAX; -} + unsigned int n = 0; -static int copy_sbi_ext_reg_indices(u64 __user *uindices) -{ for (int i = 0; i < KVM_RISCV_SBI_EXT_MAX; i++) { u64 size = IS_ENABLED(CONFIG_32BIT) ? 
KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | i; + if (!riscv_vcpu_supports_sbi_ext(vcpu, i)) + continue; + if (uindices) { if (put_user(reg, uindices)) return -EFAULT; uindices++; } + + n++; } - return num_sbi_ext_regs(); + return n; +} + +static unsigned long num_sbi_ext_regs(struct kvm_vcpu *vcpu) +{ + return copy_sbi_ext_reg_indices(vcpu, NULL); } /* @@ -970,7 +977,7 @@ unsigned long kvm_riscv_vcpu_num_regs(struct kvm_vcpu *vcpu) res += num_fp_f_regs(vcpu); res += num_fp_d_regs(vcpu); res += num_isa_ext_regs(vcpu); - res += num_sbi_ext_regs(); + res += num_sbi_ext_regs(vcpu); return res; } @@ -1018,7 +1025,7 @@ int kvm_riscv_vcpu_copy_reg_indices(struct kvm_vcpu *vcpu, return ret; uindices += ret; - ret = copy_sbi_ext_reg_indices(uindices); + ret = copy_sbi_ext_reg_indices(vcpu, uindices); if (ret < 0) return ret; diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index a04ff98085d9..dcdff4458190 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -80,6 +80,34 @@ static const struct kvm_riscv_sbi_extension_entry sbi_ext[] = { }, }; +static const struct kvm_riscv_sbi_extension_entry * +riscv_vcpu_get_sbi_ext(struct kvm_vcpu *vcpu, unsigned long idx) +{ + const struct kvm_riscv_sbi_extension_entry *sext = NULL; + + if (idx >= KVM_RISCV_SBI_EXT_MAX) + return NULL; + + for (int i = 0; i < ARRAY_SIZE(sbi_ext); i++) { + if (sbi_ext[i].ext_idx == idx) { + sext = &sbi_ext[i]; + break; + } + } + + return sext; +} + +bool riscv_vcpu_supports_sbi_ext(struct kvm_vcpu *vcpu, int idx) +{ + struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; + const struct kvm_riscv_sbi_extension_entry *sext; + + sext = riscv_vcpu_get_sbi_ext(vcpu, idx); + + return sext && scontext->ext_status[sext->ext_idx] != KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE; +} + void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run) { struct kvm_cpu_context *cp = &vcpu->arch.guest_context; @@ -140,28 +168,19 @@ static int riscv_vcpu_set_sbi_ext_single(struct kvm_vcpu *vcpu, unsigned long reg_num, unsigned long reg_val) { - unsigned long i; - const struct kvm_riscv_sbi_extension_entry *sext = NULL; struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; - - if (reg_num >= KVM_RISCV_SBI_EXT_MAX) - return -ENOENT; + const struct kvm_riscv_sbi_extension_entry *sext; if (reg_val != 1 && reg_val != 0) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) { - if (sbi_ext[i].ext_idx == reg_num) { - sext = &sbi_ext[i]; - break; - } - } - if (!sext) + sext = riscv_vcpu_get_sbi_ext(vcpu, reg_num); + if (!sext || scontext->ext_status[sext->ext_idx] == KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE) return -ENOENT; scontext->ext_status[sext->ext_idx] = (reg_val) ? 
- KVM_RISCV_SBI_EXT_AVAILABLE : - KVM_RISCV_SBI_EXT_UNAVAILABLE; + KVM_RISCV_SBI_EXT_STATUS_ENABLED : + KVM_RISCV_SBI_EXT_STATUS_DISABLED; return 0; } @@ -170,24 +189,16 @@ static int riscv_vcpu_get_sbi_ext_single(struct kvm_vcpu *vcpu, unsigned long reg_num, unsigned long *reg_val) { - unsigned long i; - const struct kvm_riscv_sbi_extension_entry *sext = NULL; struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; + const struct kvm_riscv_sbi_extension_entry *sext; - if (reg_num >= KVM_RISCV_SBI_EXT_MAX) - return -ENOENT; - - for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) { - if (sbi_ext[i].ext_idx == reg_num) { - sext = &sbi_ext[i]; - break; - } - } - if (!sext) + sext = riscv_vcpu_get_sbi_ext(vcpu, reg_num); + if (!sext || scontext->ext_status[sext->ext_idx] == KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE) return -ENOENT; *reg_val = scontext->ext_status[sext->ext_idx] == - KVM_RISCV_SBI_EXT_AVAILABLE; + KVM_RISCV_SBI_EXT_STATUS_ENABLED; + return 0; } @@ -325,7 +336,7 @@ const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext( if (ext->extid_start <= extid && ext->extid_end >= extid) { if (entry->ext_idx >= KVM_RISCV_SBI_EXT_MAX || scontext->ext_status[entry->ext_idx] == - KVM_RISCV_SBI_EXT_AVAILABLE) + KVM_RISCV_SBI_EXT_STATUS_ENABLED) return ext; return NULL; @@ -413,12 +424,12 @@ void kvm_riscv_vcpu_sbi_init(struct kvm_vcpu *vcpu) if (ext->probe && !ext->probe(vcpu)) { scontext->ext_status[entry->ext_idx] = - KVM_RISCV_SBI_EXT_UNAVAILABLE; + KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE; continue; } - scontext->ext_status[entry->ext_idx] = ext->default_unavail ? - KVM_RISCV_SBI_EXT_UNAVAILABLE : - KVM_RISCV_SBI_EXT_AVAILABLE; + scontext->ext_status[entry->ext_idx] = ext->default_disabled ? + KVM_RISCV_SBI_EXT_STATUS_DISABLED : + KVM_RISCV_SBI_EXT_STATUS_ENABLED; } } diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 23b57c931b15..9c2ab3dfa93a 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -204,6 +204,6 @@ static int kvm_sbi_ext_dbcn_handler(struct kvm_vcpu *vcpu, const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_dbcn = { .extid_start = SBI_EXT_DBCN, .extid_end = SBI_EXT_DBCN, - .default_unavail = true, + .default_disabled = true, .handler = kvm_sbi_ext_dbcn_handler, }; -- cgit v1.2.3 From 197bd237b67268651ac544e8fedbe1fd275d41e0 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Tue, 5 Dec 2023 14:45:07 -0300 Subject: RISC-V: KVM: set 'vlenb' in kvm_riscv_vcpu_alloc_vector_context() 'vlenb', added to riscv_v_ext_state by commit c35f3aa34509 ("RISC-V: vector: export VLENB csr in __sc_riscv_v_state"), isn't being initialized in guest_context. If we export 'vlenb' as a KVM CSR, something we want to do in the next patch, it'll always return 0. Set 'vlenb' to riscv_v_size/32. 
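For clarity, a minimal sketch of the relationship relied on here (illustrative; the kernel symbol is riscv_v_vsize): the vector context backing store covers all 32 vector registers, so the byte width of a single register, vlenb (that is, VLEN/8), falls out as riscv_v_vsize / 32.

#include <asm/vector.h>	/* riscv_v_vsize */

/* Illustrative only: 32 vector registers of vlenb bytes each. */
static inline unsigned long guest_vlenb(void)
{
	return riscv_v_vsize / 32;
}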
Signed-off-by: Daniel Henrique Barboza Reviewed-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_vector.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_vector.c b/arch/riscv/kvm/vcpu_vector.c index b339a2682f25..530e49c588d6 100644 --- a/arch/riscv/kvm/vcpu_vector.c +++ b/arch/riscv/kvm/vcpu_vector.c @@ -76,6 +76,7 @@ int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu, cntx->vector.datap = kmalloc(riscv_v_vsize, GFP_KERNEL); if (!cntx->vector.datap) return -ENOMEM; + cntx->vector.vlenb = riscv_v_vsize / 32; vcpu->arch.host_context.vector.datap = kzalloc(riscv_v_vsize, GFP_KERNEL); if (!vcpu->arch.host_context.vector.datap) -- cgit v1.2.3 From 2fa290372dfe7dd248b1c16f943f273a3e674f22 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Tue, 5 Dec 2023 14:45:08 -0300 Subject: RISC-V: KVM: add 'vlenb' Vector CSR Userspace requires 'vlenb' to be able to encode it in reg ID. Otherwise it is not possible to retrieve any vector reg since we're returning EINVAL if reg_size isn't vlenb (see kvm_riscv_vcpu_vreg_addr()). Signed-off-by: Daniel Henrique Barboza Reviewed-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_vector.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_vector.c b/arch/riscv/kvm/vcpu_vector.c index 530e49c588d6..d92d1348045c 100644 --- a/arch/riscv/kvm/vcpu_vector.c +++ b/arch/riscv/kvm/vcpu_vector.c @@ -116,6 +116,9 @@ static int kvm_riscv_vcpu_vreg_addr(struct kvm_vcpu *vcpu, case KVM_REG_RISCV_VECTOR_CSR_REG(vcsr): *reg_addr = &cntx->vector.vcsr; break; + case KVM_REG_RISCV_VECTOR_CSR_REG(vlenb): + *reg_addr = &cntx->vector.vlenb; + break; case KVM_REG_RISCV_VECTOR_CSR_REG(datap): default: return -ENOENT; @@ -174,6 +177,18 @@ int kvm_riscv_vcpu_set_reg_vector(struct kvm_vcpu *vcpu, if (!riscv_isa_extension_available(isa, v)) return -ENOENT; + if (reg_num == KVM_REG_RISCV_VECTOR_CSR_REG(vlenb)) { + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + unsigned long reg_val; + + if (copy_from_user(®_val, uaddr, reg_size)) + return -EFAULT; + if (reg_val != cntx->vector.vlenb) + return -EINVAL; + + return 0; + } + rc = kvm_riscv_vcpu_vreg_addr(vcpu, reg_num, reg_size, ®_addr); if (rc) return rc; -- cgit v1.2.3 From 3975525e554559117bbf569239c8b41f2c2fa5cf Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Tue, 5 Dec 2023 14:45:09 -0300 Subject: RISC-V: KVM: add vector registers and CSRs in KVM_GET_REG_LIST Add all vector registers and CSRs (vstart, vl, vtype, vcsr, vlenb) in get-reg-list. 
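A userspace-side sketch of how the listed vector register IDs can be interpreted (illustrative, not part of this patch): the 32 vector registers are reported with a KVM_REG_SIZE field of log2(vlenb), so the register width in bytes can be recovered directly from each returned ID.

#include <stdint.h>
#include <linux/kvm.h>

/* Illustrative: decode the byte width encoded in a KVM_GET_REG_LIST entry. */
static uint64_t riscv_vreg_bytes(uint64_t reg_id)
{
	return 1ULL << ((reg_id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT);
}

The five CSRs (vstart, vl, vtype, vcsr, vlenb) are listed with the plain guest word size, which is why num_vector_regs() reports 37 entries in total.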
Signed-off-by: Daniel Henrique Barboza Reviewed-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_onereg.c | 55 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index 48262be73aa0..11cdbf844291 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -961,6 +961,55 @@ static unsigned long num_sbi_ext_regs(struct kvm_vcpu *vcpu) return copy_sbi_ext_reg_indices(vcpu, NULL); } +static inline unsigned long num_vector_regs(const struct kvm_vcpu *vcpu) +{ + if (!riscv_isa_extension_available(vcpu->arch.isa, v)) + return 0; + + /* vstart, vl, vtype, vcsr, vlenb and 32 vector regs */ + return 37; +} + +static int copy_vector_reg_indices(const struct kvm_vcpu *vcpu, + u64 __user *uindices) +{ + const struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + int n = num_vector_regs(vcpu); + u64 reg, size; + int i; + + if (n == 0) + return 0; + + /* copy vstart, vl, vtype, vcsr and vlenb */ + size = IS_ENABLED(CONFIG_32BIT) ? KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; + for (i = 0; i < 5; i++) { + reg = KVM_REG_RISCV | size | KVM_REG_RISCV_VECTOR | i; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + } + + /* vector_regs have a variable 'vlenb' size */ + size = __builtin_ctzl(cntx->vector.vlenb); + size <<= KVM_REG_SIZE_SHIFT; + for (i = 0; i < 32; i++) { + reg = KVM_REG_RISCV | KVM_REG_RISCV_VECTOR | size | + KVM_REG_RISCV_VECTOR_REG(i); + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + } + + return n; +} + /* * kvm_riscv_vcpu_num_regs - how many registers do we present via KVM_GET/SET_ONE_REG * @@ -976,6 +1025,7 @@ unsigned long kvm_riscv_vcpu_num_regs(struct kvm_vcpu *vcpu) res += num_timer_regs(); res += num_fp_f_regs(vcpu); res += num_fp_d_regs(vcpu); + res += num_vector_regs(vcpu); res += num_isa_ext_regs(vcpu); res += num_sbi_ext_regs(vcpu); @@ -1020,6 +1070,11 @@ int kvm_riscv_vcpu_copy_reg_indices(struct kvm_vcpu *vcpu, return ret; uindices += ret; + ret = copy_vector_reg_indices(vcpu, uindices); + if (ret < 0) + return ret; + uindices += ret; + ret = copy_isa_ext_reg_indices(vcpu, uindices); if (ret < 0) return ret; -- cgit v1.2.3 From 4c460eb369514d53383a7c6ba1aefbca4914c68b Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Sun, 24 Dec 2023 14:04:02 +0530 Subject: RISC-V: KVM: Fix indentation in kvm_riscv_vcpu_set_reg_csr() The indentation of "break" in kvm_riscv_vcpu_set_reg_csr() is inconsistent hence let us fix it. 
Fixes: c04913f2b54e ("RISCV: KVM: Add sstateen0 to ONE_REG") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312190719.kBuYl6oJ-lkp@intel.com/ Signed-off-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_onereg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index 11cdbf844291..c0bad1aed9f0 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -485,7 +485,7 @@ static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu, if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) rc = kvm_riscv_vcpu_smstateen_set_csr(vcpu, reg_num, reg_val); -break; + break; default: rc = -ENOENT; break; -- cgit v1.2.3 From 323925ed6dbb0ed877047b28fae4152527cc63db Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 20 Dec 2023 17:00:14 +0100 Subject: RISC-V: paravirt: Add skeleton for pv-time support Add the files and functions needed to support paravirt time on RISC-V. Also include the common code needed for the first application of pv-time, which is steal-time. In the next patches we'll complete the functions to fully enable steal-time support. Acked-by: Palmer Dabbelt Reviewed-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Andrew Jones Signed-off-by: Anup Patel --- Documentation/admin-guide/kernel-parameters.txt | 6 +- arch/riscv/include/asm/paravirt.h | 28 +++++++++ arch/riscv/include/asm/paravirt_api_clock.h | 1 + arch/riscv/kernel/Makefile | 1 + arch/riscv/kernel/paravirt.c | 79 +++++++++++++++++++++++++ arch/riscv/kernel/time.c | 3 + 6 files changed, 115 insertions(+), 3 deletions(-) create mode 100644 arch/riscv/include/asm/paravirt.h create mode 100644 arch/riscv/include/asm/paravirt_api_clock.h create mode 100644 arch/riscv/kernel/paravirt.c (limited to 'arch') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 65731b060e3f..a0d9259e4857 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3985,9 +3985,9 @@ vulnerability. System may allow data leaks with this option. - no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES] Disable paravirtualized - steal time accounting. steal time is computed, but - won't influence scheduler behaviour + no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV] Disable + paravirtualized steal time accounting. steal time is + computed, but won't influence scheduler behaviour nosync [HW,M68K] Disables sync negotiation for all devices. 
diff --git a/arch/riscv/include/asm/paravirt.h b/arch/riscv/include/asm/paravirt.h new file mode 100644 index 000000000000..c0abde70fc2c --- /dev/null +++ b/arch/riscv/include/asm/paravirt.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_PARAVIRT_H +#define _ASM_RISCV_PARAVIRT_H + +#ifdef CONFIG_PARAVIRT +#include + +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; + +u64 dummy_steal_clock(int cpu); + +DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock); + +static inline u64 paravirt_steal_clock(int cpu) +{ + return static_call(pv_steal_clock)(cpu); +} + +int __init pv_time_init(void); + +#else + +#define pv_time_init() do {} while (0) + +#endif /* CONFIG_PARAVIRT */ +#endif /* _ASM_RISCV_PARAVIRT_H */ diff --git a/arch/riscv/include/asm/paravirt_api_clock.h b/arch/riscv/include/asm/paravirt_api_clock.h new file mode 100644 index 000000000000..65ac7cee0dad --- /dev/null +++ b/arch/riscv/include/asm/paravirt_api_clock.h @@ -0,0 +1 @@ +#include diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index fee22a3d1b53..807c2bde1f83 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -85,6 +85,7 @@ obj-$(CONFIG_SMP) += sbi-ipi.o obj-$(CONFIG_SMP) += cpu_ops_sbi.o endif obj-$(CONFIG_HOTPLUG_CPU) += cpu-hotplug.o +obj-$(CONFIG_PARAVIRT) += paravirt.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_KEXEC_CORE) += kexec_relocate.o crash_save_regs.o machine_kexec.o obj-$(CONFIG_KEXEC_FILE) += elf_kexec.o machine_kexec_file.o diff --git a/arch/riscv/kernel/paravirt.c b/arch/riscv/kernel/paravirt.c new file mode 100644 index 000000000000..48d802df7739 --- /dev/null +++ b/arch/riscv/kernel/paravirt.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023 Ventana Micro Systems Inc. 
+ */ + +#define pr_fmt(fmt) "riscv-pv: " fmt + +#include +#include +#include +#include +#include +#include + +#include + +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; + +static u64 native_steal_clock(int cpu) +{ + return 0; +} + +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); + +static bool steal_acc = true; +static int __init parse_no_stealacc(char *arg) +{ + steal_acc = false; + return 0; +} + +early_param("no-steal-acc", parse_no_stealacc); + +static bool __init has_pv_steal_clock(void) +{ + return false; +} + +static int pv_time_cpu_online(unsigned int cpu) +{ + return 0; +} + +static int pv_time_cpu_down_prepare(unsigned int cpu) +{ + return 0; +} + +static u64 pv_time_steal_clock(int cpu) +{ + return 0; +} + +int __init pv_time_init(void) +{ + int ret; + + if (!has_pv_steal_clock()) + return 0; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "riscv/pv_time:online", + pv_time_cpu_online, + pv_time_cpu_down_prepare); + if (ret < 0) + return ret; + + static_call_update(pv_steal_clock, pv_time_steal_clock); + + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); + + pr_info("Computing paravirt steal-time\n"); + + return 0; +} diff --git a/arch/riscv/kernel/time.c b/arch/riscv/kernel/time.c index 23641e82a9df..ba3477197789 100644 --- a/arch/riscv/kernel/time.c +++ b/arch/riscv/kernel/time.c @@ -12,6 +12,7 @@ #include #include #include +#include unsigned long riscv_timebase __ro_after_init; EXPORT_SYMBOL_GPL(riscv_timebase); @@ -45,4 +46,6 @@ void __init time_init(void) timer_probe(); tick_setup_hrtimer_broadcast(); + + pv_time_init(); } -- cgit v1.2.3 From 6cfc624576a64145b1d6d3d48de7161a7505f403 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 20 Dec 2023 17:00:15 +0100 Subject: RISC-V: Add SBI STA extension definitions The SBI STA extension enables steal-time accounting. Add the definitions it specifies. Acked-by: Palmer Dabbelt Reviewed-by: Conor Dooley Reviewed-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/include/asm/sbi.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 0892f4421bc4..b6f898c56940 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -31,6 +31,7 @@ enum sbi_ext_id { SBI_EXT_SRST = 0x53525354, SBI_EXT_PMU = 0x504D55, SBI_EXT_DBCN = 0x4442434E, + SBI_EXT_STA = 0x535441, /* Experimentals extensions must lie within this range */ SBI_EXT_EXPERIMENTAL_START = 0x08000000, @@ -243,6 +244,22 @@ enum sbi_ext_dbcn_fid { SBI_EXT_DBCN_CONSOLE_WRITE_BYTE = 2, }; +/* SBI STA (steal-time accounting) extension */ +enum sbi_ext_sta_fid { + SBI_EXT_STA_STEAL_TIME_SET_SHMEM = 0, +}; + +struct sbi_sta_struct { + __le32 sequence; + __le32 flags; + __le64 steal; + u8 preempted; + u8 pad[47]; +} __packed; + +#define SBI_STA_SHMEM_DISABLE -1 + +/* SBI spec version fields */ #define SBI_SPEC_VERSION_DEFAULT 0x1 #define SBI_SPEC_VERSION_MAJOR_SHIFT 24 #define SBI_SPEC_VERSION_MAJOR_MASK 0x7f -- cgit v1.2.3 From fdf68acccfc6af9497c34ee411d89af13b6516ed Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 20 Dec 2023 17:00:16 +0100 Subject: RISC-V: paravirt: Implement steal-time support When the SBI STA extension exists we can use it to implement paravirt steal-time support. 
Fill in the empty pv-time functions with an SBI STA implementation and add the Kconfig knobs allowing it to be enabled. Acked-by: Palmer Dabbelt Reviewed-by: Atish Patra Reviewed-by: Anup Patel Signed-off-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/Kconfig | 19 ++++++++++++++ arch/riscv/kernel/paravirt.c | 62 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 78 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 24c1799e2ec4..ef8b7b012a0b 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -724,6 +724,25 @@ config COMPAT If you want to execute 32-bit userspace applications, say Y. +config PARAVIRT + bool "Enable paravirtualization code" + depends on RISCV_SBI + help + This changes the kernel so it can modify itself when it is run + under a hypervisor, potentially improving performance significantly + over full virtualization. + +config PARAVIRT_TIME_ACCOUNTING + bool "Paravirtual steal time accounting" + depends on PARAVIRT + help + Select this option to enable fine granularity task steal time + accounting. Time spent executing other tasks in parallel with + the current vCPU is discounted from the vCPU power. To account for + that, there can be a small performance impact. + + If in doubt, say N here. + config RELOCATABLE bool "Build a relocatable kernel" depends on MMU && 64BIT && !XIP_KERNEL diff --git a/arch/riscv/kernel/paravirt.c b/arch/riscv/kernel/paravirt.c index 48d802df7739..8e114f5930ce 100644 --- a/arch/riscv/kernel/paravirt.c +++ b/arch/riscv/kernel/paravirt.c @@ -6,13 +6,21 @@ #define pr_fmt(fmt) "riscv-pv: " fmt #include +#include +#include #include #include +#include +#include +#include #include #include #include +#include +#include #include +#include struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; @@ -33,24 +41,72 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); +DEFINE_PER_CPU(struct sbi_sta_struct, steal_time) __aligned(64); + static bool __init has_pv_steal_clock(void) { + if (sbi_spec_version >= sbi_mk_version(2, 0) && + sbi_probe_extension(SBI_EXT_STA) > 0) { + pr_info("SBI STA extension detected\n"); + return true; + } + return false; } -static int pv_time_cpu_online(unsigned int cpu) +static int sbi_sta_steal_time_set_shmem(unsigned long lo, unsigned long hi, + unsigned long flags) { + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_STA, SBI_EXT_STA_STEAL_TIME_SET_SHMEM, + lo, hi, flags, 0, 0, 0); + if (ret.error) { + if (lo == SBI_STA_SHMEM_DISABLE && hi == SBI_STA_SHMEM_DISABLE) + pr_warn("Failed to disable steal-time shmem"); + else + pr_warn("Failed to set steal-time shmem"); + return sbi_err_map_linux_errno(ret.error); + } + return 0; } +static int pv_time_cpu_online(unsigned int cpu) +{ + struct sbi_sta_struct *st = this_cpu_ptr(&steal_time); + phys_addr_t pa = __pa(st); + unsigned long lo = (unsigned long)pa; + unsigned long hi = IS_ENABLED(CONFIG_32BIT) ? upper_32_bits((u64)pa) : 0; + + return sbi_sta_steal_time_set_shmem(lo, hi, 0); +} + static int pv_time_cpu_down_prepare(unsigned int cpu) { - return 0; + return sbi_sta_steal_time_set_shmem(SBI_STA_SHMEM_DISABLE, + SBI_STA_SHMEM_DISABLE, 0); } static u64 pv_time_steal_clock(int cpu) { - return 0; + struct sbi_sta_struct *st = per_cpu_ptr(&steal_time, cpu); + u32 sequence; + u64 steal; + + /* + * Check the sequence field before and after reading the steal + * field. Repeat the read if it is different or odd. 
+ */ + do { + sequence = READ_ONCE(st->sequence); + virt_rmb(); + steal = READ_ONCE(st->steal); + virt_rmb(); + } while ((le32_to_cpu(sequence) & 1) || + sequence != READ_ONCE(st->sequence)); + + return le64_to_cpu(steal); } int __init pv_time_init(void) -- cgit v1.2.3 From 5fed84a800e6048656c17be6e921787db2b5c6c0 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 20 Dec 2023 17:00:17 +0100 Subject: RISC-V: KVM: Add SBI STA extension skeleton Add the files and functions needed to support the SBI STA (steal-time accounting) extension. In the next patches we'll complete the functions to fully enable SBI STA support. Reviewed-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_vcpu_sbi.h | 1 + arch/riscv/include/uapi/asm/kvm.h | 1 + arch/riscv/kvm/Makefile | 1 + arch/riscv/kvm/vcpu_sbi.c | 4 +++ arch/riscv/kvm/vcpu_sbi_sta.c | 47 +++++++++++++++++++++++++++++++++++ 5 files changed, 54 insertions(+) create mode 100644 arch/riscv/kvm/vcpu_sbi_sta.c (limited to 'arch') diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h index bffda0ac59b6..99c23bb37a37 100644 --- a/arch/riscv/include/asm/kvm_vcpu_sbi.h +++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h @@ -76,6 +76,7 @@ extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_rfence; extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_srst; extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_hsm; extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_dbcn; +extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_sta; extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental; extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor; diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 60d3b21dead7..e961d79622fb 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -157,6 +157,7 @@ enum KVM_RISCV_SBI_EXT_ID { KVM_RISCV_SBI_EXT_EXPERIMENTAL, KVM_RISCV_SBI_EXT_VENDOR, KVM_RISCV_SBI_EXT_DBCN, + KVM_RISCV_SBI_EXT_STA, KVM_RISCV_SBI_EXT_MAX, }; diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile index 4c2067fc59fc..c9646521f113 100644 --- a/arch/riscv/kvm/Makefile +++ b/arch/riscv/kvm/Makefile @@ -26,6 +26,7 @@ kvm-$(CONFIG_RISCV_SBI_V01) += vcpu_sbi_v01.o kvm-y += vcpu_sbi_base.o kvm-y += vcpu_sbi_replace.o kvm-y += vcpu_sbi_hsm.o +kvm-y += vcpu_sbi_sta.o kvm-y += vcpu_timer.o kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_pmu.o vcpu_sbi_pmu.o kvm-y += aia.o diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index dcdff4458190..088daaa23dd8 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -70,6 +70,10 @@ static const struct kvm_riscv_sbi_extension_entry sbi_ext[] = { .ext_idx = KVM_RISCV_SBI_EXT_DBCN, .ext_ptr = &vcpu_sbi_ext_dbcn, }, + { + .ext_idx = KVM_RISCV_SBI_EXT_STA, + .ext_ptr = &vcpu_sbi_ext_sta, + }, { .ext_idx = KVM_RISCV_SBI_EXT_EXPERIMENTAL, .ext_ptr = &vcpu_sbi_ext_experimental, diff --git a/arch/riscv/kvm/vcpu_sbi_sta.c b/arch/riscv/kvm/vcpu_sbi_sta.c new file mode 100644 index 000000000000..839911dcd837 --- /dev/null +++ b/arch/riscv/kvm/vcpu_sbi_sta.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023 Ventana Micro Systems Inc. 
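For context, a guest-side sketch of the call this handler will service (illustrative; the in-tree caller is the sbi_sta_steal_time_set_shmem() added to paravirt.c earlier in this series): SBI_EXT_STA is 0x535441 ("STA" in ASCII) and function 0 takes the shared-memory physical address split across a0/a1 plus a flags word in a2.

#include <linux/kernel.h>
#include <asm/sbi.h>

/* Illustrative guest-side call; mirrors the ABI the KVM handler implements. */
static int sta_set_shmem(phys_addr_t pa)
{
	unsigned long lo = (unsigned long)pa;
	unsigned long hi = IS_ENABLED(CONFIG_32BIT) ? upper_32_bits((u64)pa) : 0;
	struct sbiret ret;

	ret = sbi_ecall(SBI_EXT_STA, SBI_EXT_STA_STEAL_TIME_SET_SHMEM,
			lo, hi, 0 /* flags */, 0, 0, 0);

	return ret.error ? sbi_err_map_linux_errno(ret.error) : 0;
}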
+ */ + +#include + +#include +#include + +static int kvm_sbi_sta_steal_time_set_shmem(struct kvm_vcpu *vcpu) +{ + return SBI_ERR_FAILURE; +} + +static int kvm_sbi_ext_sta_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + unsigned long funcid = cp->a6; + int ret; + + switch (funcid) { + case SBI_EXT_STA_STEAL_TIME_SET_SHMEM: + ret = kvm_sbi_sta_steal_time_set_shmem(vcpu); + break; + default: + ret = SBI_ERR_NOT_SUPPORTED; + break; + } + + retdata->err_val = ret; + + return 0; +} + +static unsigned long kvm_sbi_ext_sta_probe(struct kvm_vcpu *vcpu) +{ + return 0; +} + +const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_sta = { + .extid_start = SBI_EXT_STA, + .extid_end = SBI_EXT_STA, + .handler = kvm_sbi_ext_sta_handler, + .probe = kvm_sbi_ext_sta_probe, +}; -- cgit v1.2.3 From 2a1f6bf079700f0f9d8045ab77b302aeb4d12c06 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 20 Dec 2023 17:00:18 +0100 Subject: RISC-V: KVM: Add steal-update vcpu request Add a new vcpu request to inform a vcpu that it should record its steal-time information. The request is made each time it has been detected that the vcpu task was not assigned a cpu for some time, which is easy to do by making the request from vcpu-load. The record function is just a stub for now and will be filled in with the rest of the steal-time support functions in following patches. Reviewed-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_host.h | 3 +++ arch/riscv/kvm/vcpu.c | 5 +++++ arch/riscv/kvm/vcpu_sbi_sta.c | 4 ++++ 3 files changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 0eefd9c991ae..230b82c3118d 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -41,6 +41,7 @@ KVM_ARCH_REQ_FLAGS(4, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HFENCE \ KVM_ARCH_REQ_FLAGS(5, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(6) enum kvm_riscv_hfence_type { KVM_RISCV_HFENCE_UNKNOWN = 0, @@ -372,4 +373,6 @@ bool kvm_riscv_vcpu_has_interrupts(struct kvm_vcpu *vcpu, u64 mask); void kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu); + #endif /* __RISCV_KVM_HOST_H__ */ diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index bf3952d1a621..6995b8b641e4 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -541,6 +541,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_riscv_vcpu_aia_load(vcpu, cpu); + kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); + vcpu->cpu = cpu; } @@ -614,6 +616,9 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_HFENCE, vcpu)) kvm_riscv_hfence_process(vcpu); + + if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) + kvm_riscv_vcpu_record_steal_time(vcpu); } } diff --git a/arch/riscv/kvm/vcpu_sbi_sta.c b/arch/riscv/kvm/vcpu_sbi_sta.c index 839911dcd837..e28351c9488b 100644 --- a/arch/riscv/kvm/vcpu_sbi_sta.c +++ b/arch/riscv/kvm/vcpu_sbi_sta.c @@ -8,6 +8,10 @@ #include #include +void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu) +{ +} + static int kvm_sbi_sta_steal_time_set_shmem(struct kvm_vcpu *vcpu) { return SBI_ERR_FAILURE; -- cgit v1.2.3 From 38b3390ee4880140b6245fe3273fe9ce53f65bde Mon 
Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 20 Dec 2023 17:00:19 +0100 Subject: RISC-V: KVM: Add SBI STA info to vcpu_arch KVM's implementation of SBI STA needs to track the address of each VCPU's steal-time shared memory region as well as the amount of stolen time. Add a structure to vcpu_arch to contain this state and make sure that the address is always set to INVALID_GPA on vcpu reset. And, of course, ensure KVM won't try to update steal- time when the shared memory address is invalid. Reviewed-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_host.h | 7 +++++++ arch/riscv/kvm/vcpu.c | 2 ++ arch/riscv/kvm/vcpu_sbi_sta.c | 10 ++++++++++ 3 files changed, 19 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 230b82c3118d..525cba63e0c5 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -263,6 +263,12 @@ struct kvm_vcpu_arch { /* 'static' configurations which are set only once */ struct kvm_vcpu_config cfg; + + /* SBI steal-time accounting */ + struct { + gpa_t shmem; + u64 last_steal; + } sta; }; static inline void kvm_arch_sync_events(struct kvm *kvm) {} @@ -373,6 +379,7 @@ bool kvm_riscv_vcpu_has_interrupts(struct kvm_vcpu *vcpu, u64 mask); void kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_sbi_sta_reset(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu); #endif /* __RISCV_KVM_HOST_H__ */ diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 6995b8b641e4..b5ca9f2e98ac 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -83,6 +83,8 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.hfence_tail = 0; memset(vcpu->arch.hfence_queue, 0, sizeof(vcpu->arch.hfence_queue)); + kvm_riscv_vcpu_sbi_sta_reset(vcpu); + /* Reset the guest CSRs for hotplug usecase */ if (loaded) kvm_arch_vcpu_load(vcpu, smp_processor_id()); diff --git a/arch/riscv/kvm/vcpu_sbi_sta.c b/arch/riscv/kvm/vcpu_sbi_sta.c index e28351c9488b..6592d287fc4e 100644 --- a/arch/riscv/kvm/vcpu_sbi_sta.c +++ b/arch/riscv/kvm/vcpu_sbi_sta.c @@ -8,8 +8,18 @@ #include #include +void kvm_riscv_vcpu_sbi_sta_reset(struct kvm_vcpu *vcpu) +{ + vcpu->arch.sta.shmem = INVALID_GPA; + vcpu->arch.sta.last_steal = 0; +} + void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu) { + gpa_t shmem = vcpu->arch.sta.shmem; + + if (shmem == INVALID_GPA) + return; } static int kvm_sbi_sta_steal_time_set_shmem(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 5b9e41321ba919dd051c68d2a1d2c753aa61634c Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 20 Dec 2023 17:00:20 +0100 Subject: RISC-V: KVM: Add support for SBI extension registers Some SBI extensions have state that needs to be saved / restored when migrating the VM. Provide a get/set-one-reg register type for SBI extension registers. Each SBI extension that uses this type will have its own subtype. There are currently no subtypes defined. The next patch introduces the first one. 
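For illustration, how an ID of the new register type can be composed from userspace (a sketch built on the uapi below; the helper name is made up): arch, size, type 10, per-extension subtype and index are simply OR'ed together, with the subtype macros already shifted into the subtype field (the next patch adds the first one, KVM_REG_RISCV_SBI_STA).

#include <stdint.h>
#include <linux/kvm.h>
#include <asm/kvm.h>

/*
 * Illustrative: build a type-10 (SBI state) register ID. The size field
 * is the guest word size, assumed here to match the host's unsigned long.
 */
static uint64_t riscv_sbi_state_reg(uint64_t subtype, uint64_t idx)
{
	uint64_t size = sizeof(unsigned long) == 4 ? KVM_REG_SIZE_U32
						   : KVM_REG_SIZE_U64;

	return KVM_REG_RISCV | size | KVM_REG_RISCV_SBI_STATE | subtype | idx;
}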
Reviewed-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_vcpu_sbi.h | 4 +++ arch/riscv/include/uapi/asm/kvm.h | 3 ++ arch/riscv/kvm/vcpu_onereg.c | 42 ++++++++++++++++++++++--- arch/riscv/kvm/vcpu_sbi.c | 58 +++++++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h index 99c23bb37a37..dd60f73b5c36 100644 --- a/arch/riscv/include/asm/kvm_vcpu_sbi.h +++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h @@ -60,6 +60,10 @@ int kvm_riscv_vcpu_set_reg_sbi_ext(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_riscv_vcpu_get_reg_sbi_ext(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); +int kvm_riscv_vcpu_set_reg_sbi(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg); +int kvm_riscv_vcpu_get_reg_sbi(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg); const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext( struct kvm_vcpu *vcpu, unsigned long extid); bool riscv_vcpu_supports_sbi_ext(struct kvm_vcpu *vcpu, int idx); diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index e961d79622fb..3471b1e48d18 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -242,6 +242,9 @@ enum KVM_RISCV_SBI_EXT_ID { #define KVM_REG_RISCV_VECTOR_REG(n) \ ((n) + sizeof(struct __riscv_v_ext_state) / sizeof(unsigned long)) +/* Registers for specific SBI extensions are mapped as type 10 */ +#define KVM_REG_RISCV_SBI_STATE (0x0a << KVM_REG_RISCV_TYPE_SHIFT) + /* Device Control API: RISC-V AIA */ #define KVM_DEV_RISCV_APLIC_ALIGN 0x1000 #define KVM_DEV_RISCV_APLIC_SIZE 0x4000 diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index c0bad1aed9f0..143d0edd7f63 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -961,6 +961,29 @@ static unsigned long num_sbi_ext_regs(struct kvm_vcpu *vcpu) return copy_sbi_ext_reg_indices(vcpu, NULL); } +static inline unsigned long num_sbi_regs(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static int copy_sbi_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) +{ + int n = num_sbi_regs(vcpu); + + for (int i = 0; i < n; i++) { + u64 reg = KVM_REG_RISCV | KVM_REG_SIZE_U64 | + KVM_REG_RISCV_SBI_STATE | i; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + } + + return n; +} + static inline unsigned long num_vector_regs(const struct kvm_vcpu *vcpu) { if (!riscv_isa_extension_available(vcpu->arch.isa, v)) @@ -1028,6 +1051,7 @@ unsigned long kvm_riscv_vcpu_num_regs(struct kvm_vcpu *vcpu) res += num_vector_regs(vcpu); res += num_isa_ext_regs(vcpu); res += num_sbi_ext_regs(vcpu); + res += num_sbi_regs(vcpu); return res; } @@ -1083,6 +1107,12 @@ int kvm_riscv_vcpu_copy_reg_indices(struct kvm_vcpu *vcpu, ret = copy_sbi_ext_reg_indices(vcpu, uindices); if (ret < 0) return ret; + uindices += ret; + + ret = copy_sbi_reg_indices(vcpu, uindices); + if (ret < 0) + return ret; + uindices += ret; return 0; } @@ -1105,12 +1135,14 @@ int kvm_riscv_vcpu_set_reg(struct kvm_vcpu *vcpu, case KVM_REG_RISCV_FP_D: return kvm_riscv_vcpu_set_reg_fp(vcpu, reg, KVM_REG_RISCV_FP_D); + case KVM_REG_RISCV_VECTOR: + return kvm_riscv_vcpu_set_reg_vector(vcpu, reg); case KVM_REG_RISCV_ISA_EXT: return kvm_riscv_vcpu_set_reg_isa_ext(vcpu, reg); case KVM_REG_RISCV_SBI_EXT: return kvm_riscv_vcpu_set_reg_sbi_ext(vcpu, reg); - case 
KVM_REG_RISCV_VECTOR: - return kvm_riscv_vcpu_set_reg_vector(vcpu, reg); + case KVM_REG_RISCV_SBI_STATE: + return kvm_riscv_vcpu_set_reg_sbi(vcpu, reg); default: break; } @@ -1136,12 +1168,14 @@ int kvm_riscv_vcpu_get_reg(struct kvm_vcpu *vcpu, case KVM_REG_RISCV_FP_D: return kvm_riscv_vcpu_get_reg_fp(vcpu, reg, KVM_REG_RISCV_FP_D); + case KVM_REG_RISCV_VECTOR: + return kvm_riscv_vcpu_get_reg_vector(vcpu, reg); case KVM_REG_RISCV_ISA_EXT: return kvm_riscv_vcpu_get_reg_isa_ext(vcpu, reg); case KVM_REG_RISCV_SBI_EXT: return kvm_riscv_vcpu_get_reg_sbi_ext(vcpu, reg); - case KVM_REG_RISCV_VECTOR: - return kvm_riscv_vcpu_get_reg_vector(vcpu, reg); + case KVM_REG_RISCV_SBI_STATE: + return kvm_riscv_vcpu_get_reg_sbi(vcpu, reg); default: break; } diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index 088daaa23dd8..a1997c39dfde 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -325,6 +325,64 @@ int kvm_riscv_vcpu_get_reg_sbi_ext(struct kvm_vcpu *vcpu, return 0; } +int kvm_riscv_vcpu_set_reg_sbi(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_SBI_STATE); + unsigned long reg_subtype, reg_val; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK; + reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK; + + switch (reg_subtype) { + default: + return -EINVAL; + } + + return 0; +} + +int kvm_riscv_vcpu_get_reg_sbi(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_SBI_STATE); + unsigned long reg_subtype, reg_val; + int ret; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK; + reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK; + + switch (reg_subtype) { + default: + return -EINVAL; + } + + if (ret) + return ret; + + if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext( struct kvm_vcpu *vcpu, unsigned long extid) { -- cgit v1.2.3 From f61ce890b1f0742f17b3a5d1f8c72574a33ffeb2 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 20 Dec 2023 17:00:21 +0100 Subject: RISC-V: KVM: Add support for SBI STA registers KVM userspace needs to be able to save and restore the steal-time shared memory address. Provide the address through the get/set-one-reg interface with two ulong-sized SBI STA extension registers (lo and hi). 64-bit KVM userspace must not set the hi register to anything other than zero and is allowed to completely neglect saving/restoring it. 
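A userspace sketch of the save path described above (illustrative, with made-up helper names; it assumes the userspace word size matches the guest's): read the two ulong-sized STA registers through KVM_GET_ONE_REG. On 64-bit, shmem_hi reads back as zero and may be skipped entirely.

#include <stdint.h>
#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <asm/kvm.h>

static int sta_get_reg(int vcpu_fd, uint64_t reg_off, unsigned long *val)
{
	uint64_t size = sizeof(unsigned long) == 4 ? KVM_REG_SIZE_U32
						   : KVM_REG_SIZE_U64;
	struct kvm_one_reg reg = {
		.id = KVM_REG_RISCV | size | KVM_REG_RISCV_SBI_STATE |
		      KVM_REG_RISCV_SBI_STA | reg_off,
		.addr = (uint64_t)(unsigned long)val,
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) ? -errno : 0;
}

/* Save the steal-time shmem address: shmem_lo always, shmem_hi for rv32. */
static int sta_save_shmem(int vcpu_fd, unsigned long *lo, unsigned long *hi)
{
	int ret = sta_get_reg(vcpu_fd, KVM_REG_RISCV_SBI_STA_REG(shmem_lo), lo);

	if (!ret)
		ret = sta_get_reg(vcpu_fd, KVM_REG_RISCV_SBI_STA_REG(shmem_hi), hi);
	return ret;
}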
Reviewed-by: Anup Patel Signed-off-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_vcpu_sbi.h | 5 ++++ arch/riscv/include/uapi/asm/kvm.h | 9 ++++++ arch/riscv/kvm/vcpu_onereg.c | 37 ++++++++++++++--------- arch/riscv/kvm/vcpu_sbi.c | 5 ++++ arch/riscv/kvm/vcpu_sbi_sta.c | 55 +++++++++++++++++++++++++++++++++++ 5 files changed, 97 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h index dd60f73b5c36..b96705258cf9 100644 --- a/arch/riscv/include/asm/kvm_vcpu_sbi.h +++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h @@ -70,6 +70,11 @@ bool riscv_vcpu_supports_sbi_ext(struct kvm_vcpu *vcpu, int idx); int kvm_riscv_vcpu_sbi_ecall(struct kvm_vcpu *vcpu, struct kvm_run *run); void kvm_riscv_vcpu_sbi_init(struct kvm_vcpu *vcpu); +int kvm_riscv_vcpu_get_reg_sbi_sta(struct kvm_vcpu *vcpu, unsigned long reg_num, + unsigned long *reg_val); +int kvm_riscv_vcpu_set_reg_sbi_sta(struct kvm_vcpu *vcpu, unsigned long reg_num, + unsigned long reg_val); + #ifdef CONFIG_RISCV_SBI_V01 extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01; #endif diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 3471b1e48d18..d6b7a5b95874 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -161,6 +161,12 @@ enum KVM_RISCV_SBI_EXT_ID { KVM_RISCV_SBI_EXT_MAX, }; +/* SBI STA extension registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ +struct kvm_riscv_sbi_sta { + unsigned long shmem_lo; + unsigned long shmem_hi; +}; + /* Possible states for kvm_riscv_timer */ #define KVM_RISCV_TIMER_STATE_OFF 0 #define KVM_RISCV_TIMER_STATE_ON 1 @@ -244,6 +250,9 @@ enum KVM_RISCV_SBI_EXT_ID { /* Registers for specific SBI extensions are mapped as type 10 */ #define KVM_REG_RISCV_SBI_STATE (0x0a << KVM_REG_RISCV_TYPE_SHIFT) +#define KVM_REG_RISCV_SBI_STA (0x0 << KVM_REG_RISCV_SUBTYPE_SHIFT) +#define KVM_REG_RISCV_SBI_STA_REG(name) \ + (offsetof(struct kvm_riscv_sbi_sta, name) / sizeof(unsigned long)) /* Device Control API: RISC-V AIA */ #define KVM_DEV_RISCV_APLIC_ALIGN 0x1000 diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index 143d0edd7f63..fc34557f5356 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -961,27 +961,36 @@ static unsigned long num_sbi_ext_regs(struct kvm_vcpu *vcpu) return copy_sbi_ext_reg_indices(vcpu, NULL); } -static inline unsigned long num_sbi_regs(struct kvm_vcpu *vcpu) -{ - return 0; -} - static int copy_sbi_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { - int n = num_sbi_regs(vcpu); + struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; + int total = 0; - for (int i = 0; i < n; i++) { - u64 reg = KVM_REG_RISCV | KVM_REG_SIZE_U64 | - KVM_REG_RISCV_SBI_STATE | i; + if (scontext->ext_status[KVM_RISCV_SBI_EXT_STA] == KVM_RISCV_SBI_EXT_STATUS_ENABLED) { + u64 size = IS_ENABLED(CONFIG_32BIT) ? 
KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; + int n = sizeof(struct kvm_riscv_sbi_sta) / sizeof(unsigned long); - if (uindices) { - if (put_user(reg, uindices)) - return -EFAULT; - uindices++; + for (int i = 0; i < n; i++) { + u64 reg = KVM_REG_RISCV | size | + KVM_REG_RISCV_SBI_STATE | + KVM_REG_RISCV_SBI_STA | i; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } } + + total += n; } - return n; + return total; +} + +static inline unsigned long num_sbi_regs(struct kvm_vcpu *vcpu) +{ + return copy_sbi_reg_indices(vcpu, NULL); } static inline unsigned long num_vector_regs(const struct kvm_vcpu *vcpu) diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index a1997c39dfde..72a2ffb8dcd1 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -345,6 +345,8 @@ int kvm_riscv_vcpu_set_reg_sbi(struct kvm_vcpu *vcpu, reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK; switch (reg_subtype) { + case KVM_REG_RISCV_SBI_STA: + return kvm_riscv_vcpu_set_reg_sbi_sta(vcpu, reg_num, reg_val); default: return -EINVAL; } @@ -370,6 +372,9 @@ int kvm_riscv_vcpu_get_reg_sbi(struct kvm_vcpu *vcpu, reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK; switch (reg_subtype) { + case KVM_REG_RISCV_SBI_STA: + ret = kvm_riscv_vcpu_get_reg_sbi_sta(vcpu, reg_num, ®_val); + break; default: return -EINVAL; } diff --git a/arch/riscv/kvm/vcpu_sbi_sta.c b/arch/riscv/kvm/vcpu_sbi_sta.c index 6592d287fc4e..87bf1a5f05ce 100644 --- a/arch/riscv/kvm/vcpu_sbi_sta.c +++ b/arch/riscv/kvm/vcpu_sbi_sta.c @@ -3,6 +3,8 @@ * Copyright (c) 2023 Ventana Micro Systems Inc. */ +#include +#include #include #include @@ -59,3 +61,56 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_sta = { .handler = kvm_sbi_ext_sta_handler, .probe = kvm_sbi_ext_sta_probe, }; + +int kvm_riscv_vcpu_get_reg_sbi_sta(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long *reg_val) +{ + switch (reg_num) { + case KVM_REG_RISCV_SBI_STA_REG(shmem_lo): + *reg_val = (unsigned long)vcpu->arch.sta.shmem; + break; + case KVM_REG_RISCV_SBI_STA_REG(shmem_hi): + if (IS_ENABLED(CONFIG_32BIT)) + *reg_val = upper_32_bits(vcpu->arch.sta.shmem); + else + *reg_val = 0; + break; + default: + return -EINVAL; + } + + return 0; +} + +int kvm_riscv_vcpu_set_reg_sbi_sta(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long reg_val) +{ + switch (reg_num) { + case KVM_REG_RISCV_SBI_STA_REG(shmem_lo): + if (IS_ENABLED(CONFIG_32BIT)) { + gpa_t hi = upper_32_bits(vcpu->arch.sta.shmem); + + vcpu->arch.sta.shmem = reg_val; + vcpu->arch.sta.shmem |= hi << 32; + } else { + vcpu->arch.sta.shmem = reg_val; + } + break; + case KVM_REG_RISCV_SBI_STA_REG(shmem_hi): + if (IS_ENABLED(CONFIG_32BIT)) { + gpa_t lo = lower_32_bits(vcpu->arch.sta.shmem); + + vcpu->arch.sta.shmem = ((gpa_t)reg_val << 32); + vcpu->arch.sta.shmem |= lo; + } else if (reg_val != 0) { + return -EINVAL; + } + break; + default: + return -EINVAL; + } + + return 0; +} -- cgit v1.2.3 From e9f12b5fff8ad0eefd0340273767d329ef65fd69 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 20 Dec 2023 17:00:22 +0100 Subject: RISC-V: KVM: Implement SBI STA extension Add a select SCHED_INFO to the KVM config in order to get run_delay info. Then implement SBI STA's set-steal-time-shmem function and kvm_riscv_vcpu_record_steal_time() to provide the steal-time info to guests. 
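For clarity, a simplified sketch of the publication protocol the record function below follows when updating the shared sbi_sta_struct (illustrative direct-memory version with explicit barriers; the in-tree code goes through get_user()/put_user() on the guest mapping): the sequence word is made odd while the steal value is rewritten and even again afterwards, so a guest that observes an odd or changing sequence retries its read.

#include <linux/types.h>
#include <asm/barrier.h>
#include <asm/byteorder.h>
#include <asm/sbi.h>

/* Illustrative only: writer side of the sequence protocol. */
static void sta_publish_steal(struct sbi_sta_struct *st, u64 new_steal)
{
	u32 seq = le32_to_cpu(st->sequence) + 1;

	st->sequence = cpu_to_le32(seq);	/* odd: update in progress */
	smp_wmb();
	st->steal = cpu_to_le64(new_steal);
	smp_wmb();
	st->sequence = cpu_to_le32(seq + 1);	/* even: consistent again */
}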
Reviewed-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/kvm/Kconfig | 1 + arch/riscv/kvm/vcpu_sbi_sta.c | 96 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 95 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index dfc237d7875b..148e52b516cf 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -32,6 +32,7 @@ config KVM select KVM_XFER_TO_GUEST_WORK select MMU_NOTIFIER select PREEMPT_NOTIFIERS + select SCHED_INFO help Support hosting virtualized guest machines. diff --git a/arch/riscv/kvm/vcpu_sbi_sta.c b/arch/riscv/kvm/vcpu_sbi_sta.c index 87bf1a5f05ce..01f09fe8c3b0 100644 --- a/arch/riscv/kvm/vcpu_sbi_sta.c +++ b/arch/riscv/kvm/vcpu_sbi_sta.c @@ -6,9 +6,15 @@ #include #include #include +#include +#include +#include +#include #include +#include #include +#include void kvm_riscv_vcpu_sbi_sta_reset(struct kvm_vcpu *vcpu) { @@ -19,14 +25,100 @@ void kvm_riscv_vcpu_sbi_sta_reset(struct kvm_vcpu *vcpu) void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu) { gpa_t shmem = vcpu->arch.sta.shmem; + u64 last_steal = vcpu->arch.sta.last_steal; + u32 *sequence_ptr, sequence; + u64 *steal_ptr, steal; + unsigned long hva; + gfn_t gfn; if (shmem == INVALID_GPA) return; + + /* + * shmem is 64-byte aligned (see the enforcement in + * kvm_sbi_sta_steal_time_set_shmem()) and the size of sbi_sta_struct + * is 64 bytes, so we know all its offsets are in the same page. + */ + gfn = shmem >> PAGE_SHIFT; + hva = kvm_vcpu_gfn_to_hva(vcpu, gfn); + + if (WARN_ON(kvm_is_error_hva(hva))) { + vcpu->arch.sta.shmem = INVALID_GPA; + return; + } + + sequence_ptr = (u32 *)(hva + offset_in_page(shmem) + + offsetof(struct sbi_sta_struct, sequence)); + steal_ptr = (u64 *)(hva + offset_in_page(shmem) + + offsetof(struct sbi_sta_struct, steal)); + + if (WARN_ON(get_user(sequence, sequence_ptr))) + return; + + sequence = le32_to_cpu(sequence); + sequence += 1; + + if (WARN_ON(put_user(cpu_to_le32(sequence), sequence_ptr))) + return; + + if (!WARN_ON(get_user(steal, steal_ptr))) { + steal = le64_to_cpu(steal); + vcpu->arch.sta.last_steal = READ_ONCE(current->sched_info.run_delay); + steal += vcpu->arch.sta.last_steal - last_steal; + WARN_ON(put_user(cpu_to_le64(steal), steal_ptr)); + } + + sequence += 1; + WARN_ON(put_user(cpu_to_le32(sequence), sequence_ptr)); + + kvm_vcpu_mark_page_dirty(vcpu, gfn); } static int kvm_sbi_sta_steal_time_set_shmem(struct kvm_vcpu *vcpu) { - return SBI_ERR_FAILURE; + struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + unsigned long shmem_phys_lo = cp->a0; + unsigned long shmem_phys_hi = cp->a1; + u32 flags = cp->a2; + struct sbi_sta_struct zero_sta = {0}; + unsigned long hva; + bool writable; + gpa_t shmem; + int ret; + + if (flags != 0) + return SBI_ERR_INVALID_PARAM; + + if (shmem_phys_lo == SBI_STA_SHMEM_DISABLE && + shmem_phys_hi == SBI_STA_SHMEM_DISABLE) { + vcpu->arch.sta.shmem = INVALID_GPA; + return 0; + } + + if (shmem_phys_lo & (SZ_64 - 1)) + return SBI_ERR_INVALID_PARAM; + + shmem = shmem_phys_lo; + + if (shmem_phys_hi != 0) { + if (IS_ENABLED(CONFIG_32BIT)) + shmem |= ((gpa_t)shmem_phys_hi << 32); + else + return SBI_ERR_INVALID_ADDRESS; + } + + hva = kvm_vcpu_gfn_to_hva_prot(vcpu, shmem >> PAGE_SHIFT, &writable); + if (kvm_is_error_hva(hva) || !writable) + return SBI_ERR_INVALID_ADDRESS; + + ret = kvm_vcpu_write_guest(vcpu, shmem, &zero_sta, sizeof(zero_sta)); + if (ret) + return SBI_ERR_FAILURE; + + vcpu->arch.sta.shmem = 
shmem; + vcpu->arch.sta.last_steal = current->sched_info.run_delay; + + return 0; } static int kvm_sbi_ext_sta_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, @@ -52,7 +144,7 @@ static int kvm_sbi_ext_sta_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, static unsigned long kvm_sbi_ext_sta_probe(struct kvm_vcpu *vcpu) { - return 0; + return !!sched_info_on(); } const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_sta = { -- cgit v1.2.3 From ad362fe07fecf0aba839ff2cc59a3617bd42c33f Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 4 Jan 2024 18:32:32 +0000 Subject: KVM: arm64: vgic-its: Avoid potential UAF in LPI translation cache There is a potential UAF scenario in the case of an LPI translation cache hit racing with an operation that invalidates the cache, such as a DISCARD ITS command. The root of the problem is that vgic_its_check_cache() does not elevate the refcount on the vgic_irq before dropping the lock that serializes refcount changes. Have vgic_its_check_cache() raise the refcount on the returned vgic_irq and add the corresponding decrement after queueing the interrupt. Cc: stable@vger.kernel.org Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240104183233.3560639-1-oliver.upton@linux.dev --- arch/arm64/kvm/vgic/vgic-its.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 2dad2d095160..e2764d0ffa9f 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -590,7 +590,11 @@ static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, unsigned long flags; raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + irq = __vgic_its_check_cache(dist, db, devid, eventid); + if (irq) + vgic_get_irq_kref(irq); + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); return irq; @@ -769,6 +773,7 @@ int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi) raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->pending_latch = true; vgic_queue_irq_unlock(kvm, irq, flags); + vgic_put_irq(kvm, irq); return 0; } -- cgit v1.2.3 From 040113fa32f27096f531c377001936e0d7964597 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 4 Jan 2024 16:42:20 +0000 Subject: KVM: arm64: Add missing memory barriers when switching to pKVM's hyp pgd In commit f320bc742bc23 ("KVM: arm64: Prepare the creation of s1 mappings at EL2"), pKVM switches from a temporary host-provided page-table to its own page-table at EL2. Since there is only a single TTBR for the nVHE hypervisor, this involves disabling and re-enabling the MMU in __pkvm_init_switch_pgd(). Unfortunately, the memory barriers here are not quite correct. Specifically: - A DSB is required to complete the TLB invalidation executed while the MMU is disabled. - An ISB is required to make the new TTBR value visible to the page-table walker before the MMU is enabled in the SCTLR. An earlier version of the patch actually got this correct: https://lore.kernel.org/lkml/20210304184717.GB21795@willie-the-truck/ but thanks to some badly worded review comments from yours truly, these were dropped for the version that was eventually merged. Bring back the barriers and fix the potential issue (but note that this was found by code inspection). 
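To make the required ordering concrete, here is a hypothetical C/inline-asm sketch of the sequence (the real fix is the two instructions added to __pkvm_init_switch_pgd in the hyp-init.S diff below; the function name and arguments are illustrative, and the sketch assumes the EL2 MMU has already been disabled):

	#include <linux/types.h>	/* u64 */

	/* Sketch only: stale-TLB drain, TTBR switch, then MMU re-enable. */
	static inline void switch_el2_pgd_sketch(u64 new_ttbr0, u64 sctlr_mmu_on)
	{
		asm volatile(
		"	tlbi	alle2\n"		/* drop stale EL2 translations */
		"	dsb	nsh\n"			/* (1) TLB invalidation must complete first */
		"	msr	ttbr0_el2, %0\n"	/* install the new page-table base */
		"	isb\n"				/* (2) new TTBR visible to the walker ... */
		"	msr	sctlr_el2, %1\n"	/* ... before the MMU is turned back on */
		"	isb\n"
		: : "r" (new_ttbr0), "r" (sctlr_mmu_on) : "memory");
	}
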
Cc: Quentin Perret Fixes: f320bc742bc23 ("KVM: arm64: Prepare the creation of s1 mappings at EL2") Signed-off-by: Will Deacon Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240104164220.7968-1-will@kernel.org --- arch/arm64/kvm/hyp/nvhe/hyp-init.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index f62a7d360285..2994878d68ea 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -288,6 +288,8 @@ alternative_else_nop_endif mov sp, x0 /* And turn the MMU back on! */ + dsb nsh + isb set_sctlr_el2 x2 ret x1 SYM_FUNC_END(__pkvm_init_switch_pgd) -- cgit v1.2.3 From caadf876bb7449bf25ef817afe7fb881df8198a2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Jan 2024 11:15:07 -0500 Subject: KVM: introduce CONFIG_KVM_COMMON CONFIG_HAVE_KVM is currently used by some architectures to either enabled the KVM config proper, or to enable host-side code that is not part of the KVM module. However, CONFIG_KVM's "select" statement in virt/kvm/Kconfig corresponds to a third meaning, namely to enable common Kconfigs required by all architectures that support KVM. These three meanings can be replaced respectively by an architecture-specific Kconfig, by IS_ENABLED(CONFIG_KVM), or by a new Kconfig symbol that is in turn selected by the architecture-specific "config KVM". Start by introducing such a new Kconfig symbol, CONFIG_KVM_COMMON. Unlike CONFIG_HAVE_KVM, it is selected by CONFIG_KVM, not by architecture code, and it brings in all dependencies of common KVM code. In particular, INTERVAL_TREE was missing in loongarch and riscv, so that is another thing that is fixed. Fixes: 8132d887a702 ("KVM: remove CONFIG_HAVE_KVM_EVENTFD", 2023-12-08) Reported-by: Randy Dunlap Closes: https://lore.kernel.org/all/44907c6b-c5bd-4e4a-a921-e4d3825539d8@infradead.org/ Reviewed-by: Andrew Jones Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/Kconfig | 3 +-- arch/loongarch/kvm/Kconfig | 2 +- arch/mips/kvm/Kconfig | 3 +-- arch/powerpc/kvm/Kconfig | 3 +-- arch/riscv/kvm/Kconfig | 2 +- arch/s390/kvm/Kconfig | 3 +-- arch/x86/kvm/Kconfig | 3 +-- virt/kvm/Kconfig | 5 +++++ 8 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index b07c60c9737d..6c3c8ca73e7f 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -21,9 +21,9 @@ if VIRTUALIZATION menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM + select KVM_COMMON select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_MMU_NOTIFIER - select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO select KVM_GENERIC_DIRTYLOG_READ_PROTECT @@ -39,7 +39,6 @@ menuconfig KVM select HAVE_KVM_VCPU_RUN_PID_CHANGE select SCHED_INFO select GUEST_PERF_EVENTS if PERF_EVENTS - select INTERVAL_TREE select XARRAY_MULTI help Support hosting virtualized guest machines. 
diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index daba4cd5e87d..61f7e33b1f95 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -23,12 +23,12 @@ config KVM depends on HAVE_KVM select HAVE_KVM_DIRTY_RING_ACQ_REL select HAVE_KVM_VCPU_ASYNC_IOCTL + select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_MMU_NOTIFIER select KVM_MMIO select KVM_XFER_TO_GUEST_WORK - select PREEMPT_NOTIFIERS help Support hosting virtualized guest machines using hardware virtualization extensions. You will need diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 428141b0b48f..18e7a17d5115 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -20,12 +20,11 @@ config KVM depends on HAVE_KVM depends on MIPS_FP_SUPPORT select EXPORT_UASM - select PREEMPT_NOTIFIERS + select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO select KVM_GENERIC_MMU_NOTIFIER - select INTERVAL_TREE select KVM_GENERIC_HARDWARE_ENABLING help Support for hosting Guest kernels. diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index b47196085a42..074263429faf 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -19,12 +19,11 @@ if VIRTUALIZATION config KVM bool - select PREEMPT_NOTIFIERS + select KVM_COMMON select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_VFIO select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS - select INTERVAL_TREE config KVM_BOOK3S_HANDLER bool diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index 1fd76aee3b71..d490db943858 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -24,12 +24,12 @@ config KVM select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_MSI select HAVE_KVM_VCPU_ASYNC_IOCTL + select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING select KVM_MMIO select KVM_XFER_TO_GUEST_WORK select KVM_GENERIC_MMU_NOTIFIER - select PREEMPT_NOTIFIERS select SCHED_INFO help Support hosting virtualized guest machines. 
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index bb6d90351119..72e9b7dcdf7d 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -20,17 +20,16 @@ config KVM def_tristate y prompt "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM - select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC + select KVM_COMMON select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_INVALID_WAKEUPS select HAVE_KVM_NO_POLL select KVM_VFIO - select INTERVAL_TREE select MMU_NOTIFIER help Support hosting paravirtualized guest machines using the SIE diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index b07247b0b958..cce3dea27920 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -23,7 +23,7 @@ config KVM depends on HAVE_KVM depends on HIGH_RES_TIMERS depends on X86_LOCAL_APIC - select PREEMPT_NOTIFIERS + select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE @@ -44,7 +44,6 @@ config KVM select KVM_XFER_TO_GUEST_WORK select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO - select INTERVAL_TREE select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING help diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 6793211a0b64..ace72be98fb2 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -3,7 +3,12 @@ config HAVE_KVM bool + +config KVM_COMMON + bool select EVENTFD + select INTERVAL_TREE + select PREEMPT_NOTIFIERS config HAVE_KVM_PFNCACHE bool -- cgit v1.2.3 From 783288010035e4c250a0b6491a4642cdb8d30548 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 8 Jan 2024 07:51:26 -0500 Subject: KVM: x86: add missing "depends on KVM" Support for KVM software-protected VMs should not be configurable, if KVM is not available at all. Fixes: 89ea60c2c7b5 ("KVM: x86: Add support for "protected VMs" that can utilize private memory") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index cce3dea27920..10c56603cc06 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -77,7 +77,7 @@ config KVM_WERROR config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT - depends on X86_64 + depends on KVM && X86_64 select KVM_GENERIC_PRIVATE_MEM help Enable support for KVM software-protected VMs. Currently "protected" -- cgit v1.2.3 From 1c6d984f523f67ecfad1083bb04c55d91977bb15 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 5 Dec 2023 03:45:01 +0300 Subject: x86/kvm: Do not try to disable kvmclock if it was not enabled kvm_guest_cpu_offline() tries to disable kvmclock regardless if it is present in the VM. It leads to write to a MSR that doesn't exist on some configurations, namely in TDX guest: unchecked MSR access error: WRMSR to 0x12 (tried to write 0x0000000000000000) at rIP: 0xffffffff8110687c (kvmclock_disable+0x1c/0x30) kvmclock enabling is gated by CLOCKSOURCE and CLOCKSOURCE2 KVM paravirt features. Do not disable kvmclock if it was not enabled. Signed-off-by: Kirill A. 
Shutemov Fixes: c02027b5742b ("x86/kvm: Disable kvmclock on all CPUs on shutdown") Reviewed-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Cc: Paolo Bonzini Cc: Wanpeng Li Cc: stable@vger.kernel.org Message-Id: <20231205004510.27164-6-kirill.shutemov@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index fb8f52149be9..f2fff625576d 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -24,8 +24,8 @@ static int kvmclock __initdata = 1; static int kvmclock_vsyscall __initdata = 1; -static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME; -static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK; +static int msr_kvm_system_time __ro_after_init; +static int msr_kvm_wall_clock __ro_after_init; static u64 kvm_sched_clock_offset __ro_after_init; static int __init parse_no_kvmclock(char *arg) @@ -195,7 +195,8 @@ static void kvm_setup_secondary_clock(void) void kvmclock_disable(void) { - native_write_msr(msr_kvm_system_time, 0, 0); + if (msr_kvm_system_time) + native_write_msr(msr_kvm_system_time, 0, 0); } static void __init kvmclock_init_mem(void) @@ -294,7 +295,10 @@ void __init kvmclock_init(void) if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { + } else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; + } else { return; } -- cgit v1.2.3
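Condensed into a standalone sketch, the fixed flow looks roughly like the following (the feature flags, MSR constants and native_write_msr() are the real x86 symbols used in the diff above, but the two functions are illustrative stand-ins for kvmclock_init()/kvmclock_disable(), not the kernel code itself): the MSR pair is selected only when the corresponding paravirt feature is advertised, and the disable path writes nothing when no kvmclock MSR was ever chosen.

	#include <asm/kvm_para.h>	/* kvm_para_has_feature(), KVM_FEATURE_*, MSR_KVM_* */
	#include <asm/msr.h>		/* native_write_msr() */

	static unsigned int msr_system_time;	/* 0 <=> kvmclock was never enabled */

	static void pick_kvmclock_msr_sketch(void)
	{
		if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2))
			msr_system_time = MSR_KVM_SYSTEM_TIME_NEW;
		else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))
			msr_system_time = MSR_KVM_SYSTEM_TIME;
		/* else: leave it 0, e.g. a TDX guest with no kvmclock MSRs */
	}

	static void kvmclock_disable_sketch(void)
	{
		if (msr_system_time)	/* the guard the fix adds */
			native_write_msr(msr_system_time, 0, 0);
	}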