From bbbaf8cd1735770342aad4c6624a147ae321599b Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 4 Nov 2022 15:47:05 +0100 Subject: KVM: nVMX: Sanitize primary processor-based VM-execution controls with eVMCS too The only unsupported primary processor-based VM-execution control at the moment is CPU_BASED_ACTIVATE_TERTIARY_CONTROLS and KVM doesn't expose it in nested VMX feature MSRs anyway (see nested_vmx_setup_ctls_msrs()) but in preparation to inverting "unsupported with eVMCS" checks (and for completeness) it's better to sanitize MSR_IA32_VMX_PROCBASED_CTLS/ MSR_IA32_VMX_TRUE_PROCBASED_CTLS too. No functional change intended. Signed-off-by: Vitaly Kuznetsov Message-Id: <20221104144708.435865-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/hyperv.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index ae03d1fe0355..04ea0259ab7f 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -361,6 +361,7 @@ enum evmcs_revision { enum evmcs_ctrl_type { EVMCS_EXIT_CTRLS, EVMCS_ENTRY_CTRLS, + EVMCS_EXEC_CTRL, EVMCS_2NDEXEC, EVMCS_PINCTRL, EVMCS_VMFUNC, @@ -374,6 +375,9 @@ static const u32 evmcs_unsupported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = { [EVMCS_ENTRY_CTRLS] = { [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMENTRY_CTRL, }, + [EVMCS_EXEC_CTRL] = { + [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_EXEC_CTRL, + }, [EVMCS_2NDEXEC] = { [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_2NDEXEC, }, @@ -434,6 +438,10 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 * unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; ctl_high &= ~unsupported_ctrls; break; + case MSR_IA32_VMX_PROCBASED_CTLS: + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_EXEC_CTRL); + break; case MSR_IA32_VMX_PROCBASED_CTLS2: ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_2NDEXEC); break; @@ -461,6 +469,10 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12) vmcs12->pin_based_vm_exec_control))) return -EINVAL; + if (CC(!nested_evmcs_is_valid_controls(EVMCS_EXEC_CTRL, + vmcs12->cpu_based_vm_exec_control))) + return -EINVAL; + if (CC(!nested_evmcs_is_valid_controls(EVMCS_2NDEXEC, vmcs12->secondary_vm_exec_control))) return -EINVAL; -- cgit v1.2.3 From 70b31e50fb5f6bb8addac1c4e5004c6af377ab68 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 4 Nov 2022 15:47:06 +0100 Subject: KVM: nVMX: Invert 'unsupported by eVMCSv1' check When a new feature gets implemented in KVM, EVMCS1_UNSUPPORTED_* defines need to be adjusted to avoid the situation when the feature is exposed to the guest but there's no corresponding eVMCS field[s] for it. This is not obvious and fragile. Invert 'unsupported by eVMCSv1' check and make it 'supported by eVMCSv1' instead, this way it's much harder to make a mistake. New features will get added to EVMCS1_SUPPORTED_* defines when the corresponding fields are added to eVMCS definition. No functional change intended. EVMCS1_SUPPORTED_* defines are composed by taking KVM_{REQUIRED,OPTIONAL}_VMX_ defines and filtering out what was previously known as EVMCS1_UNSUPPORTED_*. From all the controls, SECONDARY_EXEC_TSC_SCALING requires special handling as it's actually present in eVMCSv1 definition but is not currently supported for Hyper-V-on-KVM, just for KVM-on-Hyper-V. As evmcs_supported_ctrls will be used for both scenarios, just add it there instead of EVMCS1_SUPPORTED_2NDEXEC. Signed-off-by: Vitaly Kuznetsov Message-Id: <20221104144708.435865-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/hyperv.c | 42 +++++++++++----------- arch/x86/kvm/vmx/hyperv.h | 90 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 96 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index 04ea0259ab7f..77027f6a9b41 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -368,32 +368,32 @@ enum evmcs_ctrl_type { NR_EVMCS_CTRLS, }; -static const u32 evmcs_unsupported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = { +static const u32 evmcs_supported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = { [EVMCS_EXIT_CTRLS] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMEXIT_CTRL, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMEXIT_CTRL, }, [EVMCS_ENTRY_CTRLS] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMENTRY_CTRL, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMENTRY_CTRL, }, [EVMCS_EXEC_CTRL] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_EXEC_CTRL, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_EXEC_CTRL, }, [EVMCS_2NDEXEC] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_2NDEXEC, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_2NDEXEC & ~SECONDARY_EXEC_TSC_SCALING, }, [EVMCS_PINCTRL] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_PINCTRL, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_PINCTRL, }, [EVMCS_VMFUNC] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMFUNC, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMFUNC, }, }; -static u32 evmcs_get_unsupported_ctls(enum evmcs_ctrl_type ctrl_type) +static u32 evmcs_get_supported_ctls(enum evmcs_ctrl_type ctrl_type) { enum evmcs_revision evmcs_rev = EVMCSv1_LEGACY; - return evmcs_unsupported_ctrls[ctrl_type][evmcs_rev]; + return evmcs_supported_ctrls[ctrl_type][evmcs_rev]; } static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu) @@ -417,7 +417,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 * { u32 ctl_low = (u32)*pdata; u32 ctl_high = (u32)(*pdata >> 32); - u32 unsupported_ctrls; + u32 supported_ctrls; /* * Hyper-V 2016 and 2019 try using these features even when eVMCS @@ -426,31 +426,31 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 * switch (msr_index) { case MSR_IA32_VMX_EXIT_CTLS: case MSR_IA32_VMX_TRUE_EXIT_CTLS: - unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_EXIT_CTRLS); + supported_ctrls = evmcs_get_supported_ctls(EVMCS_EXIT_CTRLS); if (!evmcs_has_perf_global_ctrl(vcpu)) - unsupported_ctrls |= VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - ctl_high &= ~unsupported_ctrls; + supported_ctrls &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= supported_ctrls; break; case MSR_IA32_VMX_ENTRY_CTLS: case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_ENTRY_CTRLS); + supported_ctrls = evmcs_get_supported_ctls(EVMCS_ENTRY_CTRLS); if (!evmcs_has_perf_global_ctrl(vcpu)) - unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - ctl_high &= ~unsupported_ctrls; + supported_ctrls &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= supported_ctrls; break; case MSR_IA32_VMX_PROCBASED_CTLS: case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_EXEC_CTRL); + ctl_high &= evmcs_get_supported_ctls(EVMCS_EXEC_CTRL); break; case MSR_IA32_VMX_PROCBASED_CTLS2: - ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_2NDEXEC); + ctl_high &= evmcs_get_supported_ctls(EVMCS_2NDEXEC); break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: - ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_PINCTRL); + ctl_high &= evmcs_get_supported_ctls(EVMCS_PINCTRL); break; case MSR_IA32_VMX_VMFUNC: - ctl_low &= ~evmcs_get_unsupported_ctls(EVMCS_VMFUNC); + ctl_low &= evmcs_get_supported_ctls(EVMCS_VMFUNC); break; } @@ -460,7 +460,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 * static bool nested_evmcs_is_valid_controls(enum evmcs_ctrl_type ctrl_type, u32 val) { - return !(val & evmcs_get_unsupported_ctls(ctrl_type)); + return !(val & ~evmcs_get_supported_ctls(ctrl_type)); } int nested_evmcs_check_controls(struct vmcs12 *vmcs12) diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 571e7929d14e..a9da47c69a2b 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -48,22 +48,82 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); * Currently unsupported in KVM: * GUEST_IA32_RTIT_CTL = 0x00002814, */ -#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ - PIN_BASED_VMX_PREEMPTION_TIMER) -#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) -#define EVMCS1_UNSUPPORTED_2NDEXEC \ - (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ - SECONDARY_EXEC_APIC_REGISTER_VIRT | \ - SECONDARY_EXEC_ENABLE_PML | \ - SECONDARY_EXEC_ENABLE_VMFUNC | \ - SECONDARY_EXEC_SHADOW_VMCS | \ +#define EVMCS1_SUPPORTED_PINCTRL \ + (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + PIN_BASED_EXT_INTR_MASK | \ + PIN_BASED_NMI_EXITING | \ + PIN_BASED_VIRTUAL_NMIS) + +#define EVMCS1_SUPPORTED_EXEC_CTRL \ + (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + CPU_BASED_HLT_EXITING | \ + CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING | \ + CPU_BASED_UNCOND_IO_EXITING | \ + CPU_BASED_MOV_DR_EXITING | \ + CPU_BASED_USE_TSC_OFFSETTING | \ + CPU_BASED_MWAIT_EXITING | \ + CPU_BASED_MONITOR_EXITING | \ + CPU_BASED_INVLPG_EXITING | \ + CPU_BASED_RDPMC_EXITING | \ + CPU_BASED_INTR_WINDOW_EXITING | \ + CPU_BASED_CR8_LOAD_EXITING | \ + CPU_BASED_CR8_STORE_EXITING | \ + CPU_BASED_RDTSC_EXITING | \ + CPU_BASED_TPR_SHADOW | \ + CPU_BASED_USE_IO_BITMAPS | \ + CPU_BASED_MONITOR_TRAP_FLAG | \ + CPU_BASED_USE_MSR_BITMAPS | \ + CPU_BASED_NMI_WINDOW_EXITING | \ + CPU_BASED_PAUSE_EXITING | \ + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) + +#define EVMCS1_SUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ + SECONDARY_EXEC_WBINVD_EXITING | \ + SECONDARY_EXEC_ENABLE_VPID | \ + SECONDARY_EXEC_ENABLE_EPT | \ + SECONDARY_EXEC_UNRESTRICTED_GUEST | \ + SECONDARY_EXEC_DESC | \ + SECONDARY_EXEC_ENABLE_RDTSCP | \ + SECONDARY_EXEC_ENABLE_INVPCID | \ + SECONDARY_EXEC_XSAVES | \ + SECONDARY_EXEC_RDSEED_EXITING | \ + SECONDARY_EXEC_RDRAND_EXITING | \ SECONDARY_EXEC_TSC_SCALING | \ - SECONDARY_EXEC_PAUSE_LOOP_EXITING) -#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \ - (VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) -#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (0) -#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ + SECONDARY_EXEC_PT_USE_GPA | \ + SECONDARY_EXEC_PT_CONCEAL_VMX | \ + SECONDARY_EXEC_BUS_LOCK_DETECTION | \ + SECONDARY_EXEC_NOTIFY_VM_EXITING | \ + SECONDARY_EXEC_ENCLS_EXITING) + +#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ + (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_ACK_INTR_ON_EXIT | \ + VM_EXIT_HOST_ADDR_SPACE_SIZE | \ + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_IA32_PAT | \ + VM_EXIT_LOAD_IA32_PAT | \ + VM_EXIT_SAVE_IA32_EFER | \ + VM_EXIT_LOAD_IA32_EFER | \ + VM_EXIT_CLEAR_BNDCFGS | \ + VM_EXIT_PT_CONCEAL_PIP | \ + VM_EXIT_CLEAR_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_IA32E_MODE | \ + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_ENTRY_LOAD_IA32_PAT | \ + VM_ENTRY_LOAD_IA32_EFER | \ + VM_ENTRY_LOAD_BNDCFGS | \ + VM_ENTRY_PT_CONCEAL_PIP | \ + VM_ENTRY_LOAD_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMFUNC (0) struct evmcs_field { u16 offset; -- cgit v1.2.3 From 746b1833919eed3dd92c82466316471dfaba319e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 4 Nov 2022 15:47:07 +0100 Subject: KVM: nVMX: Prepare to sanitize tertiary execution controls with eVMCS In preparation to restoring vmcs_conf sanitization for KVM-on-Hyper-V, (and for completeness) add tertiary VM-execution controls to 'evmcs_supported_ctrls'. No functional change intended as KVM doesn't yet expose MSR_IA32_VMX_PROCBASED_CTLS3 to its guests. Signed-off-by: Vitaly Kuznetsov Message-Id: <20221104144708.435865-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/hyperv.c | 4 ++++ arch/x86/kvm/vmx/hyperv.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index 77027f6a9b41..a5cbd029c07b 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -363,6 +363,7 @@ enum evmcs_ctrl_type { EVMCS_ENTRY_CTRLS, EVMCS_EXEC_CTRL, EVMCS_2NDEXEC, + EVMCS_3RDEXEC, EVMCS_PINCTRL, EVMCS_VMFUNC, NR_EVMCS_CTRLS, @@ -381,6 +382,9 @@ static const u32 evmcs_supported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = { [EVMCS_2NDEXEC] = { [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_2NDEXEC & ~SECONDARY_EXEC_TSC_SCALING, }, + [EVMCS_3RDEXEC] = { + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_3RDEXEC, + }, [EVMCS_PINCTRL] = { [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_PINCTRL, }, diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index a9da47c69a2b..3edbbd0991dc 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -98,6 +98,8 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); SECONDARY_EXEC_NOTIFY_VM_EXITING | \ SECONDARY_EXEC_ENCLS_EXITING) +#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) + #define EVMCS1_SUPPORTED_VMEXIT_CTRL \ (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ VM_EXIT_SAVE_DEBUG_CONTROLS | \ -- cgit v1.2.3 From 1567037614af5f078c2361ca21d9b1a1cd42b8aa Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 4 Nov 2022 15:47:08 +0100 Subject: KVM: VMX: Resurrect vmcs_conf sanitization for KVM-on-Hyper-V Commit 9bcb90650e31 ("KVM: VMX: Get rid of eVMCS specific VMX controls sanitization") dropped 'vmcs_conf' sanitization for KVM-on-Hyper-V because there's no known Hyper-V version which would expose a feature unsupported in eVMCS in VMX feature MSRs. This works well for all currently existing Hyper-V version, however, future Hyper-V versions may add features which are supported by KVM and are currently missing in eVMCSv1 definition (e.g. APIC virtualization, PML,...). When this happens, existing KVMs will get broken. With the inverted 'unsupported by eVMCSv1' checks, we can resurrect vmcs_conf sanitization and make KVM future proof. Signed-off-by: Vitaly Kuznetsov Message-Id: <20221104144708.435865-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/hyperv.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/hyperv.h | 1 + arch/x86/kvm/vmx/vmx.c | 5 +++++ 3 files changed, 40 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index a5cbd029c07b..f773450fba6e 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "kvm/hyper-v: " fmt + #include #include @@ -504,6 +506,38 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12) return 0; } +#if IS_ENABLED(CONFIG_HYPERV) +/* + * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption + * is: in case a feature has corresponding fields in eVMCS described and it was + * exposed in VMX feature MSRs, KVM is free to use it. Warn if KVM meets a + * feature which has no corresponding eVMCS field, this likely means that KVM + * needs to be updated. + */ +#define evmcs_check_vmcs_conf(field, ctrl) \ + do { \ + typeof(vmcs_conf->field) unsupported; \ + \ + unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \ + if (unsupported) { \ + pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\ + (u64)unsupported); \ + vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \ + } \ + } \ + while (0) + +__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) +{ + evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL); + evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL); + evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC); + evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC); + evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL); + evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL); +} +#endif + int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 3edbbd0991dc..883102d567c3 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -273,6 +273,7 @@ static inline void evmcs_load(u64 phys_addr) vp_ap->enlighten_vmentry = 1; } +__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ static __always_inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fc9008dbed33..0220e22b89ca 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2752,6 +2752,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->vmentry_ctrl = _vmentry_control; vmcs_conf->misc = misc_msr; +#if IS_ENABLED(CONFIG_HYPERV) + if (enlightened_vmcs) + evmcs_sanitize_exec_ctrls(vmcs_conf); +#endif + return 0; } -- cgit v1.2.3 From 641e6808586d76b477c00de804358982b95c8d50 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 12 Dec 2022 17:01:06 +0800 Subject: kvm: x86/mmu: Warn on linking when sp->unsync_children Since the commit 65855ed8b034 ("KVM: X86: Synchronize the shadow pagetable before link it"), no sp would be linked with sp->unsync_children = 1. So make it WARN if it is the case. Signed-off-by: Lai Jiangshan Message-Id: <20221212090106.378206-1-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4736d7849c60..26b32ae6b526 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2355,7 +2355,16 @@ static void __link_shadow_page(struct kvm *kvm, mmu_page_add_parent_pte(cache, sp, sptep); - if (sp->unsync_children || sp->unsync) + /* + * The non-direct sub-pagetable must be updated before linking. For + * L1 sp, the pagetable is updated via kvm_sync_page() in + * kvm_mmu_find_shadow_page() without write-protecting the gfn, + * so sp->unsync can be true or false. For higher level non-direct + * sp, the pagetable is updated/synced via mmu_sync_children() in + * FNAME(fetch)(), so sp->unsync_children can only be false. + * WARN_ON_ONCE() if anything happens unexpectedly. + */ + if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync) mark_unsync(sptep); } -- cgit v1.2.3 From 3af15ff47c4df3af7b36ea8315f43c6b0af49253 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:37 -0700 Subject: KVM: x86/mmu: Change tdp_mmu to a read-only parameter Change tdp_mmu to a read-only parameter and drop the per-vm tdp_mmu_enabled. For 32-bit KVM, make tdp_mmu_enabled a macro that is always false so that the compiler can continue omitting cals to the TDP MMU. The TDP MMU was introduced in 5.10 and has been enabled by default since 5.15. At this point there are no known functionality gaps between the TDP MMU and the shadow MMU, and the TDP MMU uses less memory and scales better with the number of vCPUs. In other words, there is no good reason to disable the TDP MMU on a live system. Purposely do not drop tdp_mmu=N support (i.e. do not force 64-bit KVM to always use the TDP MMU) since tdp_mmu=N is still used to get test coverage of KVM's shadow MMU TDP support, which is used in 32-bit KVM. Signed-off-by: David Matlack Reviewed-by: Kai Huang Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 13 ++--------- arch/x86/kvm/mmu.h | 6 ++--- arch/x86/kvm/mmu/mmu.c | 51 +++++++++++++++++++++++++++-------------- arch/x86/kvm/mmu/tdp_mmu.c | 9 ++------ 4 files changed, 41 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f35f1ff4427b..aa4eb8cfcd7e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1341,21 +1341,12 @@ struct kvm_arch { struct task_struct *nx_huge_page_recovery_thread; #ifdef CONFIG_X86_64 - /* - * Whether the TDP MMU is enabled for this VM. This contains a - * snapshot of the TDP MMU module parameter from when the VM was - * created and remains unchanged for the life of the VM. If this is - * true, TDP MMU handler functions will run for various MMU - * operations. - */ - bool tdp_mmu_enabled; - /* The number of TDP MMU pages across all roots. */ atomic64_t tdp_mmu_pages; /* - * List of kvm_mmu_page structs being used as roots. - * All kvm_mmu_page structs in the list should have + * List of struct kvm_mmu_pages being used as roots. + * All struct kvm_mmu_pages in the list should have * tdp_mmu_page set. * * For reads, this list is protected by: diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 6bdaacb6faa0..168c46fd8dd1 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -230,14 +230,14 @@ static inline bool kvm_shadow_root_allocated(struct kvm *kvm) } #ifdef CONFIG_X86_64 -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } +extern bool tdp_mmu_enabled; #else -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +#define tdp_mmu_enabled false #endif static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) { - return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm); + return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm); } static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 26b32ae6b526..bd44e5682a64 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -99,6 +99,13 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); */ bool tdp_enabled = false; +bool __ro_after_init tdp_mmu_allowed; + +#ifdef CONFIG_X86_64 +bool __read_mostly tdp_mmu_enabled = true; +module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); +#endif + static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; @@ -1279,7 +1286,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); @@ -1312,7 +1319,7 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); @@ -1395,7 +1402,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, } } - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) write_protected |= kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); @@ -1558,7 +1565,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); return flush; @@ -1571,7 +1578,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); return flush; @@ -1646,7 +1653,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); return young; @@ -1659,7 +1666,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); return young; @@ -3604,7 +3611,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) if (r < 0) goto out_unlock; - if (is_tdp_mmu_enabled(vcpu->kvm)) { + if (tdp_mmu_enabled) { root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); mmu->root.hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { @@ -5727,6 +5734,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, tdp_root_level = tdp_forced_root_level; max_tdp_level = tdp_max_root_level; +#ifdef CONFIG_X86_64 + tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled; +#endif /* * max_huge_page_level reflects KVM's MMU capabilities irrespective * of kernel support, e.g. KVM may be capable of using 1GB pages when @@ -5974,7 +5984,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) * write and in the same critical section as making the reload request, * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield. */ - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_invalidate_all_roots(kvm); /* @@ -5999,7 +6009,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) * Deferring the zap until the final reference to the root is put would * lead to use-after-free. */ - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_zap_invalidated_roots(kvm); } @@ -6111,7 +6121,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start, gfn_end, true, flush); @@ -6144,7 +6154,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, write_unlock(&kvm->mmu_lock); } - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); read_unlock(&kvm->mmu_lock); @@ -6387,7 +6397,7 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, u64 start, u64 end, int target_level) { - if (!is_tdp_mmu_enabled(kvm)) + if (!tdp_mmu_enabled) return; if (kvm_memslots_have_rmaps(kvm)) @@ -6408,7 +6418,7 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, u64 start = memslot->base_gfn; u64 end = start + memslot->npages; - if (!is_tdp_mmu_enabled(kvm)) + if (!tdp_mmu_enabled) return; if (kvm_memslots_have_rmaps(kvm)) { @@ -6491,7 +6501,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, write_unlock(&kvm->mmu_lock); } - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); read_unlock(&kvm->mmu_lock); @@ -6526,7 +6536,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, write_unlock(&kvm->mmu_lock); } - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); read_unlock(&kvm->mmu_lock); @@ -6561,7 +6571,7 @@ restart: kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_zap_all(kvm); write_unlock(&kvm->mmu_lock); @@ -6726,6 +6736,13 @@ void __init kvm_mmu_x86_module_init(void) if (nx_huge_pages == -1) __set_nx_huge_pages(get_nx_auto_mode()); + /* + * Snapshot userspace's desire to enable the TDP MMU. Whether or not the + * TDP MMU is actually enabled is determined in kvm_configure_mmu() + * when the vendor module is loaded. + */ + tdp_mmu_allowed = tdp_mmu_enabled; + kvm_mmu_spte_module_init(); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d6df38d371a0..03511e83050f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -10,23 +10,18 @@ #include #include -static bool __read_mostly tdp_mmu_enabled = true; -module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); - /* Initializes the TDP MMU for the VM, if enabled. */ int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { struct workqueue_struct *wq; - if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) + if (!tdp_mmu_enabled) return 0; wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); if (!wq) return -ENOMEM; - /* This should not be changed for the lifetime of the VM. */ - kvm->arch.tdp_mmu_enabled = true; INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); kvm->arch.tdp_mmu_zap_wq = wq; @@ -47,7 +42,7 @@ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) { - if (!kvm->arch.tdp_mmu_enabled) + if (!tdp_mmu_enabled) return; /* Also waits for any queued work items. */ -- cgit v1.2.3 From 991c8047b740f192a057d5f22df2f91f087cdb72 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:38 -0700 Subject: KVM: x86/mmu: Move TDP MMU VM init/uninit behind tdp_mmu_enabled Move kvm_mmu_{init,uninit}_tdp_mmu() behind tdp_mmu_enabled. This makes these functions consistent with the rest of the calls into the TDP MMU from mmu.c, and which is now possible since tdp_mmu_enabled is only modified when the x86 vendor module is loaded. i.e. It will never change during the lifetime of a VM. This change also enabled removing the stub definitions for 32-bit KVM, as the compiler will just optimize the calls out like it does for all the other TDP MMU functions. No functional change intended. Signed-off-by: David Matlack Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 11 +++++++---- arch/x86/kvm/mmu/tdp_mmu.c | 6 ------ arch/x86/kvm/mmu/tdp_mmu.h | 7 +++---- 3 files changed, 10 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index bd44e5682a64..31bdf93711a2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6035,9 +6035,11 @@ int kvm_mmu_init_vm(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); - r = kvm_mmu_init_tdp_mmu(kvm); - if (r < 0) - return r; + if (tdp_mmu_enabled) { + r = kvm_mmu_init_tdp_mmu(kvm); + if (r < 0) + return r; + } node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; @@ -6067,7 +6069,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_page_track_unregister_notifier(kvm, node); - kvm_mmu_uninit_tdp_mmu(kvm); + if (tdp_mmu_enabled) + kvm_mmu_uninit_tdp_mmu(kvm); mmu_free_vm_memory_caches(kvm); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 03511e83050f..7e5952e95d3b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -15,9 +15,6 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { struct workqueue_struct *wq; - if (!tdp_mmu_enabled) - return 0; - wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); if (!wq) return -ENOMEM; @@ -42,9 +39,6 @@ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) { - if (!tdp_mmu_enabled) - return; - /* Also waits for any queued work items. */ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index d3714200b932..e4ab2dac269d 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -7,6 +7,9 @@ #include "spte.h" +int kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); + hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) @@ -68,8 +71,6 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, u64 *spte); #ifdef CONFIG_X86_64 -int kvm_mmu_init_tdp_mmu(struct kvm *kvm); -void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } static inline bool is_tdp_mmu(struct kvm_mmu *mmu) @@ -89,8 +90,6 @@ static inline bool is_tdp_mmu(struct kvm_mmu *mmu) return sp && is_tdp_mmu_page(sp) && sp->root_count; } #else -static inline int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return 0; } -static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } #endif -- cgit v1.2.3 From 90c54c19f8021d9d284055dc246d605b559cdc22 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:39 -0700 Subject: KVM: x86/mmu: Grab mmu_invalidate_seq in kvm_faultin_pfn() Grab mmu_invalidate_seq in kvm_faultin_pfn() and stash it in struct kvm_page_fault. The eliminates duplicate code and reduces the amount of parameters needed for is_page_fault_stale(). Preemptively split out __kvm_faultin_pfn() to a separate function for use in subsequent commits. No functional change intended. Signed-off-by: David Matlack Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-4-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 21 ++++++++++++--------- arch/x86/kvm/mmu/mmu_internal.h | 1 + arch/x86/kvm/mmu/paging_tmpl.h | 6 +----- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 31bdf93711a2..efc540e43e17 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4189,7 +4189,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); } -static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; bool async; @@ -4250,12 +4250,20 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return RET_PF_CONTINUE; } +static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +{ + fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; + smp_rmb(); + + return __kvm_faultin_pfn(vcpu, fault); +} + /* * Returns true if the page fault is stale and needs to be retried, i.e. if the * root was invalidated by a memslot update or a relevant mmu_notifier fired. */ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault, int mmu_seq) + struct kvm_page_fault *fault) { struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa); @@ -4275,14 +4283,12 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, return true; return fault->slot && - mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva); + mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); - - unsigned long mmu_seq; int r; fault->gfn = fault->addr >> PAGE_SHIFT; @@ -4299,9 +4305,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - mmu_seq = vcpu->kvm->mmu_invalidate_seq; - smp_rmb(); - r = kvm_faultin_pfn(vcpu, fault); if (r != RET_PF_CONTINUE) return r; @@ -4317,7 +4320,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault else write_lock(&vcpu->kvm->mmu_lock); - if (is_page_fault_stale(vcpu, fault, mmu_seq)) + if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = make_mmu_pages_available(vcpu); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index dbaf6755c5a7..1556f596a628 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -222,6 +222,7 @@ struct kvm_page_fault { struct kvm_memory_slot *slot; /* Outputs of kvm_faultin_pfn. */ + unsigned long mmu_seq; kvm_pfn_t pfn; hva_t hva; bool map_writable; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 0f6455072055..88acf232494b 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -791,7 +791,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault { struct guest_walker walker; int r; - unsigned long mmu_seq; bool is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); @@ -838,9 +837,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault else fault->max_level = walker.level; - mmu_seq = vcpu->kvm->mmu_invalidate_seq; - smp_rmb(); - r = kvm_faultin_pfn(vcpu, fault); if (r != RET_PF_CONTINUE) return r; @@ -871,7 +867,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (is_page_fault_stale(vcpu, fault, mmu_seq)) + if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = make_mmu_pages_available(vcpu); -- cgit v1.2.3 From 7bd9645348ca865408b8c50491d605da1dfa05e3 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:40 -0700 Subject: KVM: x86/mmu: Handle error PFNs in kvm_faultin_pfn() Handle error PFNs in kvm_faultin_pfn() rather than relying on the caller to invoke handle_abnormal_pfn() after kvm_faultin_pfn(). Opportunistically rename kvm_handle_bad_page() to kvm_handle_error_pfn() to make it more consistent with is_error_pfn(). This commit moves KVM closer to being able to drop handle_abnormal_pfn(), which will reduce the amount of duplicate code in the various page fault handlers. No functional change intended. Signed-off-by: David Matlack Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-5-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index efc540e43e17..d8a256f1b842 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3219,10 +3219,6 @@ static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, unsigned int access) { - /* The pfn is invalid, report the error! */ - if (unlikely(is_error_pfn(fault->pfn))) - return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn); - if (unlikely(!fault->slot)) { gva_t gva = fault->is_tdp ? 0 : fault->addr; @@ -4252,10 +4248,19 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { + int ret; + fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); - return __kvm_faultin_pfn(vcpu, fault); + ret = __kvm_faultin_pfn(vcpu, fault); + if (ret != RET_PF_CONTINUE) + return ret; + + if (unlikely(is_error_pfn(fault->pfn))) + return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn); + + return RET_PF_CONTINUE; } /* -- cgit v1.2.3 From 897e4526e5e002818840e38d72e0585c45533fe8 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:41 -0700 Subject: KVM: x86/mmu: Avoid memslot lookup during KVM_PFN_ERR_HWPOISON handling Pass the kvm_page_fault struct down to kvm_handle_error_pfn() to avoid a memslot lookup when handling KVM_PFN_ERR_HWPOISON. Opportunistically move the gfn_to_hva_memslot() call and @current down into kvm_send_hwpoison_signal() to cut down on line lengths. No functional change intended. Signed-off-by: David Matlack Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-6-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d8a256f1b842..b5f9f0755454 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3188,14 +3188,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return ret; } -static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) +static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn) { - send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk); + unsigned long hva = gfn_to_hva_memslot(slot, gfn); + + send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current); } -static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) +static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - if (is_sigpending_pfn(pfn)) { + if (is_sigpending_pfn(fault->pfn)) { kvm_handle_signal_exit(vcpu); return -EINTR; } @@ -3205,11 +3207,11 @@ static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) * into the spte otherwise read access on readonly gfn also can * caused mmio page fault and treat it as mmio access. */ - if (pfn == KVM_PFN_ERR_RO_FAULT) + if (fault->pfn == KVM_PFN_ERR_RO_FAULT) return RET_PF_EMULATE; - if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); + if (fault->pfn == KVM_PFN_ERR_HWPOISON) { + kvm_send_hwpoison_signal(fault->slot, fault->gfn); return RET_PF_RETRY; } @@ -4258,7 +4260,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return ret; if (unlikely(is_error_pfn(fault->pfn))) - return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn); + return kvm_handle_error_pfn(vcpu, fault); return RET_PF_CONTINUE; } -- cgit v1.2.3 From f09948ec1ff3063b0722ef15195da3beca2d57c6 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:42 -0700 Subject: KVM: x86/mmu: Handle no-slot faults in kvm_faultin_pfn() Handle faults on GFNs that do not have a backing memslot in kvm_faultin_pfn() and drop handle_abnormal_pfn(). This eliminates duplicate code in the various page fault handlers. Opportunistically tweak the comment about handling gfn > host.MAXPHYADDR to reflect that the effect of returning RET_PF_EMULATE at that point is to avoid creating an MMIO SPTE for such GFNs. No functional change intended. Signed-off-by: David Matlack Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-7-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 56 ++++++++++++++++++++++-------------------- arch/x86/kvm/mmu/paging_tmpl.h | 6 +---- 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b5f9f0755454..e2e8c4dfbaa5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3218,28 +3218,32 @@ static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa return -EFAULT; } -static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, - unsigned int access) +static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, + unsigned int access) { - if (unlikely(!fault->slot)) { - gva_t gva = fault->is_tdp ? 0 : fault->addr; + gva_t gva = fault->is_tdp ? 0 : fault->addr; - vcpu_cache_mmio_info(vcpu, gva, fault->gfn, - access & shadow_mmio_access_mask); - /* - * If MMIO caching is disabled, emulate immediately without - * touching the shadow page tables as attempting to install an - * MMIO SPTE will just be an expensive nop. Do not cache MMIO - * whose gfn is greater than host.MAXPHYADDR, any guest that - * generates such gfns is running nested and is being tricked - * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if - * and only if L1's MAXPHYADDR is inaccurate with respect to - * the hardware's). - */ - if (unlikely(!enable_mmio_caching) || - unlikely(fault->gfn > kvm_mmu_max_gfn())) - return RET_PF_EMULATE; - } + vcpu_cache_mmio_info(vcpu, gva, fault->gfn, + access & shadow_mmio_access_mask); + + /* + * If MMIO caching is disabled, emulate immediately without + * touching the shadow page tables as attempting to install an + * MMIO SPTE will just be an expensive nop. + */ + if (unlikely(!enable_mmio_caching)) + return RET_PF_EMULATE; + + /* + * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR, + * any guest that generates such gfns is running nested and is being + * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and + * only if L1's MAXPHYADDR is inaccurate with respect to the + * hardware's). + */ + if (unlikely(fault->gfn > kvm_mmu_max_gfn())) + return RET_PF_EMULATE; return RET_PF_CONTINUE; } @@ -4248,7 +4252,8 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return RET_PF_CONTINUE; } -static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + unsigned int access) { int ret; @@ -4262,6 +4267,9 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (unlikely(is_error_pfn(fault->pfn))) return kvm_handle_error_pfn(vcpu, fault); + if (unlikely(!fault->slot)) + return kvm_handle_noslot_fault(vcpu, fault, access); + return RET_PF_CONTINUE; } @@ -4312,11 +4320,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - r = kvm_faultin_pfn(vcpu, fault); - if (r != RET_PF_CONTINUE) - return r; - - r = handle_abnormal_pfn(vcpu, fault, ACC_ALL); + r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 88acf232494b..e5662dbd519c 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -837,11 +837,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault else fault->max_level = walker.level; - r = kvm_faultin_pfn(vcpu, fault); - if (r != RET_PF_CONTINUE) - return r; - - r = handle_abnormal_pfn(vcpu, fault, walker.pte_access); + r = kvm_faultin_pfn(vcpu, fault, walker.pte_access); if (r != RET_PF_CONTINUE) return r; -- cgit v1.2.3 From 2d75ce03005dfaf29b52ea885ddd960ba5c1d8c2 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:43 -0700 Subject: KVM: x86/mmu: Initialize fault.{gfn,slot} earlier for direct MMUs Move the initialization of fault.{gfn,slot} earlier in the page fault handling code for fully direct MMUs. This will enable a future commit to split out TDP MMU page fault handling without needing to duplicate the initialization of these 2 fields. Opportunistically take advantage of the fact that fault.gfn is initialized in kvm_tdp_page_fault() rather than recomputing it from fault->addr. No functional change intended. Signed-off-by: David Matlack Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-8-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 5 +---- arch/x86/kvm/mmu/mmu_internal.h | 5 +++++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e2e8c4dfbaa5..cc2683419ff2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4306,9 +4306,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); int r; - fault->gfn = fault->addr >> PAGE_SHIFT; - fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); - if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_EMULATE; @@ -4412,7 +4409,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) { for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); + gfn_t base = fault->gfn & ~(page_num - 1); if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) break; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 1556f596a628..069890789faf 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -280,6 +280,11 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, }; int r; + if (vcpu->arch.mmu->root_role.direct) { + fault.gfn = fault.addr >> PAGE_SHIFT; + fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); + } + /* * Async #PF "faults", a.k.a. prefetch faults, are not faults from the * guest perspective and have already been counted at the time of the -- cgit v1.2.3 From a158127f55b98c02c2f1ef96981db088cad5c2e7 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:44 -0700 Subject: KVM: x86/mmu: Split out TDP MMU page fault handling Split out the page fault handling for the TDP MMU to a separate function. This creates some duplicate code, but makes the TDP MMU fault handler simpler to read by eliminating branches and will enable future cleanups by allowing the TDP MMU and non-TDP MMU fault paths to diverge. Only compile in the TDP MMU fault handler for 64-bit builds since kvm_tdp_mmu_map() does not exist in 32-bit builds. No functional change intended. Signed-off-by: David Matlack Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-9-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 62 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index cc2683419ff2..7b10bff3d2e7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4303,7 +4303,6 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); int r; if (page_fault_handle_page_track(vcpu, fault)) @@ -4322,11 +4321,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return r; r = RET_PF_RETRY; - - if (is_tdp_mmu_fault) - read_lock(&vcpu->kvm->mmu_lock); - else - write_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); if (is_page_fault_stale(vcpu, fault)) goto out_unlock; @@ -4335,16 +4330,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) goto out_unlock; - if (is_tdp_mmu_fault) - r = kvm_tdp_mmu_map(vcpu, fault); - else - r = __direct_map(vcpu, fault); + r = __direct_map(vcpu, fault); out_unlock: - if (is_tdp_mmu_fault) - read_unlock(&vcpu->kvm->mmu_lock); - else - write_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(fault->pfn); return r; } @@ -4392,6 +4381,46 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); +#ifdef CONFIG_X86_64 +static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + int r; + + if (page_fault_handle_page_track(vcpu, fault)) + return RET_PF_EMULATE; + + r = fast_page_fault(vcpu, fault); + if (r != RET_PF_INVALID) + return r; + + r = mmu_topup_memory_caches(vcpu, false); + if (r) + return r; + + r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); + if (r != RET_PF_CONTINUE) + return r; + + r = RET_PF_RETRY; + read_lock(&vcpu->kvm->mmu_lock); + + if (is_page_fault_stale(vcpu, fault)) + goto out_unlock; + + r = make_mmu_pages_available(vcpu); + if (r) + goto out_unlock; + + r = kvm_tdp_mmu_map(vcpu, fault); + +out_unlock: + read_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(fault->pfn); + return r; +} +#endif + int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { /* @@ -4416,6 +4445,11 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } } +#ifdef CONFIG_X86_64 + if (tdp_mmu_enabled) + return kvm_tdp_mmu_page_fault(vcpu, fault); +#endif + return direct_page_fault(vcpu, fault); } -- cgit v1.2.3 From 1290f90e77186bf8a06a3a35ebf254f5b004676b Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:45 -0700 Subject: KVM: x86/mmu: Stop needlessly making MMU pages available for TDP MMU faults Stop calling make_mmu_pages_available() when handling TDP MMU faults. The TDP MMU does not participate in the "available MMU pages" tracking and limiting so calling this function is unnecessary work when handling TDP MMU faults. Signed-off-by: David Matlack Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-10-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7b10bff3d2e7..dfac473d188a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4408,10 +4408,6 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, if (is_page_fault_stale(vcpu, fault)) goto out_unlock; - r = make_mmu_pages_available(vcpu); - if (r) - goto out_unlock; - r = kvm_tdp_mmu_map(vcpu, fault); out_unlock: -- cgit v1.2.3 From 362871d74ff67f23534a5ecb448967b0507d0848 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:46 -0700 Subject: KVM: x86/mmu: Rename __direct_map() to direct_map() Rename __direct_map() to direct_map() since the leading underscores are unnecessary. This also makes the page fault handler names more consistent: kvm_tdp_mmu_page_fault() calls kvm_tdp_mmu_map() and direct_page_fault() calls direct_map(). Opportunistically make some trivial cleanups to comments that had to be modified anyway since they mentioned __direct_map(). Specifically, use "()" when referring to functions, and include kvm_tdp_mmu_map() among the various callers of disallowed_hugepage_adjust(). No functional change intended. Signed-off-by: David Matlack Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-11-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 14 +++++++------- arch/x86/kvm/mmu/mmu_internal.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index dfac473d188a..7fb7a07a389e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3131,11 +3131,11 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ !is_large_pte(spte) && spte_to_child_sp(spte)->nx_huge_page_disallowed) { /* - * A small SPTE exists for this pfn, but FNAME(fetch) - * and __direct_map would like to create a large PTE - * instead: just force them to go down another level, - * patching back for them into pfn the next 9 bits of - * the address. + * A small SPTE exists for this pfn, but FNAME(fetch), + * direct_map(), or kvm_tdp_mmu_map() would like to create a + * large PTE instead: just force them to go down another level, + * patching back for them into pfn the next 9 bits of the + * address. */ u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) - KVM_PAGES_PER_HPAGE(cur_level - 1); @@ -3144,7 +3144,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ } } -static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -4330,7 +4330,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) goto out_unlock; - r = __direct_map(vcpu, fault); + r = direct_map(vcpu, fault); out_unlock: write_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 069890789faf..ac00bfbf32f6 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -199,7 +199,7 @@ struct kvm_page_fault { /* * Maximum page size that can be created for this fault; input to - * FNAME(fetch), __direct_map and kvm_tdp_mmu_map. + * FNAME(fetch), direct_map() and kvm_tdp_mmu_map(). */ u8 max_level; -- cgit v1.2.3 From aeb568a1a6041e3d69def54046747bbd989bc4ed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 12 Oct 2022 18:17:00 +0000 Subject: KVM: x86/mmu: Replace open coded usage of tdp_mmu_page with is_tdp_mmu_page() Use is_tdp_mmu_page() instead of querying sp->tdp_mmu_page directly so that all users benefit if KVM ever finds a way to optimize the logic. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20221012181702.3663607-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7fb7a07a389e..27b233eff6e5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1928,7 +1928,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) return true; /* TDP MMU pages do not use the MMU generation. */ - return !sp->tdp_mmu_page && + return !is_tdp_mmu_page(sp) && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7e5952e95d3b..cc1fb9a65620 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -133,7 +133,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) return; - WARN_ON(!root->tdp_mmu_page); + WARN_ON(!is_tdp_mmu_page(root)); /* * The root now has refcount=0. It is valid, but readers already -- cgit v1.2.3 From f2e4535c273af02efe5f58ec201cf7e14a6c895a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 12 Oct 2022 18:16:59 +0000 Subject: KVM: x86/mmu: Pivot on "TDP MMU enabled" to check if active MMU is TDP MMU Simplify and optimize the logic for detecting if the current/active MMU is a TDP MMU. If the TDP MMU is globally enabled, then the active MMU is a TDP MMU if it is direct. When TDP is enabled, so called nonpaging MMUs are never used as the only form of shadow paging KVM uses is for nested TDP, and the active MMU can't be direct in that case. Rename the helper and take the vCPU instead of an arbitrary MMU, as nonpaging MMUs can show up in the walk_mmu if L1 is using nested TDP and L2 has paging disabled. Taking the vCPU has the added bonus of cleaning up the callers, all of which check the current MMU but wrap code that consumes the vCPU. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20221012181702.3663607-9-seanjc@google.com> [Use tdp_mmu_enabled variable. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 11 ++++++++--- arch/x86/kvm/mmu/tdp_mmu.h | 18 ------------------ 2 files changed, 8 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 27b233eff6e5..c5193bb56bfe 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -616,9 +616,14 @@ static bool mmu_spte_age(u64 *sptep) return true; } +static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu) +{ + return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct; +} + static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - if (is_tdp_mmu(vcpu->arch.mmu)) { + if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_begin(); } else { /* @@ -637,7 +642,7 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - if (is_tdp_mmu(vcpu->arch.mmu)) { + if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_end(); } else { /* @@ -4043,7 +4048,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) walk_shadow_page_lockless_begin(vcpu); - if (is_tdp_mmu(vcpu->arch.mmu)) + if (is_tdp_mmu_active(vcpu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index e4ab2dac269d..0a63b1afabd3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -72,26 +72,8 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, #ifdef CONFIG_X86_64 static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } - -static inline bool is_tdp_mmu(struct kvm_mmu *mmu) -{ - struct kvm_mmu_page *sp; - hpa_t hpa = mmu->root.hpa; - - if (WARN_ON(!VALID_PAGE(hpa))) - return false; - - /* - * A NULL shadow page is legal when shadowing a non-paging guest with - * PAE paging, as the MMU will be direct with root_hpa pointing at the - * pae_root page, not a shadow page. - */ - sp = to_shadow_page(hpa); - return sp && is_tdp_mmu_page(sp) && sp->root_count; -} #else static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } -static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } #endif #endif /* __KVM_X86_MMU_TDP_MMU_H */ -- cgit v1.2.3 From 94fbbfbbdffdce23d9c37e65da711adbc1bfe642 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 12 Oct 2022 18:16:58 +0000 Subject: KVM: x86/mmu: Pivot on "TDP MMU enabled" when handling direct page faults When handling direct page faults, pivot on the TDP MMU being globally enabled instead of checking if the target MMU is a TDP MMU. Now that the TDP MMU is all-or-nothing, if the TDP MMU is enabled, KVM will reach direct_page_fault() if and only if the MMU is a TDP MMU. When TDP is enabled (obviously required for the TDP MMU), only non-nested TDP page faults reach direct_page_fault(), i.e. nonpaging MMUs are impossible, as NPT requires paging to be enabled and EPT faults use ept_page_fault(). Signed-off-by: Sean Christopherson Message-Id: <20221012181702.3663607-8-seanjc@google.com> [Use tdp_mmu_enabled variable. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c5193bb56bfe..15d389370f88 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3372,7 +3372,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) do { u64 new_spte; - if (is_tdp_mmu(vcpu->arch.mmu)) + if (tdp_mmu_enabled) sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); else sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); -- cgit v1.2.3 From 916dde51a418c7fa1efc62b3509d5a3f90ea0176 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 13 Oct 2022 11:58:43 +0200 Subject: x86/hyperv: Add HV_EXPOSE_INVARIANT_TSC define Avoid open coding BIT(0) of HV_X64_MSR_TSC_INVARIANT_CONTROL by adding a dedicated define. While there's only one user at this moment, the upcoming KVM implementation of Hyper-V Invariant TSC feature will need to use it as well. Reviewed-by: Michael Kelley Signed-off-by: Vitaly Kuznetsov Reviewed-by: Sean Christopherson Message-Id: <20221013095849.705943-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 3 +++ arch/x86/kernel/cpu/mshyperv.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index e3efaf6e6b62..617332dd64ac 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -255,6 +255,9 @@ enum hv_isolation_type { /* TSC invariant control */ #define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118 +/* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */ +#define HV_EXPOSE_INVARIANT_TSC BIT_ULL(0) + /* Register name aliases for temporary compatibility */ #define HV_X64_MSR_STIMER0_COUNT HV_REGISTER_STIMER0_COUNT #define HV_X64_MSR_STIMER0_CONFIG HV_REGISTER_STIMER0_CONFIG diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 831613959a92..e402923800d7 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -388,7 +388,7 @@ static void __init ms_hyperv_init_platform(void) * setting of this MSR bit should happen before init_intel() * is called. */ - wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1); + wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } -- cgit v1.2.3 From 24652b741cf62e6a149aabf0949dc3a4f16593fd Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 13 Oct 2022 11:58:44 +0200 Subject: KVM: x86: Add a KVM-only leaf for CPUID_8000_0007_EDX CPUID_8000_0007_EDX may come handy when X86_FEATURE_CONSTANT_TSC needs to be checked. No functional change intended. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Sean Christopherson Message-Id: <20221013095849.705943-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 8 ++++++-- arch/x86/kvm/reverse_cpuid.h | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0b5bf013fcb8..ff342c9da172 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -701,6 +701,10 @@ void kvm_set_cpu_caps(void) if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64)) kvm_cpu_cap_set(X86_FEATURE_GBPAGES); + kvm_cpu_cap_init_kvm_defined(CPUID_8000_0007_EDX, + SF(CONSTANT_TSC) + ); + kvm_cpu_cap_mask(CPUID_8000_0008_EBX, F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | @@ -1153,8 +1157,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx &= ~GENMASK(17, 16); break; case 0x80000007: /* Advanced power management */ - /* invariant TSC is CPUID.80000007H:EDX[8] */ - entry->edx &= (1 << 8); + cpuid_entry_override(entry, CPUID_8000_0007_EDX); + /* mask against host */ entry->edx &= boot_cpu_data.x86_power; entry->eax = entry->ebx = entry->ecx = 0; diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 203fdad07bae..25b9b51abb20 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -14,6 +14,7 @@ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, CPUID_7_1_EDX, + CPUID_8000_0007_EDX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -42,6 +43,9 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) +/* CPUID level 0x80000007 (EDX). */ +#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) + struct cpuid_reg { u32 function; u32 index; @@ -67,6 +71,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX}, [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX}, [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX}, + [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX}, }; /* @@ -97,6 +102,8 @@ static __always_inline u32 __feature_translate(int x86_feature) return KVM_X86_FEATURE_SGX1; else if (x86_feature == X86_FEATURE_SGX2) return KVM_X86_FEATURE_SGX2; + else if (x86_feature == X86_FEATURE_CONSTANT_TSC) + return KVM_X86_FEATURE_CONSTANT_TSC; return x86_feature; } -- cgit v1.2.3 From 4e5bf89f2794e99d8d61b2fcb4a8f2d29963a532 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 13 Oct 2022 11:58:45 +0200 Subject: KVM: x86: Hyper-V invariant TSC control Normally, genuine Hyper-V doesn't expose architectural invariant TSC (CPUID.80000007H:EDX[8]) to its guests by default. A special PV MSR (HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x40000118) and corresponding CPUID feature bit (CPUID.0x40000003.EAX[15]) were introduced. When bit 0 of the PV MSR is set, invariant TSC bit starts to show up in CPUID. When the feature is exposed to Hyper-V guests, reenlightenment becomes unneeded. Add the feature to KVM. Keep CPUID output intact when the feature wasn't exposed to L1 and implement the required logic for hiding invariant TSC when the feature was exposed and invariant TSC control MSR wasn't written to. Copy genuine Hyper-V behavior and forbid to disable the feature once it was enabled. For the reference, for linux guests, support for the feature was added in commit dce7cd62754b ("x86/hyperv: Allow guests to enable InvariantTSC"). Signed-off-by: Vitaly Kuznetsov Reviewed-by: Sean Christopherson Message-Id: <20221013095849.705943-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 3 +++ arch/x86/kvm/hyperv.c | 19 +++++++++++++++++++ arch/x86/kvm/hyperv.h | 27 +++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 4 +++- 5 files changed, 53 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aa4eb8cfcd7e..c70690b2c82d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1088,6 +1088,7 @@ struct kvm_hv { u64 hv_reenlightenment_control; u64 hv_tsc_emulation_control; u64 hv_tsc_emulation_status; + u64 hv_invtsc_control; /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ff342c9da172..c3f084185daf 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1488,6 +1488,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && (data & TSX_CTRL_CPUID_CLEAR)) *ebx &= ~(F(RTM) | F(HLE)); + } else if (function == 0x80000007) { + if (kvm_hv_invtsc_suppressed(vcpu)) + *edx &= ~SF(CONSTANT_TSC); } } else { *eax = *ebx = *ecx = *edx = 0; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index e8296942a868..80d082ecd5c5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -999,6 +999,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: r = true; @@ -1283,6 +1284,9 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_TSC_EMULATION_STATUS: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_REENLIGHTENMENT; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + return hv_vcpu->cpuid_cache.features_eax & + HV_ACCESS_TSC_INVARIANT; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: return hv_vcpu->cpuid_cache.features_edx & @@ -1410,6 +1414,17 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, if (!host) return 1; break; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + /* Only bit 0 is supported */ + if (data & ~HV_EXPOSE_INVARIANT_TSC) + return 1; + + /* The feature can't be disabled from the guest */ + if (!host && hv->hv_invtsc_control && !data) + return 1; + + hv->hv_invtsc_control = data; + break; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_set_msr(vcpu, msr, data, host); @@ -1585,6 +1600,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_TSC_EMULATION_STATUS: data = hv->hv_tsc_emulation_status; break; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + data = hv->hv_invtsc_control; + break; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_get_msr(vcpu, msr, pdata, host); @@ -2733,6 +2751,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; ent->eax |= HV_ACCESS_FREQUENCY_MSRS; ent->eax |= HV_ACCESS_REENLIGHTENMENT; + ent->eax |= HV_ACCESS_TSC_INVARIANT; ent->ebx |= HV_POST_MESSAGES; ent->ebx |= HV_SIGNAL_EVENTS; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 9f96414a31c5..f83b8db72b11 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -136,6 +136,33 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) HV_SYNIC_STIMER_COUNT); } +/* + * With HV_ACCESS_TSC_INVARIANT feature, invariant TSC (CPUID.80000007H:EDX[8]) + * is only observed after HV_X64_MSR_TSC_INVARIANT_CONTROL was written to. + */ +static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + /* + * If Hyper-V's invariant TSC control is not exposed to the guest, + * the invariant TSC CPUID flag is not suppressed, Windows guests were + * observed to be able to handle it correctly. Going forward, VMMs are + * encouraged to enable Hyper-V's invariant TSC control when invariant + * TSC CPUID flag is set to make KVM's behavior match genuine Hyper-V. + */ + if (!hv_vcpu || + !(hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_TSC_INVARIANT)) + return false; + + /* + * If Hyper-V's invariant TSC control is exposed to the guest, KVM is + * responsible for suppressing the invariant TSC CPUID flag if the + * Hyper-V control is not enabled. + */ + return !(to_kvm_hv(vcpu->kvm)->hv_invtsc_control & HV_EXPOSE_INVARIANT_TSC); +} + void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); void kvm_hv_setup_tsc_page(struct kvm *kvm, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5c3ce39cdccb..39b8dd37bc40 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1480,7 +1480,7 @@ static const u32 emulated_msrs_all[] = { HV_X64_MSR_STIMER0_CONFIG, HV_X64_MSR_VP_ASSIST_PAGE, HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, - HV_X64_MSR_TSC_EMULATION_STATUS, + HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_X64_MSR_SYNDBG_OPTIONS, HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, @@ -3821,6 +3821,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: return kvm_hv_set_msr_common(vcpu, msr, data, msr_info->host_initiated); case MSR_IA32_BBL_CR_CTL3: @@ -4191,6 +4192,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); -- cgit v1.2.3 From a2ad080b4dfd6bf97738bc79f8a6eca5790c9ae5 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 4 Nov 2022 15:47:05 +0100 Subject: KVM: nVMX: Sanitize primary processor-based VM-execution controls with eVMCS too The only unsupported primary processor-based VM-execution control at the moment is CPU_BASED_ACTIVATE_TERTIARY_CONTROLS and KVM doesn't expose it in nested VMX feature MSRs anyway (see nested_vmx_setup_ctls_msrs()) but in preparation to inverting "unsupported with eVMCS" checks (and for completeness) it's better to sanitize MSR_IA32_VMX_PROCBASED_CTLS/ MSR_IA32_VMX_TRUE_PROCBASED_CTLS too. No functional change intended. Signed-off-by: Vitaly Kuznetsov Message-Id: <20221104144708.435865-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/hyperv.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index ae03d1fe0355..04ea0259ab7f 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -361,6 +361,7 @@ enum evmcs_revision { enum evmcs_ctrl_type { EVMCS_EXIT_CTRLS, EVMCS_ENTRY_CTRLS, + EVMCS_EXEC_CTRL, EVMCS_2NDEXEC, EVMCS_PINCTRL, EVMCS_VMFUNC, @@ -374,6 +375,9 @@ static const u32 evmcs_unsupported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = { [EVMCS_ENTRY_CTRLS] = { [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMENTRY_CTRL, }, + [EVMCS_EXEC_CTRL] = { + [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_EXEC_CTRL, + }, [EVMCS_2NDEXEC] = { [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_2NDEXEC, }, @@ -434,6 +438,10 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 * unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; ctl_high &= ~unsupported_ctrls; break; + case MSR_IA32_VMX_PROCBASED_CTLS: + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_EXEC_CTRL); + break; case MSR_IA32_VMX_PROCBASED_CTLS2: ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_2NDEXEC); break; @@ -461,6 +469,10 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12) vmcs12->pin_based_vm_exec_control))) return -EINVAL; + if (CC(!nested_evmcs_is_valid_controls(EVMCS_EXEC_CTRL, + vmcs12->cpu_based_vm_exec_control))) + return -EINVAL; + if (CC(!nested_evmcs_is_valid_controls(EVMCS_2NDEXEC, vmcs12->secondary_vm_exec_control))) return -EINVAL; -- cgit v1.2.3 From 96d6955d215e6234bb820fd23756b2a9b77aef0f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 4 Nov 2022 15:47:06 +0100 Subject: KVM: nVMX: Invert 'unsupported by eVMCSv1' check When a new feature gets implemented in KVM, EVMCS1_UNSUPPORTED_* defines need to be adjusted to avoid the situation when the feature is exposed to the guest but there's no corresponding eVMCS field[s] for it. This is not obvious and fragile. Invert 'unsupported by eVMCSv1' check and make it 'supported by eVMCSv1' instead, this way it's much harder to make a mistake. New features will get added to EVMCS1_SUPPORTED_* defines when the corresponding fields are added to eVMCS definition. No functional change intended. EVMCS1_SUPPORTED_* defines are composed by taking KVM_{REQUIRED,OPTIONAL}_VMX_ defines and filtering out what was previously known as EVMCS1_UNSUPPORTED_*. From all the controls, SECONDARY_EXEC_TSC_SCALING requires special handling as it's actually present in eVMCSv1 definition but is not currently supported for Hyper-V-on-KVM, just for KVM-on-Hyper-V. As evmcs_supported_ctrls will be used for both scenarios, just add it there instead of EVMCS1_SUPPORTED_2NDEXEC. Signed-off-by: Vitaly Kuznetsov Message-Id: <20221104144708.435865-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/hyperv.c | 42 +++++++++++----------- arch/x86/kvm/vmx/hyperv.h | 90 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 96 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index 04ea0259ab7f..77027f6a9b41 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -368,32 +368,32 @@ enum evmcs_ctrl_type { NR_EVMCS_CTRLS, }; -static const u32 evmcs_unsupported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = { +static const u32 evmcs_supported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = { [EVMCS_EXIT_CTRLS] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMEXIT_CTRL, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMEXIT_CTRL, }, [EVMCS_ENTRY_CTRLS] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMENTRY_CTRL, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMENTRY_CTRL, }, [EVMCS_EXEC_CTRL] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_EXEC_CTRL, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_EXEC_CTRL, }, [EVMCS_2NDEXEC] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_2NDEXEC, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_2NDEXEC & ~SECONDARY_EXEC_TSC_SCALING, }, [EVMCS_PINCTRL] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_PINCTRL, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_PINCTRL, }, [EVMCS_VMFUNC] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMFUNC, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMFUNC, }, }; -static u32 evmcs_get_unsupported_ctls(enum evmcs_ctrl_type ctrl_type) +static u32 evmcs_get_supported_ctls(enum evmcs_ctrl_type ctrl_type) { enum evmcs_revision evmcs_rev = EVMCSv1_LEGACY; - return evmcs_unsupported_ctrls[ctrl_type][evmcs_rev]; + return evmcs_supported_ctrls[ctrl_type][evmcs_rev]; } static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu) @@ -417,7 +417,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 * { u32 ctl_low = (u32)*pdata; u32 ctl_high = (u32)(*pdata >> 32); - u32 unsupported_ctrls; + u32 supported_ctrls; /* * Hyper-V 2016 and 2019 try using these features even when eVMCS @@ -426,31 +426,31 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 * switch (msr_index) { case MSR_IA32_VMX_EXIT_CTLS: case MSR_IA32_VMX_TRUE_EXIT_CTLS: - unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_EXIT_CTRLS); + supported_ctrls = evmcs_get_supported_ctls(EVMCS_EXIT_CTRLS); if (!evmcs_has_perf_global_ctrl(vcpu)) - unsupported_ctrls |= VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - ctl_high &= ~unsupported_ctrls; + supported_ctrls &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= supported_ctrls; break; case MSR_IA32_VMX_ENTRY_CTLS: case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_ENTRY_CTRLS); + supported_ctrls = evmcs_get_supported_ctls(EVMCS_ENTRY_CTRLS); if (!evmcs_has_perf_global_ctrl(vcpu)) - unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - ctl_high &= ~unsupported_ctrls; + supported_ctrls &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= supported_ctrls; break; case MSR_IA32_VMX_PROCBASED_CTLS: case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_EXEC_CTRL); + ctl_high &= evmcs_get_supported_ctls(EVMCS_EXEC_CTRL); break; case MSR_IA32_VMX_PROCBASED_CTLS2: - ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_2NDEXEC); + ctl_high &= evmcs_get_supported_ctls(EVMCS_2NDEXEC); break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: - ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_PINCTRL); + ctl_high &= evmcs_get_supported_ctls(EVMCS_PINCTRL); break; case MSR_IA32_VMX_VMFUNC: - ctl_low &= ~evmcs_get_unsupported_ctls(EVMCS_VMFUNC); + ctl_low &= evmcs_get_supported_ctls(EVMCS_VMFUNC); break; } @@ -460,7 +460,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 * static bool nested_evmcs_is_valid_controls(enum evmcs_ctrl_type ctrl_type, u32 val) { - return !(val & evmcs_get_unsupported_ctls(ctrl_type)); + return !(val & ~evmcs_get_supported_ctls(ctrl_type)); } int nested_evmcs_check_controls(struct vmcs12 *vmcs12) diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 571e7929d14e..a9da47c69a2b 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -48,22 +48,82 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); * Currently unsupported in KVM: * GUEST_IA32_RTIT_CTL = 0x00002814, */ -#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ - PIN_BASED_VMX_PREEMPTION_TIMER) -#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) -#define EVMCS1_UNSUPPORTED_2NDEXEC \ - (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ - SECONDARY_EXEC_APIC_REGISTER_VIRT | \ - SECONDARY_EXEC_ENABLE_PML | \ - SECONDARY_EXEC_ENABLE_VMFUNC | \ - SECONDARY_EXEC_SHADOW_VMCS | \ +#define EVMCS1_SUPPORTED_PINCTRL \ + (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + PIN_BASED_EXT_INTR_MASK | \ + PIN_BASED_NMI_EXITING | \ + PIN_BASED_VIRTUAL_NMIS) + +#define EVMCS1_SUPPORTED_EXEC_CTRL \ + (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + CPU_BASED_HLT_EXITING | \ + CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING | \ + CPU_BASED_UNCOND_IO_EXITING | \ + CPU_BASED_MOV_DR_EXITING | \ + CPU_BASED_USE_TSC_OFFSETTING | \ + CPU_BASED_MWAIT_EXITING | \ + CPU_BASED_MONITOR_EXITING | \ + CPU_BASED_INVLPG_EXITING | \ + CPU_BASED_RDPMC_EXITING | \ + CPU_BASED_INTR_WINDOW_EXITING | \ + CPU_BASED_CR8_LOAD_EXITING | \ + CPU_BASED_CR8_STORE_EXITING | \ + CPU_BASED_RDTSC_EXITING | \ + CPU_BASED_TPR_SHADOW | \ + CPU_BASED_USE_IO_BITMAPS | \ + CPU_BASED_MONITOR_TRAP_FLAG | \ + CPU_BASED_USE_MSR_BITMAPS | \ + CPU_BASED_NMI_WINDOW_EXITING | \ + CPU_BASED_PAUSE_EXITING | \ + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) + +#define EVMCS1_SUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ + SECONDARY_EXEC_WBINVD_EXITING | \ + SECONDARY_EXEC_ENABLE_VPID | \ + SECONDARY_EXEC_ENABLE_EPT | \ + SECONDARY_EXEC_UNRESTRICTED_GUEST | \ + SECONDARY_EXEC_DESC | \ + SECONDARY_EXEC_ENABLE_RDTSCP | \ + SECONDARY_EXEC_ENABLE_INVPCID | \ + SECONDARY_EXEC_XSAVES | \ + SECONDARY_EXEC_RDSEED_EXITING | \ + SECONDARY_EXEC_RDRAND_EXITING | \ SECONDARY_EXEC_TSC_SCALING | \ - SECONDARY_EXEC_PAUSE_LOOP_EXITING) -#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \ - (VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) -#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (0) -#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ + SECONDARY_EXEC_PT_USE_GPA | \ + SECONDARY_EXEC_PT_CONCEAL_VMX | \ + SECONDARY_EXEC_BUS_LOCK_DETECTION | \ + SECONDARY_EXEC_NOTIFY_VM_EXITING | \ + SECONDARY_EXEC_ENCLS_EXITING) + +#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ + (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_ACK_INTR_ON_EXIT | \ + VM_EXIT_HOST_ADDR_SPACE_SIZE | \ + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_IA32_PAT | \ + VM_EXIT_LOAD_IA32_PAT | \ + VM_EXIT_SAVE_IA32_EFER | \ + VM_EXIT_LOAD_IA32_EFER | \ + VM_EXIT_CLEAR_BNDCFGS | \ + VM_EXIT_PT_CONCEAL_PIP | \ + VM_EXIT_CLEAR_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_IA32E_MODE | \ + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_ENTRY_LOAD_IA32_PAT | \ + VM_ENTRY_LOAD_IA32_EFER | \ + VM_ENTRY_LOAD_BNDCFGS | \ + VM_ENTRY_PT_CONCEAL_PIP | \ + VM_ENTRY_LOAD_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMFUNC (0) struct evmcs_field { u16 offset; -- cgit v1.2.3 From c128d3fd389b583ce6b28dbaa0e924de5671b961 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 4 Nov 2022 15:47:07 +0100 Subject: KVM: nVMX: Prepare to sanitize tertiary execution controls with eVMCS In preparation to restoring vmcs_conf sanitization for KVM-on-Hyper-V, (and for completeness) add tertiary VM-execution controls to 'evmcs_supported_ctrls'. No functional change intended as KVM doesn't yet expose MSR_IA32_VMX_PROCBASED_CTLS3 to its guests. Signed-off-by: Vitaly Kuznetsov Message-Id: <20221104144708.435865-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/hyperv.c | 4 ++++ arch/x86/kvm/vmx/hyperv.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index 77027f6a9b41..a5cbd029c07b 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -363,6 +363,7 @@ enum evmcs_ctrl_type { EVMCS_ENTRY_CTRLS, EVMCS_EXEC_CTRL, EVMCS_2NDEXEC, + EVMCS_3RDEXEC, EVMCS_PINCTRL, EVMCS_VMFUNC, NR_EVMCS_CTRLS, @@ -381,6 +382,9 @@ static const u32 evmcs_supported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = { [EVMCS_2NDEXEC] = { [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_2NDEXEC & ~SECONDARY_EXEC_TSC_SCALING, }, + [EVMCS_3RDEXEC] = { + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_3RDEXEC, + }, [EVMCS_PINCTRL] = { [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_PINCTRL, }, diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index a9da47c69a2b..3edbbd0991dc 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -98,6 +98,8 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); SECONDARY_EXEC_NOTIFY_VM_EXITING | \ SECONDARY_EXEC_ENCLS_EXITING) +#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) + #define EVMCS1_SUPPORTED_VMEXIT_CTRL \ (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ VM_EXIT_SAVE_DEBUG_CONTROLS | \ -- cgit v1.2.3 From 80edc49f6a7544fa0fac122e177ffb63af60651e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 4 Nov 2022 15:47:08 +0100 Subject: KVM: VMX: Resurrect vmcs_conf sanitization for KVM-on-Hyper-V Commit 9bcb90650e31 ("KVM: VMX: Get rid of eVMCS specific VMX controls sanitization") dropped 'vmcs_conf' sanitization for KVM-on-Hyper-V because there's no known Hyper-V version which would expose a feature unsupported in eVMCS in VMX feature MSRs. This works well for all currently existing Hyper-V version, however, future Hyper-V versions may add features which are supported by KVM and are currently missing in eVMCSv1 definition (e.g. APIC virtualization, PML,...). When this happens, existing KVMs will get broken. With the inverted 'unsupported by eVMCSv1' checks, we can resurrect vmcs_conf sanitization and make KVM future proof. Signed-off-by: Vitaly Kuznetsov Message-Id: <20221104144708.435865-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/hyperv.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/hyperv.h | 1 + arch/x86/kvm/vmx/vmx.c | 5 +++++ 3 files changed, 40 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index a5cbd029c07b..f773450fba6e 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "kvm/hyper-v: " fmt + #include #include @@ -504,6 +506,38 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12) return 0; } +#if IS_ENABLED(CONFIG_HYPERV) +/* + * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption + * is: in case a feature has corresponding fields in eVMCS described and it was + * exposed in VMX feature MSRs, KVM is free to use it. Warn if KVM meets a + * feature which has no corresponding eVMCS field, this likely means that KVM + * needs to be updated. + */ +#define evmcs_check_vmcs_conf(field, ctrl) \ + do { \ + typeof(vmcs_conf->field) unsupported; \ + \ + unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \ + if (unsupported) { \ + pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\ + (u64)unsupported); \ + vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \ + } \ + } \ + while (0) + +__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) +{ + evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL); + evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL); + evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC); + evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC); + evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL); + evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL); +} +#endif + int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 3edbbd0991dc..883102d567c3 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -273,6 +273,7 @@ static inline void evmcs_load(u64 phys_addr) vp_ap->enlighten_vmentry = 1; } +__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ static __always_inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fc9008dbed33..0220e22b89ca 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2752,6 +2752,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->vmentry_ctrl = _vmentry_control; vmcs_conf->misc = misc_msr; +#if IS_ENABLED(CONFIG_HYPERV) + if (enlightened_vmcs) + evmcs_sanitize_exec_ctrls(vmcs_conf); +#endif + return 0; } -- cgit v1.2.3 From c4a488685b84ad94f099aa6be066bf1488e4b73c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 12 Dec 2022 17:01:06 +0800 Subject: kvm: x86/mmu: Warn on linking when sp->unsync_children Since the commit 65855ed8b034 ("KVM: X86: Synchronize the shadow pagetable before link it"), no sp would be linked with sp->unsync_children = 1. So make it WARN if it is the case. Signed-off-by: Lai Jiangshan Message-Id: <20221212090106.378206-1-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4736d7849c60..26b32ae6b526 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2355,7 +2355,16 @@ static void __link_shadow_page(struct kvm *kvm, mmu_page_add_parent_pte(cache, sp, sptep); - if (sp->unsync_children || sp->unsync) + /* + * The non-direct sub-pagetable must be updated before linking. For + * L1 sp, the pagetable is updated via kvm_sync_page() in + * kvm_mmu_find_shadow_page() without write-protecting the gfn, + * so sp->unsync can be true or false. For higher level non-direct + * sp, the pagetable is updated/synced via mmu_sync_children() in + * FNAME(fetch)(), so sp->unsync_children can only be false. + * WARN_ON_ONCE() if anything happens unexpectedly. + */ + if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync) mark_unsync(sptep); } -- cgit v1.2.3 From 1f98f2bd8ec4df365381ba654db37a39ce20ccdf Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:37 -0700 Subject: KVM: x86/mmu: Change tdp_mmu to a read-only parameter Change tdp_mmu to a read-only parameter and drop the per-vm tdp_mmu_enabled. For 32-bit KVM, make tdp_mmu_enabled a macro that is always false so that the compiler can continue omitting cals to the TDP MMU. The TDP MMU was introduced in 5.10 and has been enabled by default since 5.15. At this point there are no known functionality gaps between the TDP MMU and the shadow MMU, and the TDP MMU uses less memory and scales better with the number of vCPUs. In other words, there is no good reason to disable the TDP MMU on a live system. Purposely do not drop tdp_mmu=N support (i.e. do not force 64-bit KVM to always use the TDP MMU) since tdp_mmu=N is still used to get test coverage of KVM's shadow MMU TDP support, which is used in 32-bit KVM. Signed-off-by: David Matlack Reviewed-by: Kai Huang Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 13 ++--------- arch/x86/kvm/mmu.h | 6 ++--- arch/x86/kvm/mmu/mmu.c | 51 +++++++++++++++++++++++++++-------------- arch/x86/kvm/mmu/tdp_mmu.c | 9 ++------ 4 files changed, 41 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f35f1ff4427b..aa4eb8cfcd7e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1341,21 +1341,12 @@ struct kvm_arch { struct task_struct *nx_huge_page_recovery_thread; #ifdef CONFIG_X86_64 - /* - * Whether the TDP MMU is enabled for this VM. This contains a - * snapshot of the TDP MMU module parameter from when the VM was - * created and remains unchanged for the life of the VM. If this is - * true, TDP MMU handler functions will run for various MMU - * operations. - */ - bool tdp_mmu_enabled; - /* The number of TDP MMU pages across all roots. */ atomic64_t tdp_mmu_pages; /* - * List of kvm_mmu_page structs being used as roots. - * All kvm_mmu_page structs in the list should have + * List of struct kvm_mmu_pages being used as roots. + * All struct kvm_mmu_pages in the list should have * tdp_mmu_page set. * * For reads, this list is protected by: diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 6bdaacb6faa0..168c46fd8dd1 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -230,14 +230,14 @@ static inline bool kvm_shadow_root_allocated(struct kvm *kvm) } #ifdef CONFIG_X86_64 -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } +extern bool tdp_mmu_enabled; #else -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +#define tdp_mmu_enabled false #endif static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) { - return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm); + return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm); } static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 26b32ae6b526..bd44e5682a64 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -99,6 +99,13 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); */ bool tdp_enabled = false; +bool __ro_after_init tdp_mmu_allowed; + +#ifdef CONFIG_X86_64 +bool __read_mostly tdp_mmu_enabled = true; +module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); +#endif + static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; @@ -1279,7 +1286,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); @@ -1312,7 +1319,7 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); @@ -1395,7 +1402,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, } } - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) write_protected |= kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); @@ -1558,7 +1565,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); return flush; @@ -1571,7 +1578,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); return flush; @@ -1646,7 +1653,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); return young; @@ -1659,7 +1666,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); return young; @@ -3604,7 +3611,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) if (r < 0) goto out_unlock; - if (is_tdp_mmu_enabled(vcpu->kvm)) { + if (tdp_mmu_enabled) { root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); mmu->root.hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { @@ -5727,6 +5734,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, tdp_root_level = tdp_forced_root_level; max_tdp_level = tdp_max_root_level; +#ifdef CONFIG_X86_64 + tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled; +#endif /* * max_huge_page_level reflects KVM's MMU capabilities irrespective * of kernel support, e.g. KVM may be capable of using 1GB pages when @@ -5974,7 +5984,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) * write and in the same critical section as making the reload request, * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield. */ - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_invalidate_all_roots(kvm); /* @@ -5999,7 +6009,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) * Deferring the zap until the final reference to the root is put would * lead to use-after-free. */ - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_zap_invalidated_roots(kvm); } @@ -6111,7 +6121,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start, gfn_end, true, flush); @@ -6144,7 +6154,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, write_unlock(&kvm->mmu_lock); } - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); read_unlock(&kvm->mmu_lock); @@ -6387,7 +6397,7 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, u64 start, u64 end, int target_level) { - if (!is_tdp_mmu_enabled(kvm)) + if (!tdp_mmu_enabled) return; if (kvm_memslots_have_rmaps(kvm)) @@ -6408,7 +6418,7 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, u64 start = memslot->base_gfn; u64 end = start + memslot->npages; - if (!is_tdp_mmu_enabled(kvm)) + if (!tdp_mmu_enabled) return; if (kvm_memslots_have_rmaps(kvm)) { @@ -6491,7 +6501,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, write_unlock(&kvm->mmu_lock); } - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); read_unlock(&kvm->mmu_lock); @@ -6526,7 +6536,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, write_unlock(&kvm->mmu_lock); } - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); read_unlock(&kvm->mmu_lock); @@ -6561,7 +6571,7 @@ restart: kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_zap_all(kvm); write_unlock(&kvm->mmu_lock); @@ -6726,6 +6736,13 @@ void __init kvm_mmu_x86_module_init(void) if (nx_huge_pages == -1) __set_nx_huge_pages(get_nx_auto_mode()); + /* + * Snapshot userspace's desire to enable the TDP MMU. Whether or not the + * TDP MMU is actually enabled is determined in kvm_configure_mmu() + * when the vendor module is loaded. + */ + tdp_mmu_allowed = tdp_mmu_enabled; + kvm_mmu_spte_module_init(); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d6df38d371a0..03511e83050f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -10,23 +10,18 @@ #include #include -static bool __read_mostly tdp_mmu_enabled = true; -module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); - /* Initializes the TDP MMU for the VM, if enabled. */ int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { struct workqueue_struct *wq; - if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) + if (!tdp_mmu_enabled) return 0; wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); if (!wq) return -ENOMEM; - /* This should not be changed for the lifetime of the VM. */ - kvm->arch.tdp_mmu_enabled = true; INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); kvm->arch.tdp_mmu_zap_wq = wq; @@ -47,7 +42,7 @@ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) { - if (!kvm->arch.tdp_mmu_enabled) + if (!tdp_mmu_enabled) return; /* Also waits for any queued work items. */ -- cgit v1.2.3 From 09732d2b4dc5ba9de504711a3ae5aa1199127e6f Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:38 -0700 Subject: KVM: x86/mmu: Move TDP MMU VM init/uninit behind tdp_mmu_enabled Move kvm_mmu_{init,uninit}_tdp_mmu() behind tdp_mmu_enabled. This makes these functions consistent with the rest of the calls into the TDP MMU from mmu.c, and which is now possible since tdp_mmu_enabled is only modified when the x86 vendor module is loaded. i.e. It will never change during the lifetime of a VM. This change also enabled removing the stub definitions for 32-bit KVM, as the compiler will just optimize the calls out like it does for all the other TDP MMU functions. No functional change intended. Signed-off-by: David Matlack Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 11 +++++++---- arch/x86/kvm/mmu/tdp_mmu.c | 6 ------ arch/x86/kvm/mmu/tdp_mmu.h | 7 +++---- 3 files changed, 10 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index bd44e5682a64..31bdf93711a2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6035,9 +6035,11 @@ int kvm_mmu_init_vm(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); - r = kvm_mmu_init_tdp_mmu(kvm); - if (r < 0) - return r; + if (tdp_mmu_enabled) { + r = kvm_mmu_init_tdp_mmu(kvm); + if (r < 0) + return r; + } node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; @@ -6067,7 +6069,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_page_track_unregister_notifier(kvm, node); - kvm_mmu_uninit_tdp_mmu(kvm); + if (tdp_mmu_enabled) + kvm_mmu_uninit_tdp_mmu(kvm); mmu_free_vm_memory_caches(kvm); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 03511e83050f..7e5952e95d3b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -15,9 +15,6 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { struct workqueue_struct *wq; - if (!tdp_mmu_enabled) - return 0; - wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); if (!wq) return -ENOMEM; @@ -42,9 +39,6 @@ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) { - if (!tdp_mmu_enabled) - return; - /* Also waits for any queued work items. */ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index d3714200b932..e4ab2dac269d 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -7,6 +7,9 @@ #include "spte.h" +int kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); + hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) @@ -68,8 +71,6 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, u64 *spte); #ifdef CONFIG_X86_64 -int kvm_mmu_init_tdp_mmu(struct kvm *kvm); -void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } static inline bool is_tdp_mmu(struct kvm_mmu *mmu) @@ -89,8 +90,6 @@ static inline bool is_tdp_mmu(struct kvm_mmu *mmu) return sp && is_tdp_mmu_page(sp) && sp->root_count; } #else -static inline int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return 0; } -static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } #endif -- cgit v1.2.3 From ba6e3fe255439fd401a8d5487ad7c030db14afa8 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:39 -0700 Subject: KVM: x86/mmu: Grab mmu_invalidate_seq in kvm_faultin_pfn() Grab mmu_invalidate_seq in kvm_faultin_pfn() and stash it in struct kvm_page_fault. The eliminates duplicate code and reduces the amount of parameters needed for is_page_fault_stale(). Preemptively split out __kvm_faultin_pfn() to a separate function for use in subsequent commits. No functional change intended. Signed-off-by: David Matlack Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-4-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 21 ++++++++++++--------- arch/x86/kvm/mmu/mmu_internal.h | 1 + arch/x86/kvm/mmu/paging_tmpl.h | 6 +----- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 31bdf93711a2..efc540e43e17 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4189,7 +4189,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); } -static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; bool async; @@ -4250,12 +4250,20 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return RET_PF_CONTINUE; } +static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +{ + fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; + smp_rmb(); + + return __kvm_faultin_pfn(vcpu, fault); +} + /* * Returns true if the page fault is stale and needs to be retried, i.e. if the * root was invalidated by a memslot update or a relevant mmu_notifier fired. */ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault, int mmu_seq) + struct kvm_page_fault *fault) { struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa); @@ -4275,14 +4283,12 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, return true; return fault->slot && - mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva); + mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); - - unsigned long mmu_seq; int r; fault->gfn = fault->addr >> PAGE_SHIFT; @@ -4299,9 +4305,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - mmu_seq = vcpu->kvm->mmu_invalidate_seq; - smp_rmb(); - r = kvm_faultin_pfn(vcpu, fault); if (r != RET_PF_CONTINUE) return r; @@ -4317,7 +4320,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault else write_lock(&vcpu->kvm->mmu_lock); - if (is_page_fault_stale(vcpu, fault, mmu_seq)) + if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = make_mmu_pages_available(vcpu); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index dbaf6755c5a7..1556f596a628 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -222,6 +222,7 @@ struct kvm_page_fault { struct kvm_memory_slot *slot; /* Outputs of kvm_faultin_pfn. */ + unsigned long mmu_seq; kvm_pfn_t pfn; hva_t hva; bool map_writable; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 0f6455072055..88acf232494b 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -791,7 +791,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault { struct guest_walker walker; int r; - unsigned long mmu_seq; bool is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); @@ -838,9 +837,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault else fault->max_level = walker.level; - mmu_seq = vcpu->kvm->mmu_invalidate_seq; - smp_rmb(); - r = kvm_faultin_pfn(vcpu, fault); if (r != RET_PF_CONTINUE) return r; @@ -871,7 +867,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (is_page_fault_stale(vcpu, fault, mmu_seq)) + if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = make_mmu_pages_available(vcpu); -- cgit v1.2.3 From 56c3a4e4a2d5ef36ac12156e9d4d793e2841135c Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:40 -0700 Subject: KVM: x86/mmu: Handle error PFNs in kvm_faultin_pfn() Handle error PFNs in kvm_faultin_pfn() rather than relying on the caller to invoke handle_abnormal_pfn() after kvm_faultin_pfn(). Opportunistically rename kvm_handle_bad_page() to kvm_handle_error_pfn() to make it more consistent with is_error_pfn(). This commit moves KVM closer to being able to drop handle_abnormal_pfn(), which will reduce the amount of duplicate code in the various page fault handlers. No functional change intended. Signed-off-by: David Matlack Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-5-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index efc540e43e17..d8a256f1b842 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3219,10 +3219,6 @@ static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, unsigned int access) { - /* The pfn is invalid, report the error! */ - if (unlikely(is_error_pfn(fault->pfn))) - return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn); - if (unlikely(!fault->slot)) { gva_t gva = fault->is_tdp ? 0 : fault->addr; @@ -4252,10 +4248,19 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { + int ret; + fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); - return __kvm_faultin_pfn(vcpu, fault); + ret = __kvm_faultin_pfn(vcpu, fault); + if (ret != RET_PF_CONTINUE) + return ret; + + if (unlikely(is_error_pfn(fault->pfn))) + return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn); + + return RET_PF_CONTINUE; } /* -- cgit v1.2.3 From cd08d178ff45d6564e782a3d005922ed439bd77f Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:41 -0700 Subject: KVM: x86/mmu: Avoid memslot lookup during KVM_PFN_ERR_HWPOISON handling Pass the kvm_page_fault struct down to kvm_handle_error_pfn() to avoid a memslot lookup when handling KVM_PFN_ERR_HWPOISON. Opportunistically move the gfn_to_hva_memslot() call and @current down into kvm_send_hwpoison_signal() to cut down on line lengths. No functional change intended. Signed-off-by: David Matlack Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-6-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d8a256f1b842..b5f9f0755454 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3188,14 +3188,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return ret; } -static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) +static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn) { - send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk); + unsigned long hva = gfn_to_hva_memslot(slot, gfn); + + send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current); } -static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) +static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - if (is_sigpending_pfn(pfn)) { + if (is_sigpending_pfn(fault->pfn)) { kvm_handle_signal_exit(vcpu); return -EINTR; } @@ -3205,11 +3207,11 @@ static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) * into the spte otherwise read access on readonly gfn also can * caused mmio page fault and treat it as mmio access. */ - if (pfn == KVM_PFN_ERR_RO_FAULT) + if (fault->pfn == KVM_PFN_ERR_RO_FAULT) return RET_PF_EMULATE; - if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); + if (fault->pfn == KVM_PFN_ERR_HWPOISON) { + kvm_send_hwpoison_signal(fault->slot, fault->gfn); return RET_PF_RETRY; } @@ -4258,7 +4260,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return ret; if (unlikely(is_error_pfn(fault->pfn))) - return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn); + return kvm_handle_error_pfn(vcpu, fault); return RET_PF_CONTINUE; } -- cgit v1.2.3 From 354c908c068ec0a21850aef5dbf6b8331434a2e9 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:42 -0700 Subject: KVM: x86/mmu: Handle no-slot faults in kvm_faultin_pfn() Handle faults on GFNs that do not have a backing memslot in kvm_faultin_pfn() and drop handle_abnormal_pfn(). This eliminates duplicate code in the various page fault handlers. Opportunistically tweak the comment about handling gfn > host.MAXPHYADDR to reflect that the effect of returning RET_PF_EMULATE at that point is to avoid creating an MMIO SPTE for such GFNs. No functional change intended. Signed-off-by: David Matlack Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-7-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 56 ++++++++++++++++++++++-------------------- arch/x86/kvm/mmu/paging_tmpl.h | 6 +---- 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b5f9f0755454..e2e8c4dfbaa5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3218,28 +3218,32 @@ static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa return -EFAULT; } -static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, - unsigned int access) +static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, + unsigned int access) { - if (unlikely(!fault->slot)) { - gva_t gva = fault->is_tdp ? 0 : fault->addr; + gva_t gva = fault->is_tdp ? 0 : fault->addr; - vcpu_cache_mmio_info(vcpu, gva, fault->gfn, - access & shadow_mmio_access_mask); - /* - * If MMIO caching is disabled, emulate immediately without - * touching the shadow page tables as attempting to install an - * MMIO SPTE will just be an expensive nop. Do not cache MMIO - * whose gfn is greater than host.MAXPHYADDR, any guest that - * generates such gfns is running nested and is being tricked - * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if - * and only if L1's MAXPHYADDR is inaccurate with respect to - * the hardware's). - */ - if (unlikely(!enable_mmio_caching) || - unlikely(fault->gfn > kvm_mmu_max_gfn())) - return RET_PF_EMULATE; - } + vcpu_cache_mmio_info(vcpu, gva, fault->gfn, + access & shadow_mmio_access_mask); + + /* + * If MMIO caching is disabled, emulate immediately without + * touching the shadow page tables as attempting to install an + * MMIO SPTE will just be an expensive nop. + */ + if (unlikely(!enable_mmio_caching)) + return RET_PF_EMULATE; + + /* + * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR, + * any guest that generates such gfns is running nested and is being + * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and + * only if L1's MAXPHYADDR is inaccurate with respect to the + * hardware's). + */ + if (unlikely(fault->gfn > kvm_mmu_max_gfn())) + return RET_PF_EMULATE; return RET_PF_CONTINUE; } @@ -4248,7 +4252,8 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return RET_PF_CONTINUE; } -static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + unsigned int access) { int ret; @@ -4262,6 +4267,9 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (unlikely(is_error_pfn(fault->pfn))) return kvm_handle_error_pfn(vcpu, fault); + if (unlikely(!fault->slot)) + return kvm_handle_noslot_fault(vcpu, fault, access); + return RET_PF_CONTINUE; } @@ -4312,11 +4320,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - r = kvm_faultin_pfn(vcpu, fault); - if (r != RET_PF_CONTINUE) - return r; - - r = handle_abnormal_pfn(vcpu, fault, ACC_ALL); + r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 88acf232494b..e5662dbd519c 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -837,11 +837,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault else fault->max_level = walker.level; - r = kvm_faultin_pfn(vcpu, fault); - if (r != RET_PF_CONTINUE) - return r; - - r = handle_abnormal_pfn(vcpu, fault, walker.pte_access); + r = kvm_faultin_pfn(vcpu, fault, walker.pte_access); if (r != RET_PF_CONTINUE) return r; -- cgit v1.2.3 From e5e6f8d254a285e97b02180fdc75d02ffdf9f0ed Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:43 -0700 Subject: KVM: x86/mmu: Initialize fault.{gfn,slot} earlier for direct MMUs Move the initialization of fault.{gfn,slot} earlier in the page fault handling code for fully direct MMUs. This will enable a future commit to split out TDP MMU page fault handling without needing to duplicate the initialization of these 2 fields. Opportunistically take advantage of the fact that fault.gfn is initialized in kvm_tdp_page_fault() rather than recomputing it from fault->addr. No functional change intended. Signed-off-by: David Matlack Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-8-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 5 +---- arch/x86/kvm/mmu/mmu_internal.h | 5 +++++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e2e8c4dfbaa5..cc2683419ff2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4306,9 +4306,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); int r; - fault->gfn = fault->addr >> PAGE_SHIFT; - fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); - if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_EMULATE; @@ -4412,7 +4409,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) { for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); + gfn_t base = fault->gfn & ~(page_num - 1); if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) break; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 1556f596a628..069890789faf 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -280,6 +280,11 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, }; int r; + if (vcpu->arch.mmu->root_role.direct) { + fault.gfn = fault.addr >> PAGE_SHIFT; + fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); + } + /* * Async #PF "faults", a.k.a. prefetch faults, are not faults from the * guest perspective and have already been counted at the time of the -- cgit v1.2.3 From 9aa8ab43b38146029de807a8ff2696f51e15b226 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:44 -0700 Subject: KVM: x86/mmu: Split out TDP MMU page fault handling Split out the page fault handling for the TDP MMU to a separate function. This creates some duplicate code, but makes the TDP MMU fault handler simpler to read by eliminating branches and will enable future cleanups by allowing the TDP MMU and non-TDP MMU fault paths to diverge. Only compile in the TDP MMU fault handler for 64-bit builds since kvm_tdp_mmu_map() does not exist in 32-bit builds. No functional change intended. Signed-off-by: David Matlack Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-9-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 62 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index cc2683419ff2..7b10bff3d2e7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4303,7 +4303,6 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); int r; if (page_fault_handle_page_track(vcpu, fault)) @@ -4322,11 +4321,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return r; r = RET_PF_RETRY; - - if (is_tdp_mmu_fault) - read_lock(&vcpu->kvm->mmu_lock); - else - write_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); if (is_page_fault_stale(vcpu, fault)) goto out_unlock; @@ -4335,16 +4330,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) goto out_unlock; - if (is_tdp_mmu_fault) - r = kvm_tdp_mmu_map(vcpu, fault); - else - r = __direct_map(vcpu, fault); + r = __direct_map(vcpu, fault); out_unlock: - if (is_tdp_mmu_fault) - read_unlock(&vcpu->kvm->mmu_lock); - else - write_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(fault->pfn); return r; } @@ -4392,6 +4381,46 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); +#ifdef CONFIG_X86_64 +static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + int r; + + if (page_fault_handle_page_track(vcpu, fault)) + return RET_PF_EMULATE; + + r = fast_page_fault(vcpu, fault); + if (r != RET_PF_INVALID) + return r; + + r = mmu_topup_memory_caches(vcpu, false); + if (r) + return r; + + r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); + if (r != RET_PF_CONTINUE) + return r; + + r = RET_PF_RETRY; + read_lock(&vcpu->kvm->mmu_lock); + + if (is_page_fault_stale(vcpu, fault)) + goto out_unlock; + + r = make_mmu_pages_available(vcpu); + if (r) + goto out_unlock; + + r = kvm_tdp_mmu_map(vcpu, fault); + +out_unlock: + read_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(fault->pfn); + return r; +} +#endif + int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { /* @@ -4416,6 +4445,11 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } } +#ifdef CONFIG_X86_64 + if (tdp_mmu_enabled) + return kvm_tdp_mmu_page_fault(vcpu, fault); +#endif + return direct_page_fault(vcpu, fault); } -- cgit v1.2.3 From 9f33697ac7ff6945cc06cffb37423176680e2ddf Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:45 -0700 Subject: KVM: x86/mmu: Stop needlessly making MMU pages available for TDP MMU faults Stop calling make_mmu_pages_available() when handling TDP MMU faults. The TDP MMU does not participate in the "available MMU pages" tracking and limiting so calling this function is unnecessary work when handling TDP MMU faults. Signed-off-by: David Matlack Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-10-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7b10bff3d2e7..dfac473d188a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4408,10 +4408,6 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, if (is_page_fault_stale(vcpu, fault)) goto out_unlock; - r = make_mmu_pages_available(vcpu); - if (r) - goto out_unlock; - r = kvm_tdp_mmu_map(vcpu, fault); out_unlock: -- cgit v1.2.3 From 6c882ef4fc7bd99b67ad152e75428b669281c521 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 21 Sep 2022 10:35:46 -0700 Subject: KVM: x86/mmu: Rename __direct_map() to direct_map() Rename __direct_map() to direct_map() since the leading underscores are unnecessary. This also makes the page fault handler names more consistent: kvm_tdp_mmu_page_fault() calls kvm_tdp_mmu_map() and direct_page_fault() calls direct_map(). Opportunistically make some trivial cleanups to comments that had to be modified anyway since they mentioned __direct_map(). Specifically, use "()" when referring to functions, and include kvm_tdp_mmu_map() among the various callers of disallowed_hugepage_adjust(). No functional change intended. Signed-off-by: David Matlack Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Message-Id: <20220921173546.2674386-11-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 14 +++++++------- arch/x86/kvm/mmu/mmu_internal.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index dfac473d188a..7fb7a07a389e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3131,11 +3131,11 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ !is_large_pte(spte) && spte_to_child_sp(spte)->nx_huge_page_disallowed) { /* - * A small SPTE exists for this pfn, but FNAME(fetch) - * and __direct_map would like to create a large PTE - * instead: just force them to go down another level, - * patching back for them into pfn the next 9 bits of - * the address. + * A small SPTE exists for this pfn, but FNAME(fetch), + * direct_map(), or kvm_tdp_mmu_map() would like to create a + * large PTE instead: just force them to go down another level, + * patching back for them into pfn the next 9 bits of the + * address. */ u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) - KVM_PAGES_PER_HPAGE(cur_level - 1); @@ -3144,7 +3144,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ } } -static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -4330,7 +4330,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) goto out_unlock; - r = __direct_map(vcpu, fault); + r = direct_map(vcpu, fault); out_unlock: write_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 069890789faf..ac00bfbf32f6 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -199,7 +199,7 @@ struct kvm_page_fault { /* * Maximum page size that can be created for this fault; input to - * FNAME(fetch), __direct_map and kvm_tdp_mmu_map. + * FNAME(fetch), direct_map() and kvm_tdp_mmu_map(). */ u8 max_level; -- cgit v1.2.3 From de0322f575be9e7fb598d14d58f5a7c7d8394e2b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 12 Oct 2022 18:17:00 +0000 Subject: KVM: x86/mmu: Replace open coded usage of tdp_mmu_page with is_tdp_mmu_page() Use is_tdp_mmu_page() instead of querying sp->tdp_mmu_page directly so that all users benefit if KVM ever finds a way to optimize the logic. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20221012181702.3663607-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7fb7a07a389e..27b233eff6e5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1928,7 +1928,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) return true; /* TDP MMU pages do not use the MMU generation. */ - return !sp->tdp_mmu_page && + return !is_tdp_mmu_page(sp) && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7e5952e95d3b..cc1fb9a65620 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -133,7 +133,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) return; - WARN_ON(!root->tdp_mmu_page); + WARN_ON(!is_tdp_mmu_page(root)); /* * The root now has refcount=0. It is valid, but readers already -- cgit v1.2.3 From 78fdd2f09fb1dfe152fe4a151c484786f1ed6984 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 12 Oct 2022 18:16:59 +0000 Subject: KVM: x86/mmu: Pivot on "TDP MMU enabled" to check if active MMU is TDP MMU Simplify and optimize the logic for detecting if the current/active MMU is a TDP MMU. If the TDP MMU is globally enabled, then the active MMU is a TDP MMU if it is direct. When TDP is enabled, so called nonpaging MMUs are never used as the only form of shadow paging KVM uses is for nested TDP, and the active MMU can't be direct in that case. Rename the helper and take the vCPU instead of an arbitrary MMU, as nonpaging MMUs can show up in the walk_mmu if L1 is using nested TDP and L2 has paging disabled. Taking the vCPU has the added bonus of cleaning up the callers, all of which check the current MMU but wrap code that consumes the vCPU. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20221012181702.3663607-9-seanjc@google.com> [Use tdp_mmu_enabled variable. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 11 ++++++++--- arch/x86/kvm/mmu/tdp_mmu.h | 18 ------------------ 2 files changed, 8 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 27b233eff6e5..c5193bb56bfe 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -616,9 +616,14 @@ static bool mmu_spte_age(u64 *sptep) return true; } +static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu) +{ + return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct; +} + static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - if (is_tdp_mmu(vcpu->arch.mmu)) { + if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_begin(); } else { /* @@ -637,7 +642,7 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - if (is_tdp_mmu(vcpu->arch.mmu)) { + if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_end(); } else { /* @@ -4043,7 +4048,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) walk_shadow_page_lockless_begin(vcpu); - if (is_tdp_mmu(vcpu->arch.mmu)) + if (is_tdp_mmu_active(vcpu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index e4ab2dac269d..0a63b1afabd3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -72,26 +72,8 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, #ifdef CONFIG_X86_64 static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } - -static inline bool is_tdp_mmu(struct kvm_mmu *mmu) -{ - struct kvm_mmu_page *sp; - hpa_t hpa = mmu->root.hpa; - - if (WARN_ON(!VALID_PAGE(hpa))) - return false; - - /* - * A NULL shadow page is legal when shadowing a non-paging guest with - * PAE paging, as the MMU will be direct with root_hpa pointing at the - * pae_root page, not a shadow page. - */ - sp = to_shadow_page(hpa); - return sp && is_tdp_mmu_page(sp) && sp->root_count; -} #else static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } -static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } #endif #endif /* __KVM_X86_MMU_TDP_MMU_H */ -- cgit v1.2.3 From dfe0ecc6f5d32da13783d203403bd3ecacbd3179 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 12 Oct 2022 18:16:58 +0000 Subject: KVM: x86/mmu: Pivot on "TDP MMU enabled" when handling direct page faults When handling direct page faults, pivot on the TDP MMU being globally enabled instead of checking if the target MMU is a TDP MMU. Now that the TDP MMU is all-or-nothing, if the TDP MMU is enabled, KVM will reach direct_page_fault() if and only if the MMU is a TDP MMU. When TDP is enabled (obviously required for the TDP MMU), only non-nested TDP page faults reach direct_page_fault(), i.e. nonpaging MMUs are impossible, as NPT requires paging to be enabled and EPT faults use ept_page_fault(). Signed-off-by: Sean Christopherson Message-Id: <20221012181702.3663607-8-seanjc@google.com> [Use tdp_mmu_enabled variable. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c5193bb56bfe..15d389370f88 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3372,7 +3372,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) do { u64 new_spte; - if (is_tdp_mmu(vcpu->arch.mmu)) + if (tdp_mmu_enabled) sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); else sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); -- cgit v1.2.3 From b961aa757f940bebc608a412f66951e52a39a298 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 13 Oct 2022 11:58:43 +0200 Subject: x86/hyperv: Add HV_EXPOSE_INVARIANT_TSC define Avoid open coding BIT(0) of HV_X64_MSR_TSC_INVARIANT_CONTROL by adding a dedicated define. While there's only one user at this moment, the upcoming KVM implementation of Hyper-V Invariant TSC feature will need to use it as well. Reviewed-by: Michael Kelley Signed-off-by: Vitaly Kuznetsov Reviewed-by: Sean Christopherson Message-Id: <20221013095849.705943-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 3 +++ arch/x86/kernel/cpu/mshyperv.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index e3efaf6e6b62..617332dd64ac 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -255,6 +255,9 @@ enum hv_isolation_type { /* TSC invariant control */ #define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118 +/* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */ +#define HV_EXPOSE_INVARIANT_TSC BIT_ULL(0) + /* Register name aliases for temporary compatibility */ #define HV_X64_MSR_STIMER0_COUNT HV_REGISTER_STIMER0_COUNT #define HV_X64_MSR_STIMER0_CONFIG HV_REGISTER_STIMER0_CONFIG diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 831613959a92..e402923800d7 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -388,7 +388,7 @@ static void __init ms_hyperv_init_platform(void) * setting of this MSR bit should happen before init_intel() * is called. */ - wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1); + wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } -- cgit v1.2.3 From 0fcf86f05af208e4e6c62cf1a7e7e28bc6c9d9f9 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 13 Oct 2022 11:58:44 +0200 Subject: KVM: x86: Add a KVM-only leaf for CPUID_8000_0007_EDX CPUID_8000_0007_EDX may come handy when X86_FEATURE_CONSTANT_TSC needs to be checked. No functional change intended. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Sean Christopherson Message-Id: <20221013095849.705943-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 8 ++++++-- arch/x86/kvm/reverse_cpuid.h | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0b5bf013fcb8..ff342c9da172 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -701,6 +701,10 @@ void kvm_set_cpu_caps(void) if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64)) kvm_cpu_cap_set(X86_FEATURE_GBPAGES); + kvm_cpu_cap_init_kvm_defined(CPUID_8000_0007_EDX, + SF(CONSTANT_TSC) + ); + kvm_cpu_cap_mask(CPUID_8000_0008_EBX, F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | @@ -1153,8 +1157,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx &= ~GENMASK(17, 16); break; case 0x80000007: /* Advanced power management */ - /* invariant TSC is CPUID.80000007H:EDX[8] */ - entry->edx &= (1 << 8); + cpuid_entry_override(entry, CPUID_8000_0007_EDX); + /* mask against host */ entry->edx &= boot_cpu_data.x86_power; entry->eax = entry->ebx = entry->ecx = 0; diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 203fdad07bae..25b9b51abb20 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -14,6 +14,7 @@ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, CPUID_7_1_EDX, + CPUID_8000_0007_EDX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -42,6 +43,9 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) +/* CPUID level 0x80000007 (EDX). */ +#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) + struct cpuid_reg { u32 function; u32 index; @@ -67,6 +71,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX}, [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX}, [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX}, + [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX}, }; /* @@ -97,6 +102,8 @@ static __always_inline u32 __feature_translate(int x86_feature) return KVM_X86_FEATURE_SGX1; else if (x86_feature == X86_FEATURE_SGX2) return KVM_X86_FEATURE_SGX2; + else if (x86_feature == X86_FEATURE_CONSTANT_TSC) + return KVM_X86_FEATURE_CONSTANT_TSC; return x86_feature; } -- cgit v1.2.3 From 2be1bd3a70c81835b8969216ceedbd017fea732d Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 13 Oct 2022 11:58:45 +0200 Subject: KVM: x86: Hyper-V invariant TSC control Normally, genuine Hyper-V doesn't expose architectural invariant TSC (CPUID.80000007H:EDX[8]) to its guests by default. A special PV MSR (HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x40000118) and corresponding CPUID feature bit (CPUID.0x40000003.EAX[15]) were introduced. When bit 0 of the PV MSR is set, invariant TSC bit starts to show up in CPUID. When the feature is exposed to Hyper-V guests, reenlightenment becomes unneeded. Add the feature to KVM. Keep CPUID output intact when the feature wasn't exposed to L1 and implement the required logic for hiding invariant TSC when the feature was exposed and invariant TSC control MSR wasn't written to. Copy genuine Hyper-V behavior and forbid to disable the feature once it was enabled. For the reference, for linux guests, support for the feature was added in commit dce7cd62754b ("x86/hyperv: Allow guests to enable InvariantTSC"). Signed-off-by: Vitaly Kuznetsov Reviewed-by: Sean Christopherson Message-Id: <20221013095849.705943-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 3 +++ arch/x86/kvm/hyperv.c | 19 +++++++++++++++++++ arch/x86/kvm/hyperv.h | 27 +++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 4 +++- 5 files changed, 53 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aa4eb8cfcd7e..c70690b2c82d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1088,6 +1088,7 @@ struct kvm_hv { u64 hv_reenlightenment_control; u64 hv_tsc_emulation_control; u64 hv_tsc_emulation_status; + u64 hv_invtsc_control; /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ff342c9da172..c3f084185daf 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1488,6 +1488,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && (data & TSX_CTRL_CPUID_CLEAR)) *ebx &= ~(F(RTM) | F(HLE)); + } else if (function == 0x80000007) { + if (kvm_hv_invtsc_suppressed(vcpu)) + *edx &= ~SF(CONSTANT_TSC); } } else { *eax = *ebx = *ecx = *edx = 0; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index e8296942a868..80d082ecd5c5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -999,6 +999,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: r = true; @@ -1283,6 +1284,9 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_TSC_EMULATION_STATUS: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_REENLIGHTENMENT; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + return hv_vcpu->cpuid_cache.features_eax & + HV_ACCESS_TSC_INVARIANT; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: return hv_vcpu->cpuid_cache.features_edx & @@ -1410,6 +1414,17 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, if (!host) return 1; break; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + /* Only bit 0 is supported */ + if (data & ~HV_EXPOSE_INVARIANT_TSC) + return 1; + + /* The feature can't be disabled from the guest */ + if (!host && hv->hv_invtsc_control && !data) + return 1; + + hv->hv_invtsc_control = data; + break; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_set_msr(vcpu, msr, data, host); @@ -1585,6 +1600,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_TSC_EMULATION_STATUS: data = hv->hv_tsc_emulation_status; break; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + data = hv->hv_invtsc_control; + break; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_get_msr(vcpu, msr, pdata, host); @@ -2733,6 +2751,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; ent->eax |= HV_ACCESS_FREQUENCY_MSRS; ent->eax |= HV_ACCESS_REENLIGHTENMENT; + ent->eax |= HV_ACCESS_TSC_INVARIANT; ent->ebx |= HV_POST_MESSAGES; ent->ebx |= HV_SIGNAL_EVENTS; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 9f96414a31c5..f83b8db72b11 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -136,6 +136,33 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) HV_SYNIC_STIMER_COUNT); } +/* + * With HV_ACCESS_TSC_INVARIANT feature, invariant TSC (CPUID.80000007H:EDX[8]) + * is only observed after HV_X64_MSR_TSC_INVARIANT_CONTROL was written to. + */ +static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + /* + * If Hyper-V's invariant TSC control is not exposed to the guest, + * the invariant TSC CPUID flag is not suppressed, Windows guests were + * observed to be able to handle it correctly. Going forward, VMMs are + * encouraged to enable Hyper-V's invariant TSC control when invariant + * TSC CPUID flag is set to make KVM's behavior match genuine Hyper-V. + */ + if (!hv_vcpu || + !(hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_TSC_INVARIANT)) + return false; + + /* + * If Hyper-V's invariant TSC control is exposed to the guest, KVM is + * responsible for suppressing the invariant TSC CPUID flag if the + * Hyper-V control is not enabled. + */ + return !(to_kvm_hv(vcpu->kvm)->hv_invtsc_control & HV_EXPOSE_INVARIANT_TSC); +} + void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); void kvm_hv_setup_tsc_page(struct kvm *kvm, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5c3ce39cdccb..39b8dd37bc40 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1480,7 +1480,7 @@ static const u32 emulated_msrs_all[] = { HV_X64_MSR_STIMER0_CONFIG, HV_X64_MSR_VP_ASSIST_PAGE, HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, - HV_X64_MSR_TSC_EMULATION_STATUS, + HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_X64_MSR_SYNDBG_OPTIONS, HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, @@ -3821,6 +3821,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: return kvm_hv_set_msr_common(vcpu, msr, data, msr_info->host_initiated); case MSR_IA32_BBL_CR_CTL3: @@ -4191,6 +4192,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); -- cgit v1.2.3 From 1935542a04cc12cb360885a0ff5583d5e259c32f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:51 +0000 Subject: KVM: x86: Do timer initialization after XCR0 configuration Move kvm_arch_init()'s call to kvm_timer_init() down a few lines below the XCR0 configuration code. A future patch will move hardware setup into kvm_arch_init() and slot in vendor hardware setup before the call to kvm_timer_init() so that timer initialization (among other stuff) doesn't need to be unwound if vendor setup fails. XCR0 setup on the other hand needs to happen before vendor hardware setup. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f6ef44dc8a0e..41bbedf5d2de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9341,13 +9341,13 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; - kvm_timer_init(); - if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; } + kvm_timer_init(); + if (pi_inject_timer == -1) pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER); #ifdef CONFIG_X86_64 -- cgit v1.2.3 From b7483387e374977396152298bfbddee1ca0a062e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:52 +0000 Subject: KVM: x86: Move hardware setup/unsetup to init/exit Now that kvm_arch_hardware_setup() is called immediately after kvm_arch_init(), fold the guts of kvm_arch_hardware_(un)setup() into kvm_arch_{init,exit}() as a step towards dropping one of the hooks. To avoid having to unwind various setup, e.g registration of several notifiers, slot in the vendor hardware setup before the registration of said notifiers and callbacks. Introducing a functional change while moving code is less than ideal, but the alternative is adding a pile of unwinding code, which is much more error prone, e.g. several attempts to move the setup code verbatim all introduced bugs. Add a comment to document that kvm_ops_update() is effectively the point of no return, e.g. it sets the kvm_x86_ops.hardware_enable canary and so needs to be unwound. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 121 ++++++++++++++++++++++++++++------------------------- 1 file changed, 63 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41bbedf5d2de..4e7a71cab828 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9273,6 +9273,24 @@ static struct notifier_block pvclock_gtod_notifier = { }; #endif +static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) +{ + memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); + +#define __KVM_X86_OP(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func); +#define KVM_X86_OP(func) \ + WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ + (void *)__static_call_return0); +#include +#undef __KVM_X86_OP + + kvm_pmu_ops_update(ops->pmu_ops); +} + int kvm_arch_init(void *opaque) { struct kvm_x86_init_ops *ops = opaque; @@ -9346,6 +9364,24 @@ int kvm_arch_init(void *opaque) kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; } + rdmsrl_safe(MSR_EFER, &host_efer); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + + kvm_init_pmu_capability(); + + r = ops->hardware_setup(); + if (r != 0) + goto out_mmu_exit; + + /* + * Point of no return! DO NOT add error paths below this point unless + * absolutely necessary, as most operations from this point forward + * require unwinding. + */ + kvm_ops_update(ops); + kvm_timer_init(); if (pi_inject_timer == -1) @@ -9357,8 +9393,32 @@ int kvm_arch_init(void *opaque) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif + kvm_register_perf_callbacks(ops->handle_intel_pt_intr); + + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + kvm_caps.supported_xss = 0; + +#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) + cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); +#undef __kvm_cpu_cap_has + + if (kvm_caps.has_tsc_control) { + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated because it will always + * be 1 on all machines. + */ + u64 max = min(0x7fffffffULL, + __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz)); + kvm_caps.max_guest_tsc_khz = max; + } + kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; + kvm_init_msr_list(); return 0; +out_mmu_exit: + kvm_mmu_vendor_module_exit(); out_free_percpu: free_percpu(user_return_msrs); out_free_x86_emulator_cache: @@ -9368,6 +9428,8 @@ out_free_x86_emulator_cache: void kvm_arch_exit(void) { + kvm_unregister_perf_callbacks(); + #ifdef CONFIG_X86_64 if (hypervisor_is_type(X86_HYPER_MS_HYPERV)) clear_hv_tscchange_cb(); @@ -9384,6 +9446,7 @@ void kvm_arch_exit(void) irq_work_sync(&pvclock_irq_work); cancel_work_sync(&pvclock_gtod_work); #endif + static_call(kvm_x86_hardware_unsetup)(); kvm_x86_ops.hardware_enable = NULL; kvm_mmu_vendor_module_exit(); free_percpu(user_return_msrs); @@ -11970,72 +12033,14 @@ void kvm_arch_hardware_disable(void) drop_user_return_notifiers(); } -static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) -{ - memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); - -#define __KVM_X86_OP(func) \ - static_call_update(kvm_x86_##func, kvm_x86_ops.func); -#define KVM_X86_OP(func) \ - WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) -#define KVM_X86_OP_OPTIONAL __KVM_X86_OP -#define KVM_X86_OP_OPTIONAL_RET0(func) \ - static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ - (void *)__static_call_return0); -#include -#undef __KVM_X86_OP - - kvm_pmu_ops_update(ops->pmu_ops); -} - int kvm_arch_hardware_setup(void *opaque) { - struct kvm_x86_init_ops *ops = opaque; - int r; - - rdmsrl_safe(MSR_EFER, &host_efer); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - - kvm_init_pmu_capability(); - - r = ops->hardware_setup(); - if (r != 0) - return r; - - kvm_ops_update(ops); - - kvm_register_perf_callbacks(ops->handle_intel_pt_intr); - - if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) - kvm_caps.supported_xss = 0; - -#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) - cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); -#undef __kvm_cpu_cap_has - - if (kvm_caps.has_tsc_control) { - /* - * Make sure the user can only configure tsc_khz values that - * fit into a signed integer. - * A min value is not calculated because it will always - * be 1 on all machines. - */ - u64 max = min(0x7fffffffULL, - __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz)); - kvm_caps.max_guest_tsc_khz = max; - } - kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; - kvm_init_msr_list(); return 0; } void kvm_arch_hardware_unsetup(void) { - kvm_unregister_perf_callbacks(); - static_call(kvm_x86_hardware_unsetup)(); } int kvm_arch_check_processor_compat(void *opaque) -- cgit v1.2.3 From 63a1bd8ad1ac9e4e8bfcd5914c8899606e04898d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:53 +0000 Subject: KVM: Drop arch hardware (un)setup hooks Drop kvm_arch_hardware_setup() and kvm_arch_hardware_unsetup() now that all implementations are nops. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Eric Farman # s390 Acked-by: Anup Patel Message-Id: <20221130230934.1014142-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/arm.c | 5 ----- arch/mips/include/asm/kvm_host.h | 1 - arch/mips/kvm/mips.c | 5 ----- arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/kvm/powerpc.c | 5 ----- arch/riscv/include/asm/kvm_host.h | 1 - arch/riscv/kvm/main.c | 5 ----- arch/s390/kvm/kvm-s390.c | 10 ---------- arch/x86/kvm/x86.c | 10 ---------- include/linux/kvm_host.h | 2 -- virt/kvm/kvm_main.c | 7 ------- 12 files changed, 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 35a159d131b5..f50951c51d3b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -943,7 +943,6 @@ static inline bool kvm_system_needs_idmapped_vectors(void) void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu); -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9c5573bc4614..e6c21b804c13 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -63,11 +63,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - int kvm_arch_check_processor_compat(void *opaque) { return 0; diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 5cedb28e8a40..28f0ba97db71 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -888,7 +888,6 @@ extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm); extern int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_mips_interrupt *irq); -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a25e0b73ee70..af29490d9740 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -135,11 +135,6 @@ void kvm_arch_hardware_disable(void) kvm_mips_callbacks->hardware_disable(); } -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - int kvm_arch_check_processor_compat(void *opaque) { return 0; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index caea15dcb91d..5d2c3a487e73 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -877,7 +877,6 @@ struct kvm_vcpu_arch { #define __KVM_HAVE_CREATE_DEVICE static inline void kvm_arch_hardware_disable(void) {} -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 04494a4fb37a..5faf69421f13 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -440,11 +440,6 @@ int kvm_arch_hardware_enable(void) return 0; } -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - int kvm_arch_check_processor_compat(void *opaque) { return kvmppc_core_check_processor_compat(); diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 93f43a3e7886..a79b7d1514db 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -230,7 +230,6 @@ struct kvm_vcpu_arch { bool pause; }; -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 58c5489d3031..afd6400a9e80 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -25,11 +25,6 @@ int kvm_arch_check_processor_compat(void *opaque) return 0; } -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - int kvm_arch_hardware_enable(void) { unsigned long hideleg, hedeleg; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 97c7ccd189eb..829e6e046003 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -329,16 +329,6 @@ static struct notifier_block kvm_clock_notifier = { .notifier_call = kvm_clock_sync, }; -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - -void kvm_arch_hardware_unsetup(void) -{ - -} - static void allow_cpu_feat(unsigned long nr) { set_bit_inv(nr, kvm_s390_available_cpu_feat); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4e7a71cab828..089f0eeea850 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12033,16 +12033,6 @@ void kvm_arch_hardware_disable(void) drop_user_return_notifiers(); } -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - -void kvm_arch_hardware_unsetup(void) -{ - -} - int kvm_arch_check_processor_compat(void *opaque) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4f26b244f6d0..7c1009c0e66d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1447,8 +1447,6 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} int kvm_arch_hardware_enable(void); void kvm_arch_hardware_disable(void); -int kvm_arch_hardware_setup(void *opaque); -void kvm_arch_hardware_unsetup(void); int kvm_arch_check_processor_compat(void *opaque); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 70264378da41..8393347fd35f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5931,10 +5931,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, if (r) return r; - r = kvm_arch_hardware_setup(opaque); - if (r < 0) - goto err_hw_setup; - if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; goto err_hw_enabled; @@ -6027,8 +6023,6 @@ out_free_3: out_free_2: free_cpumask_var(cpus_hardware_enabled); err_hw_enabled: - kvm_arch_hardware_unsetup(); -err_hw_setup: kvm_arch_exit(); return r; } @@ -6057,7 +6051,6 @@ void kvm_exit(void) on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); - kvm_arch_hardware_unsetup(); kvm_arch_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit v1.2.3 From 2916b70fc342719f570640de07251b7f91feebdb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:54 +0000 Subject: KVM: VMX: Reset eVMCS controls in VP assist page during hardware disabling Reset the eVMCS controls in the per-CPU VP assist page during hardware disabling instead of waiting until kvm-intel's module exit. The controls are activated if and only if KVM creates a VM, i.e. don't need to be reset if hardware is never enabled. Doing the reset during hardware disabling will naturally fix a potential NULL pointer deref bug once KVM disables CPU hotplug while enabling and disabling hardware (which is necessary to fix a variety of bugs). If the kernel is running as the root partition, the VP assist page is unmapped during CPU hot unplug, and so KVM's clearing of the eVMCS controls needs to occur with CPU hot(un)plug disabled, otherwise KVM could attempt to write to a CPU's VP assist page after it's unmapped. Reported-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Message-Id: <20221130230934.1014142-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 50 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0220e22b89ca..a68d1adfd66b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -551,6 +551,33 @@ static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) return 0; } +static void hv_reset_evmcs(void) +{ + struct hv_vp_assist_page *vp_ap; + + if (!static_branch_unlikely(&enable_evmcs)) + return; + + /* + * KVM should enable eVMCS if and only if all CPUs have a VP assist + * page, and should reject CPU onlining if eVMCS is enabled the CPU + * doesn't have a VP assist page allocated. + */ + vp_ap = hv_get_vp_assist_page(smp_processor_id()); + if (WARN_ON_ONCE(!vp_ap)) + return; + + /* + * Reset everything to support using non-enlightened VMCS access later + * (e.g. when we reload the module with enlightened_vmcs=0) + */ + vp_ap->nested_control.features.directhypercall = 0; + vp_ap->current_nested_vmcs = 0; + vp_ap->enlighten_vmentry = 0; +} + +#else /* IS_ENABLED(CONFIG_HYPERV) */ +static void hv_reset_evmcs(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ /* @@ -2527,6 +2554,8 @@ static void vmx_hardware_disable(void) if (cpu_vmxoff()) kvm_spurious_fault(); + hv_reset_evmcs(); + intel_pt_handle_vmx(0); } @@ -8505,27 +8534,8 @@ static void vmx_exit(void) kvm_exit(); #if IS_ENABLED(CONFIG_HYPERV) - if (static_branch_unlikely(&enable_evmcs)) { - int cpu; - struct hv_vp_assist_page *vp_ap; - /* - * Reset everything to support using non-enlightened VMCS - * access later (e.g. when we reload the module with - * enlightened_vmcs=0) - */ - for_each_online_cpu(cpu) { - vp_ap = hv_get_vp_assist_page(cpu); - - if (!vp_ap) - continue; - - vp_ap->nested_control.features.directhypercall = 0; - vp_ap->current_nested_vmcs = 0; - vp_ap->enlighten_vmentry = 0; - } - + if (static_branch_unlikely(&enable_evmcs)) static_branch_disable(&enable_evmcs); - } #endif vmx_cleanup_l1d_flush(); -- cgit v1.2.3 From da66de44b01e9b7fa09731057593850394bf32e4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:55 +0000 Subject: KVM: VMX: Don't bother disabling eVMCS static key on module exit Don't disable the eVMCS static key on module exit, kvm_intel.ko owns the key so there can't possibly be users after the kvm_intel.ko is unloaded, at least not without much bigger issues. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-12-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a68d1adfd66b..d9139738dd2c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8533,10 +8533,6 @@ static void vmx_exit(void) kvm_exit(); -#if IS_ENABLED(CONFIG_HYPERV) - if (static_branch_unlikely(&enable_evmcs)) - static_branch_disable(&enable_evmcs); -#endif vmx_cleanup_l1d_flush(); allow_smaller_maxphyaddr = false; -- cgit v1.2.3 From 451d39e8006111c4449830e01ec460024bf58817 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:56 +0000 Subject: KVM: VMX: Move Hyper-V eVMCS initialization to helper Move Hyper-V's eVMCS initialization to a dedicated helper to clean up vmx_init(), and add a comment to call out that the Hyper-V init code doesn't need to be unwound if vmx_init() ultimately fails. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Message-Id: <20221130230934.1014142-13-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 73 +++++++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d9139738dd2c..703d81edfacb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -523,6 +523,8 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) static unsigned long host_idt_base; #if IS_ENABLED(CONFIG_HYPERV) +static struct kvm_x86_ops vmx_x86_ops __initdata; + static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); @@ -551,6 +553,43 @@ static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) return 0; } +static __init void hv_init_evmcs(void) +{ + int cpu; + + if (!enlightened_vmcs) + return; + + /* + * Enlightened VMCS usage should be recommended and the host needs + * to support eVMCS v1 or above. + */ + if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= + KVM_EVMCS_VERSION) { + + /* Check that we have assist pages on all online CPUs */ + for_each_online_cpu(cpu) { + if (!hv_get_vp_assist_page(cpu)) { + enlightened_vmcs = false; + break; + } + } + + if (enlightened_vmcs) { + pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); + static_branch_enable(&enable_evmcs); + } + + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) + vmx_x86_ops.enable_l2_tlb_flush + = hv_enable_l2_tlb_flush; + + } else { + enlightened_vmcs = false; + } +} + static void hv_reset_evmcs(void) { struct hv_vp_assist_page *vp_ap; @@ -577,6 +616,7 @@ static void hv_reset_evmcs(void) } #else /* IS_ENABLED(CONFIG_HYPERV) */ +static void hv_init_evmcs(void) {} static void hv_reset_evmcs(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ @@ -8543,38 +8583,11 @@ static int __init vmx_init(void) { int r, cpu; -#if IS_ENABLED(CONFIG_HYPERV) /* - * Enlightened VMCS usage should be recommended and the host needs - * to support eVMCS v1 or above. We can also disable eVMCS support - * with module parameter. + * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing + * to unwind if a later step fails. */ - if (enlightened_vmcs && - ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && - (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= - KVM_EVMCS_VERSION) { - - /* Check that we have assist pages on all online CPUs */ - for_each_online_cpu(cpu) { - if (!hv_get_vp_assist_page(cpu)) { - enlightened_vmcs = false; - break; - } - } - - if (enlightened_vmcs) { - pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); - static_branch_enable(&enable_evmcs); - } - - if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) - vmx_x86_ops.enable_l2_tlb_flush - = hv_enable_l2_tlb_flush; - - } else { - enlightened_vmcs = false; - } -#endif + hv_init_evmcs(); r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); -- cgit v1.2.3 From 4f8396b96a9fc672964842fe7adbe8ddca8a3adf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:57 +0000 Subject: KVM: x86: Move guts of kvm_arch_init() to standalone helper Move the guts of kvm_arch_init() to a new helper, kvm_x86_vendor_init(), so that VMX can do _all_ arch and vendor initialization before calling kvm_init(). Calling kvm_init() must be the _very_ last step during init, as kvm_init() exposes /dev/kvm to userspace, i.e. allows creating VMs. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm/svm.c | 23 +++++++++++++++++++++-- arch/x86/kvm/vmx/vmx.c | 21 +++++++++++++++------ arch/x86/kvm/x86.c | 15 +++++++++++++-- 4 files changed, 52 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c70690b2c82d..4899505955b5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1751,6 +1751,9 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops); +void kvm_x86_vendor_exit(void); + #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9a194aa1a75a..a477eeecc8aa 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5097,15 +5097,34 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { static int __init svm_init(void) { + int r; + __unused_size_checks(); - return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + r = kvm_x86_vendor_init(&svm_init_ops); + if (r) + return r; + + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), + __alignof__(struct vcpu_svm), THIS_MODULE); + if (r) + goto err_kvm_init; + + return 0; + +err_kvm_init: + kvm_x86_vendor_exit(); + return r; } static void __exit svm_exit(void) { kvm_exit(); + kvm_x86_vendor_exit(); } module_init(svm_init) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 703d81edfacb..1e2eab6bda16 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8572,6 +8572,7 @@ static void vmx_exit(void) #endif kvm_exit(); + kvm_x86_vendor_exit(); vmx_cleanup_l1d_flush(); @@ -8589,23 +8590,25 @@ static int __init vmx_init(void) */ hv_init_evmcs(); + r = kvm_x86_vendor_init(&vmx_init_ops); + if (r) + return r; + r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) - return r; + goto err_kvm_init; /* - * Must be called after kvm_init() so enable_ept is properly set + * Must be called after common x86 init so enable_ept is properly set * up. Hand the parameter mitigation value in which was stored in * the pre module init parser. If no parameter was given, it will * contain 'auto' which will be turned into the default 'cond' * mitigation mode. */ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); - if (r) { - vmx_exit(); - return r; - } + if (r) + goto err_l1d_flush; vmx_setup_fb_clear_ctrl(); @@ -8630,5 +8633,11 @@ static int __init vmx_init(void) allow_smaller_maxphyaddr = true; return 0; + +err_l1d_flush: + vmx_exit(); +err_kvm_init: + kvm_x86_vendor_exit(); + return r; } module_init(vmx_init); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 089f0eeea850..9837f57d5004 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9293,7 +9293,16 @@ static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) int kvm_arch_init(void *opaque) { - struct kvm_x86_init_ops *ops = opaque; + return 0; +} + +void kvm_arch_exit(void) +{ + +} + +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) +{ u64 host_pat; int r; @@ -9425,8 +9434,9 @@ out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); return r; } +EXPORT_SYMBOL_GPL(kvm_x86_vendor_init); -void kvm_arch_exit(void) +void kvm_x86_vendor_exit(void) { kvm_unregister_perf_callbacks(); @@ -9456,6 +9466,7 @@ void kvm_arch_exit(void) WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); #endif } +EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) { -- cgit v1.2.3 From e32b120071ea114efc0b4ddd439547750b85f618 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:58 +0000 Subject: KVM: VMX: Do _all_ initialization before exposing /dev/kvm to userspace Call kvm_init() only after _all_ setup is complete, as kvm_init() exposes /dev/kvm to userspace and thus allows userspace to create VMs (and call other ioctls). E.g. KVM will encounter a NULL pointer when attempting to add a vCPU to the per-CPU loaded_vmcss_on_cpu list if userspace is able to create a VM before vmx_init() configures said list. BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] SMP CPU: 6 PID: 1143 Comm: stable Not tainted 6.0.0-rc7+ #988 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:vmx_vcpu_load_vmcs+0x68/0x230 [kvm_intel] vmx_vcpu_load+0x16/0x60 [kvm_intel] kvm_arch_vcpu_load+0x32/0x1f0 [kvm] vcpu_load+0x2f/0x40 [kvm] kvm_arch_vcpu_create+0x231/0x310 [kvm] kvm_vm_ioctl+0x79f/0xe10 [kvm] ? handle_mm_fault+0xb1/0x220 __x64_sys_ioctl+0x80/0xb0 do_syscall_64+0x2b/0x50 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7f5a6b05743b Modules linked in: vhost_net vhost vhost_iotlb tap kvm_intel(+) kvm irqbypass Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-15-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1e2eab6bda16..e0e3f2c1f681 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8564,19 +8564,23 @@ static void vmx_cleanup_l1d_flush(void) l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; } -static void vmx_exit(void) +static void __vmx_exit(void) { + allow_smaller_maxphyaddr = false; + #ifdef CONFIG_KEXEC_CORE RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); #endif + vmx_cleanup_l1d_flush(); +} +static void vmx_exit(void) +{ kvm_exit(); kvm_x86_vendor_exit(); - vmx_cleanup_l1d_flush(); - - allow_smaller_maxphyaddr = false; + __vmx_exit(); } module_exit(vmx_exit); @@ -8594,11 +8598,6 @@ static int __init vmx_init(void) if (r) return r; - r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); - if (r) - goto err_kvm_init; - /* * Must be called after common x86 init so enable_ept is properly set * up. Hand the parameter mitigation value in which was stored in @@ -8632,11 +8631,20 @@ static int __init vmx_init(void) if (!enable_ept) allow_smaller_maxphyaddr = true; + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), + __alignof__(struct vcpu_vmx), THIS_MODULE); + if (r) + goto err_kvm_init; + return 0; -err_l1d_flush: - vmx_exit(); err_kvm_init: + __vmx_exit(); +err_l1d_flush: kvm_x86_vendor_exit(); return r; } -- cgit v1.2.3 From 3af4a9e61e71117d5df2df3e1605fa062e04b869 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:59 +0000 Subject: KVM: x86: Serialize vendor module initialization (hardware setup) Acquire a new mutex, vendor_module_lock, in kvm_x86_vendor_init() while doing hardware setup to ensure that concurrent calls are fully serialized. KVM rejects attempts to load vendor modules if a different module has already been loaded, but doesn't handle the case where multiple vendor modules are loaded at the same time, and module_init() doesn't run under the global module_mutex. Note, in practice, this is likely a benign bug as no platform exists that supports both SVM and VMX, i.e. barring a weird VM setup, one of the vendor modules is guaranteed to fail a support check before modifying common KVM state. Alternatively, KVM could perform an atomic CMPXCHG on .hardware_enable, but that comes with its own ugliness as it would require setting .hardware_enable before success is guaranteed, e.g. attempting to load the "wrong" could result in spurious failure to load the "right" module. Introduce a new mutex as using kvm_lock is extremely deadlock prone due to kvm_lock being taken under cpus_write_lock(), and in the future, under under cpus_read_lock(). Any operation that takes cpus_read_lock() while holding kvm_lock would potentially deadlock, e.g. kvm_timer_init() takes cpus_read_lock() to register a callback. In theory, KVM could avoid such problematic paths, i.e. do less setup under kvm_lock, but avoiding all calls to cpus_read_lock() is subtly difficult and thus fragile. E.g. updating static calls also acquires cpus_read_lock(). Inverting the lock ordering, i.e. always taking kvm_lock outside cpus_read_lock(), is not a viable option as kvm_lock is taken in various callbacks that may be invoked under cpus_read_lock(), e.g. x86's kvmclock_cpufreq_notifier(). The lockdep splat below is dependent on future patches to take cpus_read_lock() in hardware_enable_all(), but as above, deadlock is already is already possible. ====================================================== WARNING: possible circular locking dependency detected 6.0.0-smp--7ec93244f194-init2 #27 Tainted: G O ------------------------------------------------------ stable/251833 is trying to acquire lock: ffffffffc097ea28 (kvm_lock){+.+.}-{3:3}, at: hardware_enable_all+0x1f/0xc0 [kvm] but task is already holding lock: ffffffffa2456828 (cpu_hotplug_lock){++++}-{0:0}, at: hardware_enable_all+0xf/0xc0 [kvm] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (cpu_hotplug_lock){++++}-{0:0}: cpus_read_lock+0x2a/0xa0 __cpuhp_setup_state+0x2b/0x60 __kvm_x86_vendor_init+0x16a/0x1870 [kvm] kvm_x86_vendor_init+0x23/0x40 [kvm] 0xffffffffc0a4d02b do_one_initcall+0x110/0x200 do_init_module+0x4f/0x250 load_module+0x1730/0x18f0 __se_sys_finit_module+0xca/0x100 __x64_sys_finit_module+0x1d/0x20 do_syscall_64+0x3d/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #0 (kvm_lock){+.+.}-{3:3}: __lock_acquire+0x16f4/0x30d0 lock_acquire+0xb2/0x190 __mutex_lock+0x98/0x6f0 mutex_lock_nested+0x1b/0x20 hardware_enable_all+0x1f/0xc0 [kvm] kvm_dev_ioctl+0x45e/0x930 [kvm] __se_sys_ioctl+0x77/0xc0 __x64_sys_ioctl+0x1d/0x20 do_syscall_64+0x3d/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(cpu_hotplug_lock); lock(kvm_lock); lock(cpu_hotplug_lock); lock(kvm_lock); *** DEADLOCK *** 1 lock held by stable/251833: #0: ffffffffa2456828 (cpu_hotplug_lock){++++}-{0:0}, at: hardware_enable_all+0xf/0xc0 [kvm] Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-16-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/locking.rst | 6 ++++++ arch/x86/kvm/x86.c | 18 ++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index a3ca76f9be75..0f6067d13622 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -291,3 +291,9 @@ time it will be set using the Dirty tracking mechanism described above. wakeup notification event since external interrupts from the assigned devices happens, we will find the vCPU on the list to wakeup. + +``vendor_module_lock`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:Type: mutex +:Arch: x86 +:Protects: loading a vendor module (kvm_amd or kvm_intel) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9837f57d5004..78cb6a46dd50 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -128,6 +128,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu); static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); +static DEFINE_MUTEX(vendor_module_lock); struct kvm_x86_ops kvm_x86_ops __read_mostly; #define KVM_X86_OP(func) \ @@ -9301,7 +9302,7 @@ void kvm_arch_exit(void) } -int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) +static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) { u64 host_pat; int r; @@ -9434,6 +9435,17 @@ out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); return r; } + +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) +{ + int r; + + mutex_lock(&vendor_module_lock); + r = __kvm_x86_vendor_init(ops); + mutex_unlock(&vendor_module_lock); + + return r; +} EXPORT_SYMBOL_GPL(kvm_x86_vendor_init); void kvm_x86_vendor_exit(void) @@ -9457,7 +9469,6 @@ void kvm_x86_vendor_exit(void) cancel_work_sync(&pvclock_gtod_work); #endif static_call(kvm_x86_hardware_unsetup)(); - kvm_x86_ops.hardware_enable = NULL; kvm_mmu_vendor_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); @@ -9465,6 +9476,9 @@ void kvm_x86_vendor_exit(void) static_key_deferred_flush(&kvm_xen_enabled); WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); #endif + mutex_lock(&vendor_module_lock); + kvm_x86_ops.hardware_enable = NULL; + mutex_unlock(&vendor_module_lock); } EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); -- cgit v1.2.3 From a578a0a9e3526483ad1904fac019d95e7089fb34 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:13 +0000 Subject: KVM: Drop kvm_arch_{init,exit}() hooks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop kvm_arch_init() and kvm_arch_exit() now that all implementations are nops. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Eric Farman # s390 Reviewed-by: Philippe Mathieu-Daudé Acked-by: Anup Patel Message-Id: <20221130230934.1014142-30-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/arm.c | 11 ----------- arch/mips/kvm/mips.c | 10 ---------- arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/kvm/powerpc.c | 5 ----- arch/riscv/kvm/main.c | 9 --------- arch/s390/kvm/kvm-s390.c | 10 ---------- arch/x86/kvm/x86.c | 10 ---------- include/linux/kvm_host.h | 3 --- virt/kvm/kvm_main.c | 19 ++----------------- 9 files changed, 2 insertions(+), 76 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ea8057fa113f..04ed741f7b9d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2301,17 +2301,6 @@ out_err: return err; } -int kvm_arch_init(void *opaque) -{ - return 0; -} - -/* NOP: Compiling as a module not supported */ -void kvm_arch_exit(void) -{ - -} - static int __init early_kvm_mode_cfg(char *arg) { if (!arg) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index ae7a24342fdf..3cade648827a 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1010,16 +1010,6 @@ long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) return r; } -int kvm_arch_init(void *opaque) -{ - return 0; -} - -void kvm_arch_exit(void) -{ - -} - int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 5d2c3a487e73..0a80e80c7b9e 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -881,7 +881,6 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} -static inline void kvm_arch_exit(void) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index d44b85ba8cef..01d0f9935e6c 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -2539,11 +2539,6 @@ void kvmppc_init_lpid(unsigned long nr_lpids_param) } EXPORT_SYMBOL_GPL(kvmppc_init_lpid); -int kvm_arch_init(void *opaque) -{ - return 0; -} - EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ppc_instr); void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry) diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 1cd220e96c26..be951feee27a 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -65,15 +65,6 @@ void kvm_arch_hardware_disable(void) csr_write(CSR_HIDELEG, 0); } -int kvm_arch_init(void *opaque) -{ - return 0; -} - -void kvm_arch_exit(void) -{ -} - static int __init riscv_kvm_init(void) { const char *str; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 66d162723d21..25b08b956888 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -541,16 +541,6 @@ static void __kvm_s390_exit(void) debug_unregister(kvm_s390_dbf_uv); } -int kvm_arch_init(void *opaque) -{ - return 0; -} - -void kvm_arch_exit(void) -{ - -} - /* Section: device related */ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78cb6a46dd50..accf5a7ab8df 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9292,16 +9292,6 @@ static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) kvm_pmu_ops_update(ops->pmu_ops); } -int kvm_arch_init(void *opaque) -{ - return 0; -} - -void kvm_arch_exit(void) -{ - -} - static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) { u64 host_pat; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7c1009c0e66d..62d977bb1448 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1423,9 +1423,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu); -int kvm_arch_init(void *opaque); -void kvm_arch_exit(void); - void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8393347fd35f..597ff734b312 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5921,20 +5921,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, int r; int cpu; - /* - * FIXME: Get rid of kvm_arch_init(), vendor code should call arch code - * directly. Note, kvm_arch_init() _must_ be called before anything - * else as x86 relies on checks buried in kvm_arch_init() to guard - * against multiple calls to kvm_init(). - */ - r = kvm_arch_init(opaque); - if (r) - return r; - - if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { - r = -ENOMEM; - goto err_hw_enabled; - } + if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) + return -ENOMEM; c.ret = &r; c.opaque = opaque; @@ -6022,8 +6010,6 @@ out_free_3: cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); out_free_2: free_cpumask_var(cpus_hardware_enabled); -err_hw_enabled: - kvm_arch_exit(); return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -6051,7 +6037,6 @@ void kvm_exit(void) on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); - kvm_arch_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit v1.2.3 From 58ca1930310a0f45f6250d23e6ea8a09af1a133b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:14 +0000 Subject: KVM: VMX: Make VMCS configuration/capabilities structs read-only after init Tag vmcs_config and vmx_capability structs as __init, the canonical configuration is generated during hardware_setup() and must never be modified after that point. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-31-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/capabilities.h | 4 ++-- arch/x86/kvm/vmx/vmx.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index cd2ac9536c99..45162c1bcd8f 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -66,13 +66,13 @@ struct vmcs_config { u64 misc; struct nested_vmx_msrs nested; }; -extern struct vmcs_config vmcs_config; +extern struct vmcs_config vmcs_config __ro_after_init; struct vmx_capability { u32 ept; u32 vpid; }; -extern struct vmx_capability vmx_capability; +extern struct vmx_capability vmx_capability __ro_after_init; static inline bool cpu_has_vmx_basic_inout(void) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e0e3f2c1f681..32ca9c2f2885 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -488,8 +488,8 @@ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); -struct vmcs_config vmcs_config; -struct vmx_capability vmx_capability; +struct vmcs_config vmcs_config __ro_after_init; +struct vmx_capability vmx_capability __ro_after_init; #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ -- cgit v1.2.3 From 3045c483eeeed56d2e709334cd590c1cfb748610 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:15 +0000 Subject: KVM: x86: Do CPU compatibility checks in x86 code Move the CPU compatibility checks to pure x86 code, i.e. drop x86's use of the common kvm_x86_check_cpu_compat() arch hook. x86 is the only architecture that "needs" to do per-CPU compatibility checks, moving the logic to x86 will allow dropping the common code, and will also give x86 more control over when/how the compatibility checks are performed, e.g. TDX will need to enable hardware (do VMXON) in order to perform compatibility checks. Signed-off-by: Sean Christopherson Reviewed-by: Isaku Yamahata Reviewed-by: Kai Huang Message-Id: <20221130230934.1014142-32-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 49 ++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 40 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a477eeecc8aa..e55f8e54e25a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5109,7 +5109,7 @@ static int __init svm_init(void) * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! */ - r = kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), + r = kvm_init(NULL, sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm), THIS_MODULE); if (r) goto err_kvm_init; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 32ca9c2f2885..e996034809de 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8635,7 +8635,7 @@ static int __init vmx_init(void) * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! */ - r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), + r = kvm_init(NULL, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) goto err_kvm_init; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index accf5a7ab8df..d94e6e8692cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9292,10 +9292,36 @@ static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) kvm_pmu_ops_update(ops->pmu_ops); } +struct kvm_cpu_compat_check { + struct kvm_x86_init_ops *ops; + int *ret; +}; + +static int kvm_x86_check_processor_compatibility(struct kvm_x86_init_ops *ops) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + WARN_ON(!irqs_disabled()); + + if (__cr4_reserved_bits(cpu_has, c) != + __cr4_reserved_bits(cpu_has, &boot_cpu_data)) + return -EIO; + + return ops->check_processor_compatibility(); +} + +static void kvm_x86_check_cpu_compat(void *data) +{ + struct kvm_cpu_compat_check *c = data; + + *c->ret = kvm_x86_check_processor_compatibility(c->ops); +} + static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) { + struct kvm_cpu_compat_check c; u64 host_pat; - int r; + int r, cpu; if (kvm_x86_ops.hardware_enable) { pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name); @@ -9375,6 +9401,14 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (r != 0) goto out_mmu_exit; + c.ret = &r; + c.ops = ops; + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &c, 1); + if (r < 0) + goto out_hardware_unsetup; + } + /* * Point of no return! DO NOT add error paths below this point unless * absolutely necessary, as most operations from this point forward @@ -9417,6 +9451,8 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_init_msr_list(); return 0; +out_hardware_unsetup: + ops->runtime_ops->hardware_unsetup(); out_mmu_exit: kvm_mmu_vendor_module_exit(); out_free_percpu: @@ -12050,16 +12086,7 @@ void kvm_arch_hardware_disable(void) int kvm_arch_check_processor_compat(void *opaque) { - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); - struct kvm_x86_init_ops *ops = opaque; - - WARN_ON(!irqs_disabled()); - - if (__cr4_reserved_bits(cpu_has, c) != - __cr4_reserved_bits(cpu_has, &boot_cpu_data)) - return -EIO; - - return ops->check_processor_compatibility(); + return 0; } bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 81a1cf9f89a6b71e71bfd7d43837ce9235e70b38 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:16 +0000 Subject: KVM: Drop kvm_arch_check_processor_compat() hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop kvm_arch_check_processor_compat() and its support code now that all architecture implementations are nops. Signed-off-by: Sean Christopherson Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Eric Farman # s390 Acked-by: Anup Patel Reviewed-by: Kai Huang Message-Id: <20221130230934.1014142-33-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/arm.c | 7 +------ arch/mips/kvm/mips.c | 7 +------ arch/powerpc/kvm/book3s.c | 2 +- arch/powerpc/kvm/e500.c | 2 +- arch/powerpc/kvm/e500mc.c | 2 +- arch/powerpc/kvm/powerpc.c | 5 ----- arch/riscv/kvm/main.c | 7 +------ arch/s390/kvm/kvm-s390.c | 7 +------ arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 5 ----- include/linux/kvm_host.h | 4 +--- virt/kvm/kvm_main.c | 24 +----------------------- 13 files changed, 13 insertions(+), 67 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 04ed741f7b9d..698787ed87e9 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -63,11 +63,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -2285,7 +2280,7 @@ static __init int kvm_arm_init(void) * FIXME: Do something reasonable if kvm_init() fails after pKVM * hypervisor protection is finalized. */ - err = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (err) goto out_subs; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 3cade648827a..36c8991b5d39 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -135,11 +135,6 @@ void kvm_arch_hardware_disable(void) kvm_mips_callbacks->hardware_disable(); } -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - extern void kvm_init_loongson_ipi(struct kvm *kvm); int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) @@ -1636,7 +1631,7 @@ static int __init kvm_mips_init(void) register_die_notifier(&kvm_mips_csr_die_notifier); - ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + ret = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (ret) { unregister_die_notifier(&kvm_mips_csr_die_notifier); return ret; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 87283a0e33d8..57f4e7896d67 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -1052,7 +1052,7 @@ static int kvmppc_book3s_init(void) { int r; - r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (r) return r; #ifdef CONFIG_KVM_BOOK3S_32_HANDLER diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 0ea61190ec04..b0f695428733 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -531,7 +531,7 @@ static int __init kvmppc_e500_init(void) flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + ivor[max_ivor] + handler_len); - r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); if (r) goto err_out; kvm_ops_e500.owner = THIS_MODULE; diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 795667f7ebf0..611532a0dedc 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -404,7 +404,7 @@ static int __init kvmppc_e500mc_init(void) */ kvmppc_init_lpid(KVMPPC_NR_LPIDS/threads_per_core); - r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); if (r) goto err_out; kvm_ops_e500mc.owner = THIS_MODULE; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 01d0f9935e6c..f5b4ff6bfc89 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -440,11 +440,6 @@ int kvm_arch_hardware_enable(void) return 0; } -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { struct kvmppc_ops *kvm_ops = NULL; diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index be951feee27a..e2da56ed9069 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -20,11 +20,6 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - int kvm_arch_hardware_enable(void) { unsigned long hideleg, hedeleg; @@ -110,7 +105,7 @@ static int __init riscv_kvm_init(void) kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits()); - return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + return kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); } module_init(riscv_kvm_init); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 25b08b956888..7ad8252e92c2 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -262,11 +262,6 @@ int kvm_arch_hardware_enable(void) return 0; } -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - /* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); @@ -5716,7 +5711,7 @@ static int __init kvm_s390_init(void) if (r) return r; - r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (r) { __kvm_s390_exit(); return r; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e55f8e54e25a..6db44797e85e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5109,8 +5109,8 @@ static int __init svm_init(void) * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! */ - r = kvm_init(NULL, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm), + THIS_MODULE); if (r) goto err_kvm_init; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e996034809de..3df6b771d816 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8635,8 +8635,8 @@ static int __init vmx_init(void) * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! */ - r = kvm_init(NULL, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), + THIS_MODULE); if (r) goto err_kvm_init; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d94e6e8692cc..17cbd17c9077 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12084,11 +12084,6 @@ void kvm_arch_hardware_disable(void) drop_user_return_notifiers(); } -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 62d977bb1448..20e207e4c6c1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -956,8 +956,7 @@ static inline void kvm_irqfd_exit(void) { } #endif -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module); +int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module); void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); @@ -1444,7 +1443,6 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} int kvm_arch_hardware_enable(void); void kvm_arch_hardware_disable(void); -int kvm_arch_check_processor_compat(void *opaque); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 597ff734b312..e13b369cfc1b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5902,36 +5902,14 @@ void kvm_unregister_perf_callbacks(void) } #endif -struct kvm_cpu_compat_check { - void *opaque; - int *ret; -}; - -static void check_processor_compat(void *data) +int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) { - struct kvm_cpu_compat_check *c = data; - - *c->ret = kvm_arch_check_processor_compat(c->opaque); -} - -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module) -{ - struct kvm_cpu_compat_check c; int r; int cpu; if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) return -ENOMEM; - c.ret = &r; - c.opaque = opaque; - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, check_processor_compat, &c, 1); - if (r < 0) - goto out_free_2; - } - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", kvm_starting_cpu, kvm_dying_cpu); if (r) -- cgit v1.2.3 From 08a9d59c6ad95ea05369944f34eb59ff639cbbab Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:17 +0000 Subject: KVM: x86: Use KBUILD_MODNAME to specify vendor module name Use KBUILD_MODNAME to specify the vendor module name instead of manually writing out the name to make it a bit more obvious that the name isn't completely arbitrary. A future patch will also use KBUILD_MODNAME to define pr_fmt, at which point using KBUILD_MODNAME for kvm_x86_ops.name further reinforces the intended usage of kvm_x86_ops.name. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-34-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6db44797e85e..2bb62c3db3b3 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4701,7 +4701,7 @@ static int svm_vm_init(struct kvm *kvm) } static struct kvm_x86_ops svm_x86_ops __initdata = { - .name = "kvm_amd", + .name = KBUILD_MODNAME, .hardware_unsetup = svm_hardware_unsetup, .hardware_enable = svm_hardware_enable, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3df6b771d816..daab447d311c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8145,7 +8145,7 @@ static void vmx_vm_destroy(struct kvm *kvm) } static struct kvm_x86_ops vmx_x86_ops __initdata = { - .name = "kvm_intel", + .name = KBUILD_MODNAME, .hardware_unsetup = vmx_hardware_unsetup, -- cgit v1.2.3 From 8d20bd6381670382669d9bb39b5fd566a84cdbef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:18 +0000 Subject: KVM: x86: Unify pr_fmt to use module name for all KVM modules Define pr_fmt using KBUILD_MODNAME for all KVM x86 code so that printks use consistent formatting across common x86, Intel, and AMD code. In addition to providing consistent print formatting, using KBUILD_MODNAME, e.g. kvm_amd and kvm_intel, allows referencing SVM and VMX (and SEV and SGX and ...) as technologies without generating weird messages, and without causing naming conflicts with other kernel code, e.g. "SEV: ", "tdx: ", "sgx: " etc.. are all used by the kernel for non-KVM subsystems. Opportunistically move away from printk() for prints that need to be modified anyways, e.g. to drop a manual "kvm: " prefix. Opportunistically convert a few SGX WARNs that are similarly modified to WARN_ONCE; in the very unlikely event that the WARNs fire, odds are good that they would fire repeatedly and spam the kernel log without providing unique information in each print. Note, defining pr_fmt yields undesirable results for code that uses KVM's printk wrappers, e.g. vcpu_unimpl(). But, that's a pre-existing problem as SVM/kvm_amd already defines a pr_fmt, and thankfully use of KVM's wrappers is relatively limited in KVM x86 code. Signed-off-by: Sean Christopherson Reviewed-by: Paul Durrant Message-Id: <20221130230934.1014142-35-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/include/asm/kvm_host.h | 14 +++++++------- arch/arm64/include/asm/kvm_mmu.h | 4 ++-- arch/arm64/kvm/arch_timer.c | 2 +- arch/arm64/kvm/mmu.c | 12 ++++++------ arch/arm64/kvm/reset.c | 8 ++++---- arch/arm64/kvm/sys_regs.c | 6 +++--- arch/arm64/kvm/vmid.c | 6 +++--- arch/x86/kvm/cpuid.c | 1 + arch/x86/kvm/debugfs.c | 2 ++ arch/x86/kvm/emulate.c | 1 + arch/x86/kvm/hyperv.c | 1 + arch/x86/kvm/i8254.c | 4 ++-- arch/x86/kvm/i8259.c | 4 +++- arch/x86/kvm/ioapic.c | 1 + arch/x86/kvm/irq.c | 1 + arch/x86/kvm/irq_comm.c | 7 ++++--- arch/x86/kvm/kvm_onhyperv.c | 1 + arch/x86/kvm/lapic.c | 8 ++++---- arch/x86/kvm/mmu/mmu.c | 6 +++--- arch/x86/kvm/mmu/page_track.c | 1 + arch/x86/kvm/mmu/spte.c | 4 ++-- arch/x86/kvm/mmu/spte.h | 4 ++-- arch/x86/kvm/mmu/tdp_iter.c | 1 + arch/x86/kvm/mmu/tdp_mmu.c | 1 + arch/x86/kvm/mtrr.c | 1 + arch/x86/kvm/pmu.c | 1 + arch/x86/kvm/smm.c | 1 + arch/x86/kvm/svm/avic.c | 2 +- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/svm/pmu.c | 2 ++ arch/x86/kvm/svm/sev.c | 1 + arch/x86/kvm/svm/svm.c | 10 +++++----- arch/x86/kvm/svm/svm_onhyperv.c | 1 + arch/x86/kvm/svm/svm_onhyperv.h | 4 ++-- arch/x86/kvm/vmx/hyperv.c | 3 +-- arch/x86/kvm/vmx/hyperv.h | 4 +--- arch/x86/kvm/vmx/nested.c | 3 ++- arch/x86/kvm/vmx/pmu_intel.c | 5 +++-- arch/x86/kvm/vmx/posted_intr.c | 2 ++ arch/x86/kvm/vmx/sgx.c | 5 +++-- arch/x86/kvm/vmx/vmcs12.c | 1 + arch/x86/kvm/vmx/vmx.c | 40 +++++++++++++++++++-------------------- arch/x86/kvm/vmx/vmx_ops.h | 4 ++-- arch/x86/kvm/x86.c | 28 ++++++++++++++------------- arch/x86/kvm/xen.c | 1 + include/kvm/arm_arch_timer.h | 2 +- 46 files changed, 124 insertions(+), 99 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f50951c51d3b..113e20fdbb56 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -66,8 +66,8 @@ enum kvm_mode kvm_get_mode(void); DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); -extern unsigned int kvm_sve_max_vl; -int kvm_arm_init_sve(void); +extern unsigned int __ro_after_init kvm_sve_max_vl; +int __init kvm_arm_init_sve(void); u32 __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); @@ -877,7 +877,7 @@ int kvm_handle_cp10_id(struct kvm_vcpu *vcpu); void kvm_reset_sys_regs(struct kvm_vcpu *vcpu); -int kvm_sys_reg_table_init(void); +int __init kvm_sys_reg_table_init(void); /* MMIO helpers */ void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); @@ -908,9 +908,9 @@ int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); -extern unsigned int kvm_arm_vmid_bits; -int kvm_arm_vmid_alloc_init(void); -void kvm_arm_vmid_alloc_free(void); +extern unsigned int __ro_after_init kvm_arm_vmid_bits; +int __init kvm_arm_vmid_alloc_init(void); +void __init kvm_arm_vmid_alloc_free(void); void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid); void kvm_arm_vmid_clear_active(void); @@ -993,7 +993,7 @@ static inline void kvm_clr_pmu_events(u32 clr) {} void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu); void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu); -int kvm_set_ipa_limit(void); +int __init kvm_set_ipa_limit(void); #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index e4a7e6369499..7f7c1231679e 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -163,7 +163,7 @@ int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **haddr); int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, void **haddr); -void free_hyp_pgds(void); +void __init free_hyp_pgds(void); void stage2_unmap_vm(struct kvm *kvm); int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type); @@ -175,7 +175,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); -int kvm_mmu_init(u32 *hyp_va_bits); +int __init kvm_mmu_init(u32 *hyp_va_bits); static inline void *__kvm_vector_slot2addr(void *base, enum arm64_hyp_spectre_vector slot) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 33fca1a691a5..23346585a294 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -1113,7 +1113,7 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info) return 0; } -int kvm_timer_hyp_init(bool has_gic) +int __init kvm_timer_hyp_init(bool has_gic) { struct arch_timer_kvm_info *info; int err; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 31d7fa4c7c14..d6c41043b5af 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -25,11 +25,11 @@ static struct kvm_pgtable *hyp_pgtable; static DEFINE_MUTEX(kvm_hyp_pgd_mutex); -static unsigned long hyp_idmap_start; -static unsigned long hyp_idmap_end; -static phys_addr_t hyp_idmap_vector; +static unsigned long __ro_after_init hyp_idmap_start; +static unsigned long __ro_after_init hyp_idmap_end; +static phys_addr_t __ro_after_init hyp_idmap_vector; -static unsigned long io_map_base; +static unsigned long __ro_after_init io_map_base; static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) { @@ -280,7 +280,7 @@ static void stage2_flush_vm(struct kvm *kvm) /** * free_hyp_pgds - free Hyp-mode page tables */ -void free_hyp_pgds(void) +void __init free_hyp_pgds(void) { mutex_lock(&kvm_hyp_pgd_mutex); if (hyp_pgtable) { @@ -1665,7 +1665,7 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { .virt_to_phys = kvm_host_pa, }; -int kvm_mmu_init(u32 *hyp_va_bits) +int __init kvm_mmu_init(u32 *hyp_va_bits) { int err; u32 idmap_bits; diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e0267f672b8a..2bc74739a6df 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -30,7 +30,7 @@ #include /* Maximum phys_shift supported for any VM on this host */ -static u32 kvm_ipa_limit; +static u32 __ro_after_init kvm_ipa_limit; /* * ARMv8 Reset Values @@ -41,9 +41,9 @@ static u32 kvm_ipa_limit; #define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ PSR_AA32_I_BIT | PSR_AA32_F_BIT) -unsigned int kvm_sve_max_vl; +unsigned int __ro_after_init kvm_sve_max_vl; -int kvm_arm_init_sve(void) +int __init kvm_arm_init_sve(void) { if (system_supports_sve()) { kvm_sve_max_vl = sve_max_virtualisable_vl(); @@ -352,7 +352,7 @@ u32 get_kvm_ipa_limit(void) return kvm_ipa_limit; } -int kvm_set_ipa_limit(void) +int __init kvm_set_ipa_limit(void) { unsigned int parange; u64 mmfr0; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index d5ee52d6bf73..1225e9faac06 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -82,7 +82,7 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) } /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ -static u32 cache_levels; +static u32 __ro_after_init cache_levels; /* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */ #define CSSELR_MAX 14 @@ -2733,7 +2733,7 @@ static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) } /* ->val is filled in by kvm_sys_reg_table_init() */ -static struct sys_reg_desc invariant_sys_regs[] = { +static struct sys_reg_desc invariant_sys_regs[] __ro_after_init = { { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 }, { SYS_DESC(SYS_REVIDR_EL1), NULL, get_revidr_el1 }, { SYS_DESC(SYS_CLIDR_EL1), NULL, get_clidr_el1 }, @@ -3057,7 +3057,7 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return write_demux_regids(uindices); } -int kvm_sys_reg_table_init(void) +int __init kvm_sys_reg_table_init(void) { bool valid = true; unsigned int i; diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c index d78ae63d7c15..08978d0672e7 100644 --- a/arch/arm64/kvm/vmid.c +++ b/arch/arm64/kvm/vmid.c @@ -16,7 +16,7 @@ #include #include -unsigned int kvm_arm_vmid_bits; +unsigned int __ro_after_init kvm_arm_vmid_bits; static DEFINE_RAW_SPINLOCK(cpu_vmid_lock); static atomic64_t vmid_generation; @@ -172,7 +172,7 @@ void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) /* * Initialize the VMID allocator */ -int kvm_arm_vmid_alloc_init(void) +int __init kvm_arm_vmid_alloc_init(void) { kvm_arm_vmid_bits = kvm_get_vmid_bits(); @@ -190,7 +190,7 @@ int kvm_arm_vmid_alloc_init(void) return 0; } -void kvm_arm_vmid_alloc_free(void) +void __init kvm_arm_vmid_alloc_free(void) { kfree(vmid_map); } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 69768e4d53a6..4ca16b18a086 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -8,6 +8,7 @@ * Copyright 2011 Red Hat, Inc. and/or its affiliates. * Copyright IBM Corporation, 2008 */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index c1390357126a..ee8c4c3496ed 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -4,6 +4,8 @@ * * Copyright 2016 Red Hat, Inc. and/or its affiliates. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include "lapic.h" diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5cc3efa0e21c..c3443045cd93 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -17,6 +17,7 @@ * * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include "kvm_cache_regs.h" diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 80d082ecd5c5..71aff0edc0ed 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -17,6 +17,7 @@ * Ben-Ami Yassour * Andrey Smetanin */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "x86.h" #include "lapic.h" diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index e0a7a0e7a73c..cd57a517d04a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -30,7 +30,7 @@ * Based on QEMU and Xen. */ -#define pr_fmt(fmt) "pit: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -351,7 +351,7 @@ static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) if (ps->period < min_period) { pr_info_ratelimited( - "kvm: requested %lld ns " + "requested %lld ns " "i8254 timer period limited to %lld ns\n", ps->period, min_period); ps->period = min_period; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index e1bb6218bb96..4756bcb5724f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -26,6 +26,8 @@ * Yaozu (Eddie) Dong * Port from Qemu. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -35,7 +37,7 @@ #include "trace.h" #define pr_pic_unimpl(fmt, ...) \ - pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__) + pr_err_ratelimited("pic: " fmt, ## __VA_ARGS__) static void pic_irq_request(struct kvm *kvm, int level); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 765943d7cfa5..042dee556125 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -26,6 +26,7 @@ * Yaozu (Eddie) Dong * Based on Xen 3.1 code. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index a70952eca905..b2c397dd2bc6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -7,6 +7,7 @@ * Authors: * Yaozu (Eddie) Dong */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 3742d9adacfc..16d076a1b91a 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -8,6 +8,7 @@ * * Copyright 2010 Red Hat, Inc. and/or its affiliates. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -56,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (irq->dest_mode == APIC_DEST_PHYSICAL && irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { - printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); + pr_info("apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } @@ -199,7 +200,7 @@ int kvm_request_irq_source_id(struct kvm *kvm) irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); if (irq_source_id >= BITS_PER_LONG) { - printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n"); + pr_warn("exhausted allocatable IRQ sources!\n"); irq_source_id = -EFAULT; goto unlock; } @@ -221,7 +222,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) mutex_lock(&kvm->irq_lock); if (irq_source_id < 0 || irq_source_id >= BITS_PER_LONG) { - printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); + pr_err("IRQ source ID out of range!\n"); goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c index ee4f696a0782..482d6639ef88 100644 --- a/arch/x86/kvm/kvm_onhyperv.c +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -2,6 +2,7 @@ /* * KVM L1 hypervisor optimizations on Hyper-V. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4efdb4a4d72c..cfaf1d8c64ca 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -15,6 +15,7 @@ * * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -941,8 +942,7 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm) { if (!kvm->arch.disabled_lapic_found) { kvm->arch.disabled_lapic_found = true; - printk(KERN_INFO - "Disabled LAPIC found during irq injection\n"); + pr_info("Disabled LAPIC found during irq injection\n"); } } @@ -1560,7 +1560,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) if (apic->lapic_timer.period < min_period) { pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " + "vcpu %i: requested %lld ns " "lapic timer period limited to %lld ns\n", apic->vcpu->vcpu_id, apic->lapic_timer.period, min_period); @@ -1845,7 +1845,7 @@ static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg) deadline = apic->lapic_timer.period; else if (unlikely(deadline > apic->lapic_timer.period)) { pr_info_ratelimited( - "kvm: vcpu %i: requested lapic timer restore with " + "vcpu %i: requested lapic timer restore with " "starting count register %#x=%u (%lld ns) > initial count (%lld ns). " "Using initial count to start timer.\n", apic->vcpu->vcpu_id, diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9c7198544195..aeb240b339f5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -14,6 +14,7 @@ * Yaniv Kamay * Avi Kivity */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "irq.h" #include "ioapic.h" @@ -3456,8 +3457,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } if (++retry_count > 4) { - printk_once(KERN_WARNING - "kvm: Fast #PF retrying more than 4 times.\n"); + pr_warn_once("Fast #PF retrying more than 4 times.\n"); break; } @@ -6647,7 +6647,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) * zap all shadow pages. */ if (unlikely(gen == 0)) { - kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); + kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_zap_all_fast(kvm); } } diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 2e09d1b6249f..0a2ac438d647 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -10,6 +10,7 @@ * Author: * Xiao Guangrong */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index c0fd7e049b4e..fce6f047399f 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -7,7 +7,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright 2020 Red Hat, Inc. and/or its affiliates. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include "mmu.h" @@ -352,7 +352,7 @@ u64 mark_spte_for_access_track(u64 spte) WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT), - "kvm: Access Tracking saved bit locations are not zero\n"); + "Access Tracking saved bit locations are not zero\n"); spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 6f54dc9409c9..0d8deefee66c 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -435,11 +435,11 @@ static inline void check_spte_writable_invariants(u64 spte) { if (spte & shadow_mmu_writable_mask) WARN_ONCE(!(spte & shadow_host_writable_mask), - "kvm: MMU-writable SPTE is not Host-writable: %llx", + KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx", spte); else WARN_ONCE(is_writable_pte(spte), - "kvm: Writable SPTE is not MMU-writable: %llx", spte); + KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte); } static inline bool is_mmu_writable_spte(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index 39b48e7d7d1a..e26e744df1d1 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "mmu_internal.h" #include "tdp_iter.h" diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index cc1fb9a65620..bba33aea0fb0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "mmu.h" #include "mmu_internal.h" diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index a8502e02f479..9fac1ec03463 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -13,6 +13,7 @@ * Paolo Bonzini * Xiao Guangrong */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index eb594620dd75..d939d3b84e6f 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -9,6 +9,7 @@ * Gleb Natapov * Wei Huang */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index a9c1c2af8d94..cc43638d48a3 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include "x86.h" diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 6919dee69f18..f52f5e0dd465 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -12,7 +12,7 @@ * Avi Kivity */ -#define pr_fmt(fmt) "SVM: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index bc9cd7086fa9..3bfbcb607d80 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -12,7 +12,7 @@ * Avi Kivity */ -#define pr_fmt(fmt) "SVM: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 0e313fbae055..1ff068f23841 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -9,6 +9,8 @@ * * Implementation is based on pmu_intel.c file */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 86d6897f4806..273cba809328 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -6,6 +6,7 @@ * * Copyright 2010 Red Hat, Inc. and/or its affiliates. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2bb62c3db3b3..f7192a14fc89 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1,4 +1,4 @@ -#define pr_fmt(fmt) "SVM: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include @@ -2076,7 +2076,7 @@ static void svm_handle_mce(struct kvm_vcpu *vcpu) * Erratum 383 triggered. Guest state is corrupt so kill the * guest. */ - pr_err("KVM: Guest triggered AMD Erratum 383\n"); + pr_err("Guest triggered AMD Erratum 383\n"); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); @@ -4629,7 +4629,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, smap = cr4 & X86_CR4_SMAP; is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { - pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n"); + pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n"); /* * If the fault occurred in userspace, arbitrarily inject #GP @@ -4978,7 +4978,7 @@ static __init int svm_hardware_setup(void) } if (nested) { - printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); + pr_info("Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); } @@ -4996,7 +4996,7 @@ static __init int svm_hardware_setup(void) /* Force VM NPT level equal to the host's paging level */ kvm_configure_mmu(npt_enabled, get_npt_level(), get_npt_level(), PG_LEVEL_1G); - pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); /* Setup shadow_me_value and shadow_me_mask */ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 26a89d0da93e..7af8422d3382 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -2,6 +2,7 @@ /* * KVM L1 hypervisor optimizations on Hyper-V for SVM. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 45faf84476ce..6981c1e9a809 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -34,7 +34,7 @@ static inline void svm_hv_hardware_setup(void) { if (npt_enabled && ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { - pr_info("kvm: Hyper-V enlightened NPT TLB flush enabled\n"); + pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n"); svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; svm_x86_ops.tlb_remote_flush_with_range = hv_remote_flush_tlb_with_range; @@ -43,7 +43,7 @@ static inline void svm_hv_hardware_setup(void) if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) { int cpu; - pr_info("kvm: Hyper-V Direct TLB Flush enabled\n"); + pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n"); for_each_online_cpu(cpu) { struct hv_vp_assist_page *vp_ap = hv_get_vp_assist_page(cpu); diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index f773450fba6e..2a26a0f27d48 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 - -#define pr_fmt(fmt) "kvm/hyper-v: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 883102d567c3..e03692958408 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -179,9 +179,7 @@ static __always_inline int get_evmcs_offset(unsigned long field, { int offset = evmcs_field_offset(field, clean_field); - WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n", - field); - + WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field); return offset; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d93c715cda6a..557b9c468734 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -203,7 +204,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. */ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); + pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index e5cec07ca8d9..efce9ad70e4e 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -8,6 +8,8 @@ * Avi Kivity * Gleb Natapov */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -762,8 +764,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) return; warn: - pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n", - vcpu->vcpu_id); + pr_warn_ratelimited("vcpu-%d: fail to passthrough LBR.\n", vcpu->vcpu_id); } static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 1b56c5e5c9fb..94c38bea60e7 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index b12da2a6dec9..aa53c98034bf 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2021 Intel Corporation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include @@ -164,7 +165,7 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, if (!vcpu->kvm->arch.sgx_provisioning_allowed && (attributes & SGX_ATTR_PROVISIONKEY)) { if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY) - pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n"); + pr_warn_once("SGX PROVISIONKEY advertised but not allowed\n"); kvm_inject_gp(vcpu, 0); return 1; } @@ -381,7 +382,7 @@ int handle_encls(struct kvm_vcpu *vcpu) return handle_encls_ecreate(vcpu); if (leaf == EINIT) return handle_encls_einit(vcpu); - WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf); + WARN_ONCE(1, "unexpected exit on ENCLS[%u]", leaf); vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; return 0; diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 2251b60920f8..106a72c923ca 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "vmcs12.h" diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index daab447d311c..58eb4dbf64a3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -12,6 +12,7 @@ * Avi Kivity * Yaniv Kamay */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -444,36 +445,36 @@ void vmread_error(unsigned long field, bool fault) if (fault) kvm_spurious_fault(); else - vmx_insn_failed("kvm: vmread failed: field=%lx\n", field); + vmx_insn_failed("vmread failed: field=%lx\n", field); } noinline void vmwrite_error(unsigned long field, unsigned long value) { - vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n", + vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n", + vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n", + vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) { - vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", + vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", ext, vpid, gva); } noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) { - vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", + vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", ext, eptp, gpa); } @@ -577,7 +578,7 @@ static __init void hv_init_evmcs(void) } if (enlightened_vmcs) { - pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); + pr_info("Using Hyper-V Enlightened VMCS\n"); static_branch_enable(&enable_evmcs); } @@ -1680,8 +1681,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) if (!instr_len) goto rip_updated; - WARN(exit_reason.enclave_mode, - "KVM: skipping instruction after SGX enclave VM-Exit"); + WARN_ONCE(exit_reason.enclave_mode, + "skipping instruction after SGX enclave VM-Exit"); orig_rip = kvm_rip_read(vcpu); rip = orig_rip + instr_len; @@ -3024,9 +3025,8 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save) var.type = 0x3; var.avl = 0; if (save->base & 0xf) - printk_once(KERN_WARNING "kvm: segment base is not " - "paragraph aligned when entering " - "protected mode (seg=%d)", seg); + pr_warn_once("segment base is not paragraph aligned " + "when entering protected mode (seg=%d)", seg); } vmcs_write16(sf->selector, var.selector); @@ -3056,8 +3056,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) * vcpu. Warn the user that an update is overdue. */ if (!kvm_vmx->tss_addr) - printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " - "called before entering vcpu\n"); + pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n"); vmx_segment_cache_clear(vmx); @@ -6925,7 +6924,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) gate_desc *desc = (gate_desc *)host_idt_base + vector; if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, - "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) + "unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); @@ -7530,7 +7529,7 @@ static int __init vmx_check_processor_compat(void) if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || !this_cpu_has(X86_FEATURE_VMX)) { - pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id()); + pr_err("VMX is disabled on CPU %d\n", smp_processor_id()); return -EIO; } @@ -7539,8 +7538,7 @@ static int __init vmx_check_processor_compat(void) if (nested) nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { - printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", - smp_processor_id()); + pr_err("CPU %d feature inconsistency!\n", smp_processor_id()); return -EIO; } return 0; @@ -8365,7 +8363,7 @@ static __init int hardware_setup(void) return -EIO; if (cpu_has_perf_global_ctrl_bug()) - pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " "does not work properly. Using workaround\n"); if (boot_cpu_has(X86_FEATURE_NX)) @@ -8373,7 +8371,7 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_MPX)) { rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); - WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); + WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); } if (!cpu_has_vmx_mpx()) @@ -8392,7 +8390,7 @@ static __init int hardware_setup(void) /* NX support is required for shadow paging. */ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { - pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n"); + pr_err_ratelimited("NX (Execute Disable) not supported\n"); return -EOPNOTSUPP; } diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 842dc898c972..a5282014616c 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -100,8 +100,8 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) return value; do_fail: - WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field); - pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field); + WARN_ONCE(1, KBUILD_MODNAME ": vmread failed: field=%lx\n", field); + pr_warn_ratelimited(KBUILD_MODNAME ": vmread failed: field=%lx\n", field); return 0; do_exception: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 17cbd17c9077..4c40bbec372a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -15,6 +15,7 @@ * Amit Shah * Ben-Ami Yassour */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include "irq.h" @@ -2087,7 +2088,7 @@ static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) return kvm_handle_invalid_op(vcpu); - pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn); + pr_warn_once("%s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } int kvm_emulate_mwait(struct kvm_vcpu *vcpu) @@ -2434,7 +2435,8 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { - pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi); + pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n", + user_tsc_khz, thresh_lo, thresh_hi); use_scaling = 1; } return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); @@ -7702,7 +7704,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; emul_write: - printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); + pr_warn_once("emulating exchange as write\n"); return emulator_write_emulated(ctxt, addr, new, bytes, exception); } @@ -8263,7 +8265,7 @@ static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT); if (!ctxt) { - pr_err("kvm: failed to allocate vcpu's emulator\n"); + pr_err("failed to allocate vcpu's emulator\n"); return NULL; } @@ -9324,17 +9326,17 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) int r, cpu; if (kvm_x86_ops.hardware_enable) { - pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name); + pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name); return -EEXIST; } if (!ops->cpu_has_kvm_support()) { - pr_err_ratelimited("kvm: no hardware support for '%s'\n", + pr_err_ratelimited("no hardware support for '%s'\n", ops->runtime_ops->name); return -EOPNOTSUPP; } if (ops->disabled_by_bios()) { - pr_err_ratelimited("kvm: support for '%s' disabled by bios\n", + pr_err_ratelimited("support for '%s' disabled by bios\n", ops->runtime_ops->name); return -EOPNOTSUPP; } @@ -9345,7 +9347,7 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) * vCPU's FPU state as a fxregs_state struct. */ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { - printk(KERN_ERR "kvm: inadequate fpu\n"); + pr_err("inadequate fpu\n"); return -EOPNOTSUPP; } @@ -9363,19 +9365,19 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) */ if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || (host_pat & GENMASK(2, 0)) != 6) { - pr_err("kvm: host PAT[0] is not WB\n"); + pr_err("host PAT[0] is not WB\n"); return -EIO; } x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { - pr_err("kvm: failed to allocate cache for x86 emulator\n"); + pr_err("failed to allocate cache for x86 emulator\n"); return -ENOMEM; } user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); if (!user_return_msrs) { - printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); + pr_err("failed to allocate percpu kvm_user_return_msrs\n"); r = -ENOMEM; goto out_free_x86_emulator_cache; } @@ -11647,7 +11649,7 @@ static int sync_regs(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { if (kvm_check_tsc_unstable() && kvm->created_vcpus) - pr_warn_once("kvm: SMP vm created on host with unstable TSC; " + pr_warn_once("SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); if (!kvm->arch.max_vcpu_ids) @@ -11724,7 +11726,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto free_wbinvd_dirty_mask; if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) { - pr_err("kvm: failed to allocate vcpu's fpu\n"); + pr_err("failed to allocate vcpu's fpu\n"); goto free_emulate_ctxt; } diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 2e29bdc2949c..0bd5d171df57 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -5,6 +5,7 @@ * * KVM Xen emulation */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "x86.h" #include "xen.h" diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 1638418f72dd..71916de7c6c4 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -60,7 +60,7 @@ struct arch_timer_cpu { bool enabled; }; -int kvm_timer_hyp_init(bool); +int __init kvm_timer_hyp_init(bool has_gic); int kvm_timer_enable(struct kvm_vcpu *vcpu); int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 462689b37f082d06c2cf32a6e4e142182825dd70 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:19 +0000 Subject: KVM: VMX: Use current CPU's info to perform "disabled by BIOS?" checks Use this_cpu_has() instead of boot_cpu_has() to perform the effective "disabled by BIOS?" checks for VMX. This will allow consolidating code between vmx_disabled_by_bios() and vmx_check_processor_compat(). Checking the boot CPU isn't a strict requirement as any divergence in VMX enabling between the boot CPU and other CPUs will result in KVM refusing to load thanks to the aforementioned vmx_check_processor_compat(). Furthermore, using the boot CPU was an unintentional change introduced by commit a4d0b2fdbcf7 ("KVM: VMX: Use VMX feature flag to query BIOS enabling"). Prior to using the feature flags, KVM checked the raw MSR value from the current CPU. Reported-by: Kai Huang Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Message-Id: <20221130230934.1014142-36-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 58eb4dbf64a3..a3c681336a39 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2523,8 +2523,8 @@ static __init int cpu_has_kvm_support(void) static __init int vmx_disabled_by_bios(void) { - return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || - !boot_cpu_has(X86_FEATURE_VMX); + return !this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || + !this_cpu_has(X86_FEATURE_VMX); } static int kvm_cpu_vmxon(u64 vmxon_pointer) -- cgit v1.2.3 From d41931324975741e570b8deb7ea58feeabf39343 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:20 +0000 Subject: KVM: x86: Do VMX/SVM support checks directly in vendor code Do basic VMX/SVM support checks directly in vendor code instead of implementing them via kvm_x86_ops hooks. Beyond the superficial benefit of providing common messages, which isn't even clearly a net positive since vendor code can provide more precise/detailed messages, there's zero advantage to bouncing through common x86 code. Consolidating the checks will also simplify performing the checks across all CPUs (in a future patch). Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-37-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/kvm/svm/svm.c | 38 +++++++++++++++++--------------------- arch/x86/kvm/vmx/vmx.c | 37 ++++++++++++++++++++----------------- arch/x86/kvm/x86.c | 11 ----------- 4 files changed, 37 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4899505955b5..f20a30054688 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1722,8 +1722,6 @@ struct kvm_x86_nested_ops { }; struct kvm_x86_init_ops { - int (*cpu_has_kvm_support)(void); - int (*disabled_by_bios)(void); int (*check_processor_compatibility)(void); int (*hardware_setup)(void); unsigned int (*handle_intel_pt_intr)(void); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f7192a14fc89..177ee63c8197 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -519,21 +519,28 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu) vcpu->arch.osvw.status |= 1; } -static int has_svm(void) +static bool kvm_is_svm_supported(void) { const char *msg; + u64 vm_cr; if (!cpu_has_svm(&msg)) { - printk(KERN_INFO "has_svm: %s\n", msg); - return 0; + pr_err("SVM not supported, %s\n", msg); + return false; } if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { pr_info("KVM is unsupported when running as an SEV guest\n"); - return 0; + return false; } - return 1; + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) { + pr_err("SVM disabled (by BIOS) in MSR_VM_CR\n"); + return false; + } + + return true; } void __svm_write_tsc_multiplier(u64 multiplier) @@ -572,10 +579,9 @@ static int svm_hardware_enable(void) if (efer & EFER_SVME) return -EBUSY; - if (!has_svm()) { - pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); + if (!kvm_is_svm_supported()) return -EINVAL; - } + sd = per_cpu_ptr(&svm_data, me); sd->asid_generation = 1; sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; @@ -4076,17 +4082,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, vmcb_mark_dirty(svm->vmcb, VMCB_CR); } -static int is_disabled(void) -{ - u64 vm_cr; - - rdmsrl(MSR_VM_CR, vm_cr); - if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) - return 1; - - return 0; -} - static void svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) { @@ -5086,8 +5081,6 @@ err: static struct kvm_x86_init_ops svm_init_ops __initdata = { - .cpu_has_kvm_support = has_svm, - .disabled_by_bios = is_disabled, .hardware_setup = svm_hardware_setup, .check_processor_compatibility = svm_check_processor_compat, @@ -5101,6 +5094,9 @@ static int __init svm_init(void) __unused_size_checks(); + if (!kvm_is_svm_supported()) + return -EOPNOTSUPP; + r = kvm_x86_vendor_init(&svm_init_ops); if (r) return r; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a3c681336a39..08b1b7092429 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2516,17 +2516,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static __init int cpu_has_kvm_support(void) -{ - return cpu_has_vmx(); -} - -static __init int vmx_disabled_by_bios(void) -{ - return !this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || - !this_cpu_has(X86_FEATURE_VMX); -} - static int kvm_cpu_vmxon(u64 vmxon_pointer) { u64 msr; @@ -7522,16 +7511,29 @@ static int vmx_vm_init(struct kvm *kvm) return 0; } +static bool __init kvm_is_vmx_supported(void) +{ + if (!cpu_has_vmx()) { + pr_err("CPU doesn't support VMX\n"); + return false; + } + + if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || + !this_cpu_has(X86_FEATURE_VMX)) { + pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL\n"); + return false; + } + + return true; +} + static int __init vmx_check_processor_compat(void) { struct vmcs_config vmcs_conf; struct vmx_capability vmx_cap; - if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || - !this_cpu_has(X86_FEATURE_VMX)) { - pr_err("VMX is disabled on CPU %d\n", smp_processor_id()); + if (!kvm_is_vmx_supported()) return -EIO; - } if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) return -EIO; @@ -8542,8 +8544,6 @@ static __init int hardware_setup(void) } static struct kvm_x86_init_ops vmx_init_ops __initdata = { - .cpu_has_kvm_support = cpu_has_kvm_support, - .disabled_by_bios = vmx_disabled_by_bios, .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, .handle_intel_pt_intr = NULL, @@ -8586,6 +8586,9 @@ static int __init vmx_init(void) { int r, cpu; + if (!kvm_is_vmx_supported()) + return -EOPNOTSUPP; + /* * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing * to unwind if a later step fails. diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4c40bbec372a..cad48a80697f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9330,17 +9330,6 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) return -EEXIST; } - if (!ops->cpu_has_kvm_support()) { - pr_err_ratelimited("no hardware support for '%s'\n", - ops->runtime_ops->name); - return -EOPNOTSUPP; - } - if (ops->disabled_by_bios()) { - pr_err_ratelimited("support for '%s' disabled by bios\n", - ops->runtime_ops->name); - return -EOPNOTSUPP; - } - /* * KVM explicitly assumes that the guest has an FPU and * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the -- cgit v1.2.3 From 8504ef2139e2164aced4335c23ef3f919db55281 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:21 +0000 Subject: KVM: VMX: Shuffle support checks and hardware enabling code around Reorder code in vmx.c so that the VMX support check helpers reside above the hardware enabling helpers, which will allow KVM to perform support checks during hardware enabling (in a future patch). No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-38-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 216 ++++++++++++++++++++++++------------------------- 1 file changed, 108 insertions(+), 108 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 08b1b7092429..a39c6b6e5d94 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2516,79 +2516,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static int kvm_cpu_vmxon(u64 vmxon_pointer) -{ - u64 msr; - - cr4_set_bits(X86_CR4_VMXE); - - asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" - _ASM_EXTABLE(1b, %l[fault]) - : : [vmxon_pointer] "m"(vmxon_pointer) - : : fault); - return 0; - -fault: - WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", - rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); - cr4_clear_bits(X86_CR4_VMXE); - - return -EFAULT; -} - -static int vmx_hardware_enable(void) -{ - int cpu = raw_smp_processor_id(); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - int r; - - if (cr4_read_shadow() & X86_CR4_VMXE) - return -EBUSY; - - /* - * This can happen if we hot-added a CPU but failed to allocate - * VP assist page for it. - */ - if (static_branch_unlikely(&enable_evmcs) && - !hv_get_vp_assist_page(cpu)) - return -EFAULT; - - intel_pt_handle_vmx(1); - - r = kvm_cpu_vmxon(phys_addr); - if (r) { - intel_pt_handle_vmx(0); - return r; - } - - if (enable_ept) - ept_sync_global(); - - return 0; -} - -static void vmclear_local_loaded_vmcss(void) -{ - int cpu = raw_smp_processor_id(); - struct loaded_vmcs *v, *n; - - list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) - __loaded_vmcs_clear(v); -} - -static void vmx_hardware_disable(void) -{ - vmclear_local_loaded_vmcss(); - - if (cpu_vmxoff()) - kvm_spurious_fault(); - - hv_reset_evmcs(); - - intel_pt_handle_vmx(0); -} - /* * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID * directly instead of going through cpu_has(), to ensure KVM is trapping @@ -2819,6 +2746,114 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, return 0; } +static bool __init kvm_is_vmx_supported(void) +{ + if (!cpu_has_vmx()) { + pr_err("CPU doesn't support VMX\n"); + return false; + } + + if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || + !this_cpu_has(X86_FEATURE_VMX)) { + pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL\n"); + return false; + } + + return true; +} + +static int __init vmx_check_processor_compat(void) +{ + struct vmcs_config vmcs_conf; + struct vmx_capability vmx_cap; + + if (!kvm_is_vmx_supported()) + return -EIO; + + if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) + return -EIO; + if (nested) + nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { + pr_err("CPU %d feature inconsistency!\n", smp_processor_id()); + return -EIO; + } + return 0; +} + +static int kvm_cpu_vmxon(u64 vmxon_pointer) +{ + u64 msr; + + cr4_set_bits(X86_CR4_VMXE); + + asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" + _ASM_EXTABLE(1b, %l[fault]) + : : [vmxon_pointer] "m"(vmxon_pointer) + : : fault); + return 0; + +fault: + WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", + rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); + cr4_clear_bits(X86_CR4_VMXE); + + return -EFAULT; +} + +static int vmx_hardware_enable(void) +{ + int cpu = raw_smp_processor_id(); + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + int r; + + if (cr4_read_shadow() & X86_CR4_VMXE) + return -EBUSY; + + /* + * This can happen if we hot-added a CPU but failed to allocate + * VP assist page for it. + */ + if (static_branch_unlikely(&enable_evmcs) && + !hv_get_vp_assist_page(cpu)) + return -EFAULT; + + intel_pt_handle_vmx(1); + + r = kvm_cpu_vmxon(phys_addr); + if (r) { + intel_pt_handle_vmx(0); + return r; + } + + if (enable_ept) + ept_sync_global(); + + return 0; +} + +static void vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v, *n; + + list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + __loaded_vmcs_clear(v); +} + +static void vmx_hardware_disable(void) +{ + vmclear_local_loaded_vmcss(); + + if (cpu_vmxoff()) + kvm_spurious_fault(); + + hv_reset_evmcs(); + + intel_pt_handle_vmx(0); +} + struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) { int node = cpu_to_node(cpu); @@ -7511,41 +7546,6 @@ static int vmx_vm_init(struct kvm *kvm) return 0; } -static bool __init kvm_is_vmx_supported(void) -{ - if (!cpu_has_vmx()) { - pr_err("CPU doesn't support VMX\n"); - return false; - } - - if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || - !this_cpu_has(X86_FEATURE_VMX)) { - pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL\n"); - return false; - } - - return true; -} - -static int __init vmx_check_processor_compat(void) -{ - struct vmcs_config vmcs_conf; - struct vmx_capability vmx_cap; - - if (!kvm_is_vmx_supported()) - return -EIO; - - if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) - return -EIO; - if (nested) - nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); - if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { - pr_err("CPU %d feature inconsistency!\n", smp_processor_id()); - return -EIO; - } - return 0; -} - static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; -- cgit v1.2.3 From 325fc9579c2e415e8ae3d1ab56a5157751157869 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:22 +0000 Subject: KVM: SVM: Check for SVM support in CPU compatibility checks Check that SVM is supported and enabled in the processor compatibility checks. SVM already checks for support during hardware enabling, i.e. this doesn't really add new functionality. The net effect is that KVM will refuse to load if a CPU doesn't have SVM fully enabled, as opposed to failing KVM_CREATE_VM. Opportunistically move svm_check_processor_compat() up in svm.c so that it can be invoked during hardware enabling in a future patch. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-39-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 177ee63c8197..3def932a809f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -543,6 +543,14 @@ static bool kvm_is_svm_supported(void) return true; } +static int __init svm_check_processor_compat(void) +{ + if (!kvm_is_svm_supported()) + return -EIO; + + return 0; +} + void __svm_write_tsc_multiplier(u64 multiplier) { preempt_disable(); @@ -4093,11 +4101,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xd9; } -static int __init svm_check_processor_compat(void) -{ - return 0; -} - /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. -- cgit v1.2.3 From d83420c2d74e660f207a708f6574e952652a4698 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:23 +0000 Subject: KVM: x86: Move CPU compat checks hook to kvm_x86_ops (from kvm_x86_init_ops) Move the .check_processor_compatibility() callback from kvm_x86_init_ops to kvm_x86_ops to allow a future patch to do compatibility checks during CPU hotplug. Do kvm_ops_update() before compat checks so that static_call() can be used during compat checks. Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Message-Id: <20221130230934.1014142-40-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/svm/svm.c | 5 +++-- arch/x86/kvm/vmx/hyperv.c | 2 +- arch/x86/kvm/vmx/hyperv.h | 2 +- arch/x86/kvm/vmx/vmx.c | 16 ++++++++-------- arch/x86/kvm/x86.c | 31 +++++++++++-------------------- 7 files changed, 27 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index abccd51dcfca..dba2909e5ae2 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -14,6 +14,7 @@ BUILD_BUG_ON(1) * to make a definition optional, but in this case the default will * be __static_call_return0. */ +KVM_X86_OP(check_processor_compatibility) KVM_X86_OP(hardware_enable) KVM_X86_OP(hardware_disable) KVM_X86_OP(hardware_unsetup) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f20a30054688..6a9d6f99d0d3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1511,6 +1511,8 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) struct kvm_x86_ops { const char *name; + int (*check_processor_compatibility)(void); + int (*hardware_enable)(void); void (*hardware_disable)(void); void (*hardware_unsetup)(void); @@ -1722,7 +1724,6 @@ struct kvm_x86_nested_ops { }; struct kvm_x86_init_ops { - int (*check_processor_compatibility)(void); int (*hardware_setup)(void); unsigned int (*handle_intel_pt_intr)(void); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3def932a809f..b106cbb24c4d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -543,7 +543,7 @@ static bool kvm_is_svm_supported(void) return true; } -static int __init svm_check_processor_compat(void) +static int svm_check_processor_compat(void) { if (!kvm_is_svm_supported()) return -EIO; @@ -4701,6 +4701,8 @@ static int svm_vm_init(struct kvm *kvm) static struct kvm_x86_ops svm_x86_ops __initdata = { .name = KBUILD_MODNAME, + .check_processor_compatibility = svm_check_processor_compat, + .hardware_unsetup = svm_hardware_unsetup, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, @@ -5085,7 +5087,6 @@ err: static struct kvm_x86_init_ops svm_init_ops __initdata = { .hardware_setup = svm_hardware_setup, - .check_processor_compatibility = svm_check_processor_compat, .runtime_ops = &svm_x86_ops, .pmu_ops = &amd_pmu_ops, diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index 2a26a0f27d48..22daca752797 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -526,7 +526,7 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12) } \ while (0) -__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) +void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL); evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL); diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index e03692958408..ab08a9b9ab7d 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -271,7 +271,7 @@ static inline void evmcs_load(u64 phys_addr) vp_ap->enlighten_vmentry = 1; } -__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); +void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ static __always_inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a39c6b6e5d94..3d5e93ceed6b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2551,8 +2551,7 @@ static bool cpu_has_perf_global_ctrl_bug(void) return false; } -static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, - u32 msr, u32 *result) +static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { u32 vmx_msr_low, vmx_msr_high; u32 ctl = ctl_min | ctl_opt; @@ -2570,7 +2569,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } -static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) { u64 allowed; @@ -2579,8 +2578,8 @@ static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) return ctl_opt & allowed; } -static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, - struct vmx_capability *vmx_cap) +static int setup_vmcs_config(struct vmcs_config *vmcs_conf, + struct vmx_capability *vmx_cap) { u32 vmx_msr_low, vmx_msr_high; u32 _pin_based_exec_control = 0; @@ -2746,7 +2745,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, return 0; } -static bool __init kvm_is_vmx_supported(void) +static bool kvm_is_vmx_supported(void) { if (!cpu_has_vmx()) { pr_err("CPU doesn't support VMX\n"); @@ -2762,7 +2761,7 @@ static bool __init kvm_is_vmx_supported(void) return true; } -static int __init vmx_check_processor_compat(void) +static int vmx_check_processor_compat(void) { struct vmcs_config vmcs_conf; struct vmx_capability vmx_cap; @@ -8147,6 +8146,8 @@ static void vmx_vm_destroy(struct kvm *kvm) static struct kvm_x86_ops vmx_x86_ops __initdata = { .name = KBUILD_MODNAME, + .check_processor_compatibility = vmx_check_processor_compat, + .hardware_unsetup = vmx_hardware_unsetup, .hardware_enable = vmx_hardware_enable, @@ -8544,7 +8545,6 @@ static __init int hardware_setup(void) } static struct kvm_x86_init_ops vmx_init_ops __initdata = { - .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, .handle_intel_pt_intr = NULL, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cad48a80697f..9870a746364f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9294,12 +9294,7 @@ static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) kvm_pmu_ops_update(ops->pmu_ops); } -struct kvm_cpu_compat_check { - struct kvm_x86_init_ops *ops; - int *ret; -}; - -static int kvm_x86_check_processor_compatibility(struct kvm_x86_init_ops *ops) +static int kvm_x86_check_processor_compatibility(void) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); @@ -9309,19 +9304,16 @@ static int kvm_x86_check_processor_compatibility(struct kvm_x86_init_ops *ops) __cr4_reserved_bits(cpu_has, &boot_cpu_data)) return -EIO; - return ops->check_processor_compatibility(); + return static_call(kvm_x86_check_processor_compatibility)(); } -static void kvm_x86_check_cpu_compat(void *data) +static void kvm_x86_check_cpu_compat(void *ret) { - struct kvm_cpu_compat_check *c = data; - - *c->ret = kvm_x86_check_processor_compatibility(c->ops); + *(int *)ret = kvm_x86_check_processor_compatibility(); } static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) { - struct kvm_cpu_compat_check c; u64 host_pat; int r, cpu; @@ -9392,12 +9384,12 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (r != 0) goto out_mmu_exit; - c.ret = &r; - c.ops = ops; + kvm_ops_update(ops); + for_each_online_cpu(cpu) { - smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &c, 1); + smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1); if (r < 0) - goto out_hardware_unsetup; + goto out_unwind_ops; } /* @@ -9405,8 +9397,6 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) * absolutely necessary, as most operations from this point forward * require unwinding. */ - kvm_ops_update(ops); - kvm_timer_init(); if (pi_inject_timer == -1) @@ -9442,8 +9432,9 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_init_msr_list(); return 0; -out_hardware_unsetup: - ops->runtime_ops->hardware_unsetup(); +out_unwind_ops: + kvm_x86_ops.hardware_enable = NULL; + static_call(kvm_x86_hardware_unsetup)(); out_mmu_exit: kvm_mmu_vendor_module_exit(); out_free_percpu: -- cgit v1.2.3 From c82a5c5c53c5b732a9610736841a117373e43adb Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Wed, 30 Nov 2022 23:09:24 +0000 Subject: KVM: x86: Do compatibility checks when onlining CPU Do compatibility checks when enabling hardware to effectively add compatibility checks when onlining a CPU. Abort enabling, i.e. the online process, if the (hotplugged) CPU is incompatible with the known good setup. At init time, KVM does compatibility checks to ensure that all online CPUs support hardware virtualization and a common set of features. But KVM uses hotplugged CPUs without such compatibility checks. On Intel CPUs, this leads to #GP if the hotplugged CPU doesn't support VMX, or VM-Entry failure if the hotplugged CPU doesn't support all features enabled by KVM. Note, this is little more than a NOP on SVM, as SVM already checks for full SVM support during hardware enabling. Opportunistically add a pr_err() if setup_vmcs_config() fails, and tweak all error messages to output which CPU failed. Signed-off-by: Chao Gao Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Acked-by: Kai Huang Message-Id: <20221130230934.1014142-41-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 8 +++----- arch/x86/kvm/vmx/vmx.c | 15 ++++++++++----- arch/x86/kvm/x86.c | 5 +++++ 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b106cbb24c4d..799b24801d31 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -521,11 +521,12 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu) static bool kvm_is_svm_supported(void) { + int cpu = raw_smp_processor_id(); const char *msg; u64 vm_cr; if (!cpu_has_svm(&msg)) { - pr_err("SVM not supported, %s\n", msg); + pr_err("SVM not supported by CPU %d, %s\n", cpu, msg); return false; } @@ -536,7 +537,7 @@ static bool kvm_is_svm_supported(void) rdmsrl(MSR_VM_CR, vm_cr); if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) { - pr_err("SVM disabled (by BIOS) in MSR_VM_CR\n"); + pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu); return false; } @@ -587,9 +588,6 @@ static int svm_hardware_enable(void) if (efer & EFER_SVME) return -EBUSY; - if (!kvm_is_svm_supported()) - return -EINVAL; - sd = per_cpu_ptr(&svm_data, me); sd->asid_generation = 1; sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3d5e93ceed6b..73005d7e4e43 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2747,14 +2747,16 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, static bool kvm_is_vmx_supported(void) { + int cpu = raw_smp_processor_id(); + if (!cpu_has_vmx()) { - pr_err("CPU doesn't support VMX\n"); + pr_err("VMX not supported by CPU %d\n", cpu); return false; } if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || !this_cpu_has(X86_FEATURE_VMX)) { - pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL\n"); + pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); return false; } @@ -2763,18 +2765,21 @@ static bool kvm_is_vmx_supported(void) static int vmx_check_processor_compat(void) { + int cpu = raw_smp_processor_id(); struct vmcs_config vmcs_conf; struct vmx_capability vmx_cap; if (!kvm_is_vmx_supported()) return -EIO; - if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) + if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { + pr_err("Failed to setup VMCS config on CPU %d\n", cpu); return -EIO; + } if (nested) nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); - if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { - pr_err("CPU %d feature inconsistency!\n", smp_processor_id()); + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { + pr_err("Inconsistent VMCS config on CPU %d\n", cpu); return -EIO; } return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9870a746364f..6575d9e7b9b6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11980,6 +11980,11 @@ int kvm_arch_hardware_enable(void) bool stable, backwards_tsc = false; kvm_user_return_msr_cpu_online(); + + ret = kvm_x86_check_processor_compatibility(); + if (ret) + return ret; + ret = static_call(kvm_x86_hardware_enable)(); if (ret != 0) return ret; -- cgit v1.2.3 From aaf12a7b4323eb7d94677bcefc286ff6b772ed1c Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Wed, 30 Nov 2022 23:09:25 +0000 Subject: KVM: Rename and move CPUHP_AP_KVM_STARTING to ONLINE section The CPU STARTING section doesn't allow callbacks to fail. Move KVM's hotplug callback to ONLINE section so that it can abort onlining a CPU in certain cases to avoid potentially breaking VMs running on existing CPUs. For example, when KVM fails to enable hardware virtualization on the hotplugged CPU. Place KVM's hotplug state before CPUHP_AP_SCHED_WAIT_EMPTY as it ensures when offlining a CPU, all user tasks and non-pinned kernel tasks have left the CPU, i.e. there cannot be a vCPU task around. So, it is safe for KVM's CPU offline callback to disable hardware virtualization at that point. Likewise, KVM's online callback can enable hardware virtualization before any vCPU task gets a chance to run on hotplugged CPUs. Drop kvm_x86_check_processor_compatibility()'s WARN that IRQs are disabled, as the ONLINE section runs with IRQs disabled. The WARN wasn't intended to be a requirement, e.g. disabling preemption is sufficient, the IRQ thing was purely an aggressive sanity check since the helper was only ever invoked via SMP function call. Rename KVM's CPU hotplug callbacks accordingly. Suggested-by: Thomas Gleixner Signed-off-by: Chao Gao Signed-off-by: Isaku Yamahata Reviewed-by: Yuan Yao [sean: drop WARN that IRQs are disabled] Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-42-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 -- include/linux/cpuhotplug.h | 2 +- virt/kvm/kvm_main.c | 30 ++++++++++++++++++++++-------- 3 files changed, 23 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6575d9e7b9b6..f2971821ec26 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9298,8 +9298,6 @@ static int kvm_x86_check_processor_compatibility(void) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); - WARN_ON(!irqs_disabled()); - if (__cr4_reserved_bits(cpu_has, c) != __cr4_reserved_bits(cpu_has, &boot_cpu_data)) return -EIO; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 5cae6bd22f7f..5b2f8147d1ae 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -187,7 +187,6 @@ enum cpuhp_state { CPUHP_AP_CSKY_TIMER_STARTING, CPUHP_AP_TI_GP_TIMER_STARTING, CPUHP_AP_HYPERV_TIMER_STARTING, - CPUHP_AP_KVM_STARTING, /* Must be the last timer callback */ CPUHP_AP_DUMMY_TIMER_STARTING, CPUHP_AP_ARM_XEN_STARTING, @@ -202,6 +201,7 @@ enum cpuhp_state { /* Online section invoked on the hotplugged CPU from the hotplug thread */ CPUHP_AP_ONLINE_IDLE, + CPUHP_AP_KVM_ONLINE, CPUHP_AP_SCHED_WAIT_EMPTY, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_X86_VDSO_VMA_ONLINE, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e13b369cfc1b..ee1005cb99e1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5114,13 +5114,27 @@ static void hardware_enable_nolock(void *junk) } } -static int kvm_starting_cpu(unsigned int cpu) +static int kvm_online_cpu(unsigned int cpu) { + int ret = 0; + + /* + * Abort the CPU online process if hardware virtualization cannot + * be enabled. Otherwise running VMs would encounter unrecoverable + * errors when scheduled to this CPU. + */ raw_spin_lock(&kvm_count_lock); - if (kvm_usage_count) + if (kvm_usage_count) { + WARN_ON_ONCE(atomic_read(&hardware_enable_failed)); + hardware_enable_nolock(NULL); + if (atomic_read(&hardware_enable_failed)) { + atomic_set(&hardware_enable_failed, 0); + ret = -EIO; + } + } raw_spin_unlock(&kvm_count_lock); - return 0; + return ret; } static void hardware_disable_nolock(void *junk) @@ -5133,7 +5147,7 @@ static void hardware_disable_nolock(void *junk) kvm_arch_hardware_disable(); } -static int kvm_dying_cpu(unsigned int cpu) +static int kvm_offline_cpu(unsigned int cpu) { raw_spin_lock(&kvm_count_lock); if (kvm_usage_count) @@ -5910,8 +5924,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) return -ENOMEM; - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", - kvm_starting_cpu, kvm_dying_cpu); + r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", + kvm_online_cpu, kvm_offline_cpu); if (r) goto out_free_2; register_reboot_notifier(&kvm_reboot_notifier); @@ -5985,7 +5999,7 @@ out_free_4: kmem_cache_destroy(kvm_vcpu_cache); out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); + cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); out_free_2: free_cpumask_var(cpus_hardware_enabled); return r; @@ -6011,7 +6025,7 @@ void kvm_exit(void) kvm_async_pf_deinit(); unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); + cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); -- cgit v1.2.3 From e4aa7f88af1a123863530af4a238ce64ec8fad5a Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Wed, 30 Nov 2022 23:09:26 +0000 Subject: KVM: Disable CPU hotplug during hardware enabling/disabling Disable CPU hotplug when enabling/disabling hardware to prevent the corner case where if the following sequence occurs: 1. A hotplugged CPU marks itself online in cpu_online_mask 2. The hotplugged CPU enables interrupt before invoking KVM's ONLINE callback 3 hardware_{en,dis}able_all() is invoked on another CPU the hotplugged CPU will be included in on_each_cpu() and thus get sent through hardware_{en,dis}able_nolock() before kvm_online_cpu() is called. start_secondary { ... set_cpu_online(smp_processor_id(), true); <- 1 ... local_irq_enable(); <- 2 ... cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); <- 3 } KVM currently fudges around this race by keeping track of which CPUs have done hardware enabling (see commit 1b6c016818a5 "KVM: Keep track of which cpus have virtualization enabled"), but that's an inefficient, convoluted, and hacky solution. Signed-off-by: Chao Gao [sean: split to separate patch, write changelog] Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-43-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 11 ++++++++++- virt/kvm/kvm_main.c | 12 ++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f2971821ec26..c3ac88036b52 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9296,7 +9296,16 @@ static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) static int kvm_x86_check_processor_compatibility(void) { - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + int cpu = smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + /* + * Compatibility checks are done when loading KVM and when enabling + * hardware, e.g. during CPU hotplug, to ensure all online CPUs are + * compatible, i.e. KVM should never perform a compatibility check on + * an offline CPU. + */ + WARN_ON(!cpu_online(cpu)); if (__cr4_reserved_bits(cpu_has, c) != __cr4_reserved_bits(cpu_has, &boot_cpu_data)) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ee1005cb99e1..2d6203ecab88 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5167,15 +5167,26 @@ static void hardware_disable_all_nolock(void) static void hardware_disable_all(void) { + cpus_read_lock(); raw_spin_lock(&kvm_count_lock); hardware_disable_all_nolock(); raw_spin_unlock(&kvm_count_lock); + cpus_read_unlock(); } static int hardware_enable_all(void) { int r = 0; + /* + * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu() + * is called, and so on_each_cpu() between them includes the CPU that + * is being onlined. As a result, hardware_enable_nolock() may get + * invoked before kvm_online_cpu(), which also enables hardware if the + * usage count is non-zero. Disable CPU hotplug to avoid attempting to + * enable hardware multiple times. + */ + cpus_read_lock(); raw_spin_lock(&kvm_count_lock); kvm_usage_count++; @@ -5190,6 +5201,7 @@ static int hardware_enable_all(void) } raw_spin_unlock(&kvm_count_lock); + cpus_read_unlock(); return r; } -- cgit v1.2.3 From 441f7bfa99fe2b8a7e504aa72047e20579e88a5d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:33 +0000 Subject: KVM: Opt out of generic hardware enabling on s390 and PPC Allow architectures to opt out of the generic hardware enabling logic, and opt out on both s390 and PPC, which don't need to manually enable virtualization as it's always on (when available). In addition to letting s390 and PPC drop a bit of dead code, this will hopefully also allow ARM to clean up its related code, e.g. ARM has its own per-CPU flag to track which CPUs have enable hardware due to the need to keep hardware enabled indefinitely when pKVM is enabled. Signed-off-by: Sean Christopherson Acked-by: Anup Patel Message-Id: <20221130230934.1014142-50-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/Kconfig | 1 + arch/mips/kvm/Kconfig | 1 + arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/kvm/powerpc.c | 5 ----- arch/riscv/kvm/Kconfig | 1 + arch/s390/include/asm/kvm_host.h | 1 - arch/s390/kvm/kvm-s390.c | 6 ------ arch/x86/kvm/Kconfig | 1 + include/linux/kvm_host.h | 4 ++++ virt/kvm/Kconfig | 3 +++ virt/kvm/kvm_main.c | 30 ++++++++++++++++++++++++------ 11 files changed, 35 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 05da3c8f7e88..ca6eadeb7d1a 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -21,6 +21,7 @@ if VIRTUALIZATION menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM + select KVM_GENERIC_HARDWARE_ENABLING select MMU_NOTIFIER select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 91d197bee9c0..29e51649203b 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -28,6 +28,7 @@ config KVM select MMU_NOTIFIER select SRCU select INTERVAL_TREE + select KVM_GENERIC_HARDWARE_ENABLING help Support for hosting Guest kernels. diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0a80e80c7b9e..959f566a455c 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -876,7 +876,6 @@ struct kvm_vcpu_arch { #define __KVM_HAVE_ARCH_WQP #define __KVM_HAVE_CREATE_DEVICE -static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index f5b4ff6bfc89..4c5405fc5538 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -435,11 +435,6 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, } EXPORT_SYMBOL_GPL(kvmppc_ld); -int kvm_arch_hardware_enable(void) -{ - return 0; -} - int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { struct kvmppc_ops *kvm_ops = NULL; diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index f36a737d5f96..d5a658a047a7 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -20,6 +20,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)" depends on RISCV_SBI && MMU + select KVM_GENERIC_HARDWARE_ENABLING select MMU_NOTIFIER select PREEMPT_NOTIFIERS select KVM_MMIO diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d67ce719d16a..2bbc3d54959d 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -1031,7 +1031,6 @@ extern char sie_exit; extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); -static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7ad8252e92c2..bd25076aa19b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -256,12 +256,6 @@ debug_info_t *kvm_s390_dbf; debug_info_t *kvm_s390_dbf_uv; /* Section: not file related */ -int kvm_arch_hardware_enable(void) -{ - /* every s390 is virtualization enabled ;-) */ - return 0; -} - /* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fbeaa9ddef59..8e578311ca9d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -49,6 +49,7 @@ config KVM select SRCU select INTERVAL_TREE select HAVE_KVM_PM_NOTIFIER if PM + select KVM_GENERIC_HARDWARE_ENABLING help Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 20e207e4c6c1..109b18e2789c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1441,8 +1441,10 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} #endif +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING int kvm_arch_hardware_enable(void); void kvm_arch_hardware_disable(void); +#endif int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); @@ -2074,7 +2076,9 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) } } +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING extern bool kvm_rebooting; +#endif extern unsigned int halt_poll_ns; extern unsigned int halt_poll_ns_grow; diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 9fb1ff6f19e5..b74916de5183 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -92,3 +92,6 @@ config KVM_XFER_TO_GUEST_WORK config HAVE_KVM_PM_NOTIFIER bool + +config KVM_GENERIC_HARDWARE_ENABLING + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6c0bd9a8e27e..3d0678579c5e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -102,9 +102,6 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); DEFINE_MUTEX(kvm_lock); LIST_HEAD(vm_list); -static DEFINE_PER_CPU(bool, hardware_enabled); -static int kvm_usage_count; - static struct kmem_cache *kvm_vcpu_cache; static __read_mostly struct preempt_ops kvm_preempt_ops; @@ -146,9 +143,6 @@ static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -__visible bool kvm_rebooting; -EXPORT_SYMBOL_GPL(kvm_rebooting); - #define KVM_EVENT_CREATE_VM 0 #define KVM_EVENT_DESTROY_VM 1 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); @@ -5093,6 +5087,13 @@ static struct miscdevice kvm_dev = { &kvm_chardev_ops, }; +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +__visible bool kvm_rebooting; +EXPORT_SYMBOL_GPL(kvm_rebooting); + +static DEFINE_PER_CPU(bool, hardware_enabled); +static int kvm_usage_count; + static int __hardware_enable_nolock(void) { if (__this_cpu_read(hardware_enabled)) @@ -5254,6 +5255,17 @@ static struct syscore_ops kvm_syscore_ops = { .suspend = kvm_suspend, .resume = kvm_resume, }; +#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ +static int hardware_enable_all(void) +{ + return 0; +} + +static void hardware_disable_all(void) +{ + +} +#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ static void kvm_io_bus_destroy(struct kvm_io_bus *bus) { @@ -5942,6 +5954,7 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) int r; int cpu; +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", kvm_online_cpu, kvm_offline_cpu); if (r) @@ -5949,6 +5962,7 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) register_reboot_notifier(&kvm_reboot_notifier); register_syscore_ops(&kvm_syscore_ops); +#endif /* A kmem cache lets us meet the alignment requirements of fx_save. */ if (!vcpu_align) @@ -6016,9 +6030,11 @@ out_free_4: free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); out_free_3: +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); +#endif return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -6040,9 +6056,11 @@ void kvm_exit(void) kmem_cache_destroy(kvm_vcpu_cache); kvm_vfio_ops_exit(); kvm_async_pf_deinit(); +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); +#endif kvm_irqfd_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit v1.2.3 From 0a19807b464fb10aa79b9dd7f494bc317438fada Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:34 +0000 Subject: KVM: x86: Blindly get current x2APIC reg value on "nodecode write" traps When emulating a x2APIC write in response to an APICv/AVIC trap, get the the written value from the vAPIC page without checking that reads are allowed for the target register. AVIC can generate trap-like VM-Exits on writes to EOI, and so KVM needs to get the written value from the backing page without running afoul of EOI's write-only behavior. Alternatively, EOI could be special cased to always write '0', e.g. so that the sanity check could be preserved, but x2APIC on AMD is actually supposed to disallow non-zero writes (not emulated by KVM), and the sanity check was a byproduct of how the KVM code was written, i.e. wasn't added to guard against anything in particular. Fixes: 70c8327c11c6 ("KVM: x86: Bug the VM if an accelerated x2APIC trap occurs on a "bad" reg") Fixes: 1bd9dfec9fd4 ("KVM: x86: Do not block APIC write for non ICR registers") Reported-by: Alejandro Jimenez Cc: stable@vger.kernel.org Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4efdb4a4d72c..5c0f93fc073a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2284,23 +2284,18 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) struct kvm_lapic *apic = vcpu->arch.apic; u64 val; - if (apic_x2apic_mode(apic)) { - if (KVM_BUG_ON(kvm_lapic_msr_read(apic, offset, &val), vcpu->kvm)) - return; - } else { - val = kvm_lapic_get_reg(apic, offset); - } - /* * ICR is a single 64-bit register when x2APIC is enabled. For legacy * xAPIC, ICR writes need to go down the common (slightly slower) path * to get the upper half from ICR2. */ if (apic_x2apic_mode(apic) && offset == APIC_ICR) { + val = kvm_lapic_get_reg64(apic, APIC_ICR); kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32)); trace_kvm_apic_write(APIC_ICR, val); } else { /* TODO: optimize to just emulate side effect w/o one more write */ + val = kvm_lapic_get_reg(apic, offset); kvm_lapic_reg_write(apic, offset, (u32)val); } } -- cgit v1.2.3 From 97a71c444a147ae41c7d0ab5b3d855d7f762f3ed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:35 +0000 Subject: KVM: x86: Purge "highest ISR" cache when updating APICv state Purge the "highest ISR" cache when updating APICv state on a vCPU. The cache must not be used when APICv is active as hardware may emulate EOIs (and other operations) without exiting to KVM. This fixes a bug where KVM will effectively block IRQs in perpetuity due to the "highest ISR" never getting reset if APICv is activated on a vCPU while an IRQ is in-service. Hardware emulates the EOI and KVM never gets a chance to update its cache. Fixes: b26a695a1d78 ("kvm: lapic: Introduce APICv update helper function") Cc: stable@vger.kernel.org Cc: Suravee Suthikulpanit Cc: Maxim Levitsky Reviewed-by: Paolo Bonzini Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5c0f93fc073a..33a661d82da7 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2424,6 +2424,7 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) */ apic->isr_count = count_vectors(apic->regs + APIC_ISR); } + apic->highest_isr_cache = -1; } void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -2479,7 +2480,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } kvm_apic_update_apicv(vcpu); - apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -2767,7 +2767,6 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) __start_apic_timer(apic, APIC_TMCCT); kvm_lapic_set_reg(apic, APIC_TMCCT, 0); kvm_apic_update_apicv(vcpu); - apic->highest_isr_cache = -1; if (apic->apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); -- cgit v1.2.3 From 0ccf3e7cb95a2db8ddb2a44812037ffba8166dc9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:36 +0000 Subject: KVM: SVM: Flush the "current" TLB when activating AVIC Flush the TLB when activating AVIC as the CPU can insert into the TLB while AVIC is "locally" disabled. KVM doesn't treat "APIC hardware disabled" as VM-wide AVIC inhibition, and so when a vCPU has its APIC hardware disabled, AVIC is not guaranteed to be inhibited. As a result, KVM may create a valid NPT mapping for the APIC base, which the CPU can cache as a non-AVIC translation. Note, Intel handles this in vmx_set_virtual_apic_mode(). Reviewed-by: Paolo Bonzini Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20230106011306.85230-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 6919dee69f18..712330b80891 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -86,6 +86,12 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) /* Disabling MSR intercept for x2APIC registers */ svm_set_x2apic_msr_interception(svm, false); } else { + /* + * Flush the TLB, the guest may have inserted a non-APIC + * mapping into the TLB while AVIC was disabled. + */ + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); + /* For xAVIC and hybrid-xAVIC modes */ vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; /* Enabling MSR intercept for x2APIC registers */ -- cgit v1.2.3 From 5aede752a839904059c2b5d68be0dc4501c6c15f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:37 +0000 Subject: KVM: SVM: Process ICR on AVIC IPI delivery failure due to invalid target Emulate ICR writes on AVIC IPI failures due to invalid targets using the same logic as failures due to invalid types. AVIC acceleration fails if _any_ of the targets are invalid, and crucially VM-Exits before sending IPIs to targets that _are_ valid. In logical mode, the destination is a bitmap, i.e. a single IPI can target multiple logical IDs. Doing nothing causes KVM to drop IPIs if at least one target is valid and at least one target is invalid. Fixes: 18f40c53e10f ("svm: Add VMEXIT handlers for AVIC") Cc: stable@vger.kernel.org Reviewed-by: Paolo Bonzini Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 712330b80891..3b2c88b168ba 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -502,14 +502,18 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index); switch (id) { + case AVIC_IPI_FAILURE_INVALID_TARGET: case AVIC_IPI_FAILURE_INVALID_INT_TYPE: /* * Emulate IPIs that are not handled by AVIC hardware, which - * only virtualizes Fixed, Edge-Triggered INTRs. The exit is - * a trap, e.g. ICR holds the correct value and RIP has been - * advanced, KVM is responsible only for emulating the IPI. - * Sadly, hardware may sometimes leave the BUSY flag set, in - * which case KVM needs to emulate the ICR write as well in + * only virtualizes Fixed, Edge-Triggered INTRs, and falls over + * if _any_ targets are invalid, e.g. if the logical mode mask + * is a superset of running vCPUs. + * + * The exit is a trap, e.g. ICR holds the correct value and RIP + * has been advanced, KVM is responsible only for emulating the + * IPI. Sadly, hardware may sometimes leave the BUSY flag set, + * in which case KVM needs to emulate the ICR write as well in * order to clear the BUSY flag. */ if (icrl & APIC_ICR_BUSY) @@ -525,8 +529,6 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) */ avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index); break; - case AVIC_IPI_FAILURE_INVALID_TARGET: - break; case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: WARN_ONCE(1, "Invalid backing page\n"); break; -- cgit v1.2.3 From a58a66afc464d6d2ec294cd3102f36f3652e7ce4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:38 +0000 Subject: KVM: x86: Don't inhibit APICv/AVIC on xAPIC ID "change" if APIC is disabled Don't inhibit APICv/AVIC due to an xAPIC ID mismatch if the APIC is hardware disabled. The ID cannot be consumed while the APIC is disabled, and the ID is guaranteed to be set back to the vcpu_id when the APIC is hardware enabled (architectural behavior correctly emulated by KVM). Fixes: 3743c2f02517 ("KVM: x86: inhibit APICv/AVIC on changes to APIC ID or APIC base") Cc: stable@vger.kernel.org Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 33a661d82da7..191b5a962700 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2072,6 +2072,9 @@ static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic) { struct kvm *kvm = apic->vcpu->kvm; + if (!kvm_apic_hw_enabled(apic)) + return; + if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm)) return; -- cgit v1.2.3 From f651a008954803d7bb2d85b7042d0fd46133d782 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:39 +0000 Subject: KVM: x86: Don't inhibit APICv/AVIC if xAPIC ID mismatch is due to 32-bit ID Truncate the vcpu_id, a.k.a. x2APIC ID, to an 8-bit value when comparing it against the xAPIC ID to avoid false positives (sort of) on systems with >255 CPUs, i.e. with IDs that don't fit into a u8. The intent of APIC_ID_MODIFIED is to inhibit APICv/AVIC when the xAPIC is changed from it's original value, The mismatch isn't technically a false positive, as architecturally the xAPIC IDs do end up being aliased in this scenario, and neither APICv nor AVIC correctly handles IPI virtualization when there is aliasing. However, KVM already deliberately does not honor the aliasing behavior that results when an x2APIC ID gets truncated to an xAPIC ID. I.e. the resulting APICv/AVIC behavior is aligned with KVM's existing behavior when KVM's x2APIC hotplug hack is effectively enabled. If/when KVM provides a way to disable the hotplug hack, APICv/AVIC can piggyback whatever logic disables the optimized APIC map (which is what provides the hotplug hack), i.e. so that KVM's optimized map and APIC virtualization yield the same behavior. For now, fix the immediate problem of APIC virtualization being disabled for large VMs, which is a much more pressing issue than ensuring KVM honors architectural behavior for APIC ID aliasing. Fixes: 3743c2f02517 ("KVM: x86: inhibit APICv/AVIC on changes to APIC ID or APIC base") Reported-by: Suravee Suthikulpanit Cc: stable@vger.kernel.org Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 191b5a962700..2183a9b8efa5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2078,7 +2078,12 @@ static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic) if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm)) return; - if (kvm_xapic_id(apic) == apic->vcpu->vcpu_id) + /* + * Deliberately truncate the vCPU ID when detecting a modified APIC ID + * to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a 32-bit + * value. + */ + if (kvm_xapic_id(apic) == (u8)apic->vcpu->vcpu_id) return; kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); -- cgit v1.2.3 From e0bead97e7590da888148feb9e9133bc278c534b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:40 +0000 Subject: KVM: SVM: Don't put/load AVIC when setting virtual APIC mode Move the VMCB updates from avic_refresh_apicv_exec_ctrl() into avic_set_virtual_apic_mode() and invert the dependency being said functions to avoid calling avic_vcpu_{load,put}() and avic_set_pi_irte_mode() when "only" setting the virtual APIC mode. avic_set_virtual_apic_mode() is invoked from common x86 with preemption enabled, which makes avic_vcpu_{load,put}() unhappy. Luckily, calling those and updating IRTE stuff is unnecessary as the only reason avic_set_virtual_apic_mode() is called is to handle transitions between xAPIC and x2APIC that don't also toggle APICv activation. And if activation doesn't change, there's no need to fiddle with the physical APIC ID table or update IRTE. The "full" refresh is guaranteed to be called if activation changes in this case as the only call to the "set" path is: kvm_vcpu_update_apicv(vcpu); static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); and kvm_vcpu_update_apicv() invokes the refresh if activation changes: if (apic->apicv_active == activate) goto out; apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); Rename the helper to reflect that it is also called during "refresh". WARNING: CPU: 183 PID: 49186 at arch/x86/kvm/svm/avic.c:1081 avic_vcpu_put+0xde/0xf0 [kvm_amd] CPU: 183 PID: 49186 Comm: stable Tainted: G O 6.0.0-smp--fcddbca45f0a-sink #34 Hardware name: Google, Inc. Arcadia_IT_80/Arcadia_IT_80, BIOS 10.48.0 01/27/2022 RIP: 0010:avic_vcpu_put+0xde/0xf0 [kvm_amd] avic_refresh_apicv_exec_ctrl+0x142/0x1c0 [kvm_amd] avic_set_virtual_apic_mode+0x5a/0x70 [kvm_amd] kvm_lapic_set_base+0x149/0x1a0 [kvm] kvm_set_apic_base+0x8f/0xd0 [kvm] kvm_set_msr_common+0xa3a/0xdc0 [kvm] svm_set_msr+0x364/0x6b0 [kvm_amd] __kvm_set_msr+0xb8/0x1c0 [kvm] kvm_emulate_wrmsr+0x58/0x1d0 [kvm] msr_interception+0x1c/0x30 [kvm_amd] svm_invoke_exit_handler+0x31/0x100 [kvm_amd] svm_handle_exit+0xfc/0x160 [kvm_amd] vcpu_enter_guest+0x21bb/0x23e0 [kvm] vcpu_run+0x92/0x450 [kvm] kvm_arch_vcpu_ioctl_run+0x43e/0x6e0 [kvm] kvm_vcpu_ioctl+0x559/0x620 [kvm] Fixes: 05c4fe8c1bd9 ("KVM: SVM: Refresh AVIC configuration when changing APIC mode") Cc: stable@vger.kernel.org Cc: Suravee Suthikulpanit Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 31 +++++++++++++++---------------- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/svm/svm.h | 2 +- 3 files changed, 17 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 3b2c88b168ba..97ad0661f963 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -747,18 +747,6 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu) -{ - if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE) - return; - - if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) { - WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id); - return; - } - avic_refresh_apicv_exec_ctrl(vcpu); -} - static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int ret = 0; @@ -1100,17 +1088,18 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } - -void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb01.ptr; - bool activated = kvm_vcpu_apicv_active(vcpu); + + if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE) + return; if (!enable_apicv) return; - if (activated) { + if (kvm_vcpu_apicv_active(vcpu)) { /** * During AVIC temporary deactivation, guest could update * APIC ID, DFR and LDR registers, which would not be trapped @@ -1124,6 +1113,16 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) avic_deactivate_vmcb(svm); } vmcb_mark_dirty(vmcb, VMCB_AVIC); +} + +void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + bool activated = kvm_vcpu_apicv_active(vcpu); + + if (!enable_apicv) + return; + + avic_refresh_virtual_apic_mode(vcpu); if (activated) avic_vcpu_load(vcpu, vcpu->cpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6ffadbd57744..26044e1d2422 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4771,7 +4771,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, - .set_virtual_apic_mode = avic_set_virtual_apic_mode, + .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, .apicv_post_state_restore = avic_apicv_post_state_restore, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 4826e6cc611b..d0ed3f595229 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -648,7 +648,7 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu); -void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); +void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu); /* sev.c */ -- cgit v1.2.3 From 1459f5c6b8b8dfbc16adf4844421d46459c9ab1f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:41 +0000 Subject: KVM: x86: Handle APICv updates for APIC "mode" changes via request Use KVM_REQ_UPDATE_APICV to react to APIC "mode" changes, i.e. to handle the APIC being hardware enabled/disabled and/or x2APIC being toggled. There is no need to immediately update APICv state, the only requirement is that APICv be updating prior to the next VM-Enter. Making a request will allow piggybacking KVM_REQ_UPDATE_APICV to "inhibit" the APICv memslot when x2APIC is enabled. Doing that directly from kvm_lapic_set_base() isn't feasible as KVM's SRCU must not be held when modifying memslots (to avoid deadlock), and may or may not be held when kvm_lapic_set_base() is called, i.e. KVM can't do the right thing without tracking that is rightly buried behind CONFIG_PROVE_RCU=y. Suggested-by: Maxim Levitsky Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2183a9b8efa5..3ed74ad60516 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2401,7 +2401,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) { - kvm_vcpu_update_apicv(vcpu); + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); } -- cgit v1.2.3 From c482f2cebe2d169c43c75ca769bdeba16fce6036 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:42 +0000 Subject: KVM: x86: Move APIC access page helper to common x86 code Move the APIC access page allocation helper function to common x86 code, the allocation routine is virtually identical between APICv (VMX) and AVIC (SVM). Keep APICv's gfn_to_page() + put_page() sequence, which verifies that a backing page can be allocated, i.e. that the system isn't under heavy memory pressure. Forcing the backing page to be populated isn't strictly necessary, but skipping the effective prefetch only delays the inevitable. Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 35 +++++++++++++++++++++++++++++++++++ arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/svm/avic.c | 41 +++++++---------------------------------- arch/x86/kvm/vmx/vmx.c | 35 +---------------------------------- 4 files changed, 44 insertions(+), 68 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3ed74ad60516..e73386c26d2c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2435,6 +2435,41 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) apic->highest_isr_cache = -1; } +int kvm_alloc_apic_access_page(struct kvm *kvm) +{ + struct page *page; + void __user *hva; + int ret = 0; + + mutex_lock(&kvm->slots_lock); + if (kvm->arch.apic_access_memslot_enabled) + goto out; + + hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); + if (IS_ERR(hva)) { + ret = PTR_ERR(hva); + goto out; + } + + page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_page(page)) { + ret = -EFAULT; + goto out; + } + + /* + * Do not pin the page in memory, so that memory hot-unplug + * is able to migrate it. + */ + put_page(page); + kvm->arch.apic_access_memslot_enabled = true; +out: + mutex_unlock(&kvm->slots_lock); + return ret; +} +EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page); + void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic = vcpu->arch.apic; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 58c3242fcc7a..8c6442751dab 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -112,6 +112,7 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); void kvm_apic_update_apicv(struct kvm_vcpu *vcpu); +int kvm_alloc_apic_access_page(struct kvm *kvm); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 97ad0661f963..ec28ba4c5f1b 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -256,39 +256,6 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, return &avic_physical_id_table[index]; } -/* - * Note: - * AVIC hardware walks the nested page table to check permissions, - * but does not use the SPA address specified in the leaf page - * table entry since it uses address in the AVIC_BACKING_PAGE pointer - * field of the VMCB. Therefore, we set up the - * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. - */ -static int avic_alloc_access_page(struct kvm *kvm) -{ - void __user *ret; - int r = 0; - - mutex_lock(&kvm->slots_lock); - - if (kvm->arch.apic_access_memslot_enabled) - goto out; - - ret = __x86_set_memory_region(kvm, - APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, - APIC_DEFAULT_PHYS_BASE, - PAGE_SIZE); - if (IS_ERR(ret)) { - r = PTR_ERR(ret); - goto out; - } - - kvm->arch.apic_access_memslot_enabled = true; -out: - mutex_unlock(&kvm->slots_lock); - return r; -} - static int avic_init_backing_page(struct kvm_vcpu *vcpu) { u64 *entry, new_entry; @@ -305,7 +272,13 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (kvm_apicv_activated(vcpu->kvm)) { int ret; - ret = avic_alloc_access_page(vcpu->kvm); + /* + * Note, AVIC hardware walks the nested page table to check + * permissions, but does not use the SPA address specified in + * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE + * pointer field of the VMCB. + */ + ret = kvm_alloc_apic_access_page(vcpu->kvm); if (ret) return ret; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0220e22b89ca..8ed14fcfe870 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3808,39 +3808,6 @@ static void seg_setup(int seg) vmcs_write32(sf->ar_bytes, ar); } -static int alloc_apic_access_page(struct kvm *kvm) -{ - struct page *page; - void __user *hva; - int ret = 0; - - mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_memslot_enabled) - goto out; - hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, - APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); - if (IS_ERR(hva)) { - ret = PTR_ERR(hva); - goto out; - } - - page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) { - ret = -EFAULT; - goto out; - } - - /* - * Do not pin the page in memory, so that memory hot-unplug - * is able to migrate it. - */ - put_page(page); - kvm->arch.apic_access_memslot_enabled = true; -out: - mutex_unlock(&kvm->slots_lock); - return ret; -} - int allocate_vpid(void) { int vpid; @@ -7394,7 +7361,7 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) vmx->loaded_vmcs = &vmx->vmcs01; if (cpu_need_virtualize_apic_accesses(vcpu)) { - err = alloc_apic_access_page(vcpu->kvm); + err = kvm_alloc_apic_access_page(vcpu->kvm); if (err) goto free_vmcs; } -- cgit v1.2.3 From 2008fab3453019fd89fa06029be95d30e1a8e66c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:43 +0000 Subject: KVM: x86: Inhibit APIC memslot if x2APIC and AVIC are enabled Free the APIC access page memslot if any vCPU enables x2APIC and SVM's AVIC is enabled to prevent accesses to the virtual APIC on vCPUs with x2APIC enabled. On AMD, if its "hybrid" mode is enabled (AVIC is enabled when x2APIC is enabled even without x2AVIC support), keeping the APIC access page memslot results in the guest being able to access the virtual APIC page as x2APIC is fully emulated by KVM. I.e. hardware isn't aware that the guest is operating in x2APIC mode. Exempt nested SVM's update of APICv state from the new logic as x2APIC can't be toggled on VM-Exit. In practice, invoking the x2APIC logic should be harmless precisely because it should be a glorified nop, but play it safe to avoid latent bugs, e.g. with dropping the vCPU's SRCU lock. Intel doesn't suffer from the same issue as APICv has fully independent VMCS controls for xAPIC vs. x2APIC virtualization. Technically, KVM should provide bus error semantics and not memory semantics for the APIC page when x2APIC is enabled, but KVM already provides memory semantics in other scenarios, e.g. if APICv/AVIC is enabled and the APIC is hardware disabled (via APIC_BASE MSR). Note, checking apic_access_memslot_enabled without taking locks relies it being set during vCPU creation (before kvm_vcpu_reset()). vCPUs can race to set the inhibit and delete the memslot, i.e. can get false positives, but can't get false negatives as apic_access_memslot_enabled can't be toggled "on" once any vCPU reaches KVM_RUN. Opportunistically drop the "can" while updating avic_activate_vmcb()'s comment, i.e. to state that KVM _does_ support the hybrid mode. Move the "Note:" down a line to conform to preferred kernel/KVM multi-line comment style. Opportunistically update the apicv_update_lock comment, as it isn't actually used to protect apic_access_memslot_enabled (which is protected by slots_lock). Fixes: 0e311d33bfbe ("KVM: SVM: Introduce hybrid-AVIC mode") Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20230106011306.85230-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 10 ++++++---- arch/x86/kvm/lapic.c | 38 +++++++++++++++++++++++++++++++++++++- arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/svm/avic.c | 12 ++++++------ arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/x86.c | 27 +++++++++++++++++++++++++-- 7 files changed, 78 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c70690b2c82d..1d92c148e799 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1249,10 +1249,11 @@ struct kvm_arch { struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; - /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */ - struct rw_semaphore apicv_update_lock; - bool apic_access_memslot_enabled; + bool apic_access_memslot_inhibited; + + /* Protects apicv_inhibit_reasons */ + struct rw_semaphore apicv_update_lock; unsigned long apicv_inhibit_reasons; gpa_t wall_clock; @@ -1599,6 +1600,7 @@ struct kvm_x86_ops { void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); + bool allow_apicv_in_x2apic_without_x2apic_virtualization; void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(int isr); @@ -1973,7 +1975,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, bool kvm_apicv_activated(struct kvm *kvm); bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu); -void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); +void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason, bool set); void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e73386c26d2c..355ea688df4a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2442,7 +2442,8 @@ int kvm_alloc_apic_access_page(struct kvm *kvm) int ret = 0; mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_memslot_enabled) + if (kvm->arch.apic_access_memslot_enabled || + kvm->arch.apic_access_memslot_inhibited) goto out; hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, @@ -2470,6 +2471,41 @@ out: } EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page); +void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + if (!kvm->arch.apic_access_memslot_enabled) + return; + + kvm_vcpu_srcu_read_unlock(vcpu); + + mutex_lock(&kvm->slots_lock); + + if (kvm->arch.apic_access_memslot_enabled) { + __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0); + /* + * Clear "enabled" after the memslot is deleted so that a + * different vCPU doesn't get a false negative when checking + * the flag out of slots_lock. No additional memory barrier is + * needed as modifying memslots requires waiting other vCPUs to + * drop SRCU (see above), and false positives are ok as the + * flag is rechecked after acquiring slots_lock. + */ + kvm->arch.apic_access_memslot_enabled = false; + + /* + * Mark the memslot as inhibited to prevent reallocating the + * memslot during vCPU creation, e.g. if a vCPU is hotplugged. + */ + kvm->arch.apic_access_memslot_inhibited = true; + } + + mutex_unlock(&kvm->slots_lock); + + kvm_vcpu_srcu_read_lock(vcpu); +} + void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic = vcpu->arch.apic; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 8c6442751dab..df316ede7546 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -113,6 +113,7 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); void kvm_apic_update_apicv(struct kvm_vcpu *vcpu); int kvm_alloc_apic_access_page(struct kvm *kvm); +void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index ec28ba4c5f1b..0a75993afed6 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -72,12 +72,12 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) vmcb->control.int_ctl |= AVIC_ENABLE_MASK; - /* Note: - * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC - * MSR accesses, while interrupt injection to a running vCPU - * can be achieved using AVIC doorbell. The AVIC hardware still - * accelerate MMIO accesses, but this does not cause any harm - * as the guest is not supposed to access xAPIC mmio when uses x2APIC. + /* + * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR + * accesses, while interrupt injection to a running vCPU can be + * achieved using AVIC doorbell. KVM disables the APIC access page + * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling + * AVIC in hybrid mode activates only the doorbell mechanism. */ if (apic_x2apic_mode(svm->vcpu.arch.apic) && avic_mode == AVIC_MODE_X2) { diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index bc9cd7086fa9..34ac03969f28 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1106,7 +1106,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) * to benefit from it right away. */ if (kvm_apicv_activated(vcpu->kvm)) - kvm_vcpu_update_apicv(vcpu); + __kvm_vcpu_update_apicv(vcpu); return 0; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 26044e1d2422..7651d665723e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5028,6 +5028,8 @@ static __init int svm_hardware_setup(void) svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; + } else if (avic_mode == AVIC_MODE_X1) { + svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true; } if (vls) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 39b8dd37bc40..1abe3f1e821c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10045,7 +10045,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); } -void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) +void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; bool activate; @@ -10080,7 +10080,30 @@ out: preempt_enable(); up_read(&vcpu->kvm->arch.apicv_update_lock); } -EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); +EXPORT_SYMBOL_GPL(__kvm_vcpu_update_apicv); + +static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) +{ + if (!lapic_in_kernel(vcpu)) + return; + + /* + * Due to sharing page tables across vCPUs, the xAPIC memslot must be + * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but + * and hardware doesn't support x2APIC virtualization. E.g. some AMD + * CPUs support AVIC but not x2APIC. KVM still allows enabling AVIC in + * this case so that KVM can the AVIC doorbell to inject interrupts to + * running vCPUs, but KVM must not create SPTEs for the APIC base as + * the vCPU would incorrectly be able to access the vAPIC page via MMIO + * despite being in x2APIC mode. For simplicity, inhibiting the APIC + * access page is sticky. + */ + if (apic_x2apic_mode(vcpu->arch.apic) && + kvm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization) + kvm_inhibit_apic_access_page(vcpu); + + __kvm_vcpu_update_apicv(vcpu); +} void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason, bool set) -- cgit v1.2.3 From f628a34a9d5228d963e3c2f15e6ee92856a0a66b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:44 +0000 Subject: KVM: SVM: Replace "avic_mode" enum with "x2avic_enabled" boolean Replace the "avic_mode" enum with a single bool to track whether or not x2AVIC is enabled. KVM already has "apicv_enabled" that tracks if any flavor of AVIC is enabled, i.e. AVIC_MODE_NONE and AVIC_MODE_X1 are redundant and unnecessary noise. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20230106011306.85230-12-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 46 +++++++++++++++++++++------------------------- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/svm/svm.h | 9 +-------- 3 files changed, 24 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 0a75993afed6..10b0e996e2e3 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -53,7 +53,7 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); static u32 next_vm_id = 0; static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); -enum avic_modes avic_mode; +bool x2avic_enabled; /* * This is a wrapper of struct amd_iommu_ir_data. @@ -79,8 +79,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling * AVIC in hybrid mode activates only the doorbell mechanism. */ - if (apic_x2apic_mode(svm->vcpu.arch.apic) && - avic_mode == AVIC_MODE_X2) { + if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) { vmcb->control.int_ctl |= X2APIC_MODE_MASK; vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; /* Disabling MSR intercept for x2APIC registers */ @@ -247,8 +246,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, u64 *avic_physical_id_table; struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) || - (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID)) + if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) || + (index > X2AVIC_MAX_PHYSICAL_ID)) return NULL; avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); @@ -262,8 +261,8 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) int id = vcpu->vcpu_id; struct vcpu_svm *svm = to_svm(vcpu); - if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) || - (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID)) + if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) || + (id > X2AVIC_MAX_PHYSICAL_ID)) return -EINVAL; if (!vcpu->arch.apic->regs) @@ -1066,10 +1065,7 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb01.ptr; - if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE) - return; - - if (!enable_apicv) + if (!lapic_in_kernel(vcpu) || !enable_apicv) return; if (kvm_vcpu_apicv_active(vcpu)) { @@ -1145,32 +1141,32 @@ bool avic_hardware_setup(struct kvm_x86_ops *x86_ops) if (!npt_enabled) return false; + /* AVIC is a prerequisite for x2AVIC. */ + if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) { + if (boot_cpu_has(X86_FEATURE_X2AVIC)) { + pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled"); + pr_warn(FW_BUG "Try enable AVIC using force_avic option"); + } + return false; + } + if (boot_cpu_has(X86_FEATURE_AVIC)) { - avic_mode = AVIC_MODE_X1; pr_info("AVIC enabled\n"); } else if (force_avic) { /* * Some older systems does not advertise AVIC support. * See Revision Guide for specific AMD processor for more detail. */ - avic_mode = AVIC_MODE_X1; pr_warn("AVIC is not supported in CPUID but force enabled"); pr_warn("Your system might crash and burn"); } /* AVIC is a prerequisite for x2AVIC. */ - if (boot_cpu_has(X86_FEATURE_X2AVIC)) { - if (avic_mode == AVIC_MODE_X1) { - avic_mode = AVIC_MODE_X2; - pr_info("x2AVIC enabled\n"); - } else { - pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled"); - pr_warn(FW_BUG "Try enable AVIC using force_avic option"); - } - } + x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC); + if (x2avic_enabled) + pr_info("x2AVIC enabled\n"); - if (avic_mode != AVIC_MODE_NONE) - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - return !!avic_mode; + return true; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7651d665723e..9f172f2de195 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -813,7 +813,7 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) if (intercept == svm->x2avic_msrs_intercepted) return; - if (avic_mode != AVIC_MODE_X2 || + if (!x2avic_enabled || !apic_x2apic_mode(svm->vcpu.arch.apic)) return; @@ -5028,7 +5028,7 @@ static __init int svm_hardware_setup(void) svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; - } else if (avic_mode == AVIC_MODE_X1) { + } else if (!x2avic_enabled) { svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index d0ed3f595229..546825c82490 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -35,14 +35,7 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern int vgif; extern bool intercept_smi; - -enum avic_modes { - AVIC_MODE_NONE = 0, - AVIC_MODE_X1, - AVIC_MODE_X2, -}; - -extern enum avic_modes avic_mode; +extern bool x2avic_enabled; /* * Clean bits in VMCB. -- cgit v1.2.3 From a879a88e05f36153987ad9836e85009eb4114346 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:45 +0000 Subject: KVM: SVM: Compute dest based on sender's x2APIC status for AVIC kick Compute the destination from ICRH using the sender's x2APIC status, not each (potential) target's x2APIC status. Fixes: c514d3a348ac ("KVM: SVM: Update avic_kick_target_vcpus to support 32-bit APIC ID") Cc: Li RongQing Signed-off-by: Sean Christopherson Reviewed-by: Li RongQing Reviewed-by: Maxim Levitsky Message-Id: <20230106011306.85230-13-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 10b0e996e2e3..f2db0021c45f 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -429,6 +429,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, u32 icrl, u32 icrh, u32 index) { + u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh); unsigned long i; struct kvm_vcpu *vcpu; @@ -444,13 +445,6 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, * since entered the guest will have processed pending IRQs at VMRUN. */ kvm_for_each_vcpu(i, vcpu, kvm) { - u32 dest; - - if (apic_x2apic_mode(vcpu->arch.apic)) - dest = icrh; - else - dest = GET_XAPIC_DEST_FIELD(icrh); - if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, dest, icrl & APIC_DEST_MASK)) { vcpu->arch.apic->irr_pending = true; -- cgit v1.2.3 From da3fb46d226a8c1b61d309f3b99056c18b8d93e2 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Fri, 6 Jan 2023 01:12:46 +0000 Subject: KVM: SVM: Fix x2APIC Logical ID calculation for avic_kick_target_vcpus_fast For X2APIC ID in cluster mode, the logical ID is bit [15:0]. Fixes: 603ccef42ce9 ("KVM: x86: SVM: fix avic_kick_target_vcpus_fast") Cc: Maxim Levitsky Signed-off-by: Suravee Suthikulpanit Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index f2db0021c45f..0f67fd34ef99 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -356,7 +356,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (apic_x2apic_mode(source)) { /* 16 bit dest mask, 16 bit cluster id */ - bitmap = dest & 0xFFFF0000; + bitmap = dest & 0xFFFF; cluster = (dest >> 16) << 4; } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) { /* 8 bit dest mask*/ -- cgit v1.2.3 From f9829c9076616afa4f4811f2fc2992a734593d59 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:47 +0000 Subject: Revert "KVM: SVM: Use target APIC ID to complete x2AVIC IRQs when possible" Due to a likely mismerge of patches, KVM ended up with a superfluous commit to "enable" AVIC's fast path for x2AVIC mode. Even worse, the superfluous commit has several bugs and creates a nasty local shadow variable. Rather than fix the bugs piece-by-piece[*] to achieve the same end result, revert the patch wholesale. Opportunistically add a comment documenting the x2AVIC dependencies. This reverts commit 8c9e639da435874fb845c4d296ce55664071ea7a. [*] https://lore.kernel.org/all/YxEP7ZBRIuFWhnYJ@google.com Fixes: 8c9e639da435 ("KVM: SVM: Use target APIC ID to complete x2AVIC IRQs when possible") Suggested-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-15-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 0f67fd34ef99..e4b5f8b14882 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -378,7 +378,17 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source logid_index = cluster + __ffs(bitmap); - if (!apic_x2apic_mode(source)) { + if (apic_x2apic_mode(source)) { + /* + * For x2APIC, the logical APIC ID is a read-only value + * that is derived from the x2APIC ID, thus the x2APIC + * ID can be found by reversing the calculation (done + * above). Note, bits 31:20 of the x2APIC ID are not + * propagated to the logical ID, but KVM limits the + * x2APIC ID limited to KVM_MAX_VCPU_IDS. + */ + l1_physical_id = logid_index; + } else { u32 *avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); @@ -393,23 +403,6 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source l1_physical_id = logid_entry & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; - } else { - /* - * For x2APIC logical mode, cannot leverage the index. - * Instead, calculate physical ID from logical ID in ICRH. - */ - int cluster = (icrh & 0xffff0000) >> 16; - int apic = ffs(icrh & 0xffff) - 1; - - /* - * If the x2APIC logical ID sub-field (i.e. icrh[15:0]) - * contains anything but a single bit, we cannot use the - * fast path, because it is limited to a single vCPU. - */ - if (apic < 0 || icrh != (1 << apic)) - return -EINVAL; - - l1_physical_id = (cluster << 4) + apic; } } -- cgit v1.2.3 From 8578e4512d873b0aa9805e9772162578d54bb6a1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:48 +0000 Subject: KVM: SVM: Document that vCPU ID == APIC ID in AVIC kick fastpatch Document that AVIC is inhibited if any vCPU's APIC ID diverges from its vCPU ID, i.e. that there's no need to check for a destination match in the AVIC kick fast path. Opportunistically tweak comments to remove "guest bug", as that suggests KVM is punting on error handling, which is not the case. Targeting a non-existent vCPU or no vCPUs _may_ be a guest software bug, but whether or not it's a guest bug is irrelevant. Such behavior is architecturally legal and thus needs to faithfully emulated by KVM (and it is). Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20230106011306.85230-16-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index e4b5f8b14882..eb2ad5b54877 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -368,8 +368,8 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source cluster = (dest >> 4) << 2; } + /* Nothing to do if there are no destinations in the cluster. */ if (unlikely(!bitmap)) - /* guest bug: nobody to send the logical interrupt to */ return 0; if (!is_power_of_2(bitmap)) @@ -397,7 +397,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (WARN_ON_ONCE(index != logid_index)) return -EINVAL; - /* guest bug: non existing/reserved logical destination */ + /* Nothing to do if the logical destination is invalid. */ if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK))) return 0; @@ -406,9 +406,13 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source } } + /* + * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID, + * i.e. APIC ID == vCPU ID. Once again, nothing to do if the target + * vCPU doesn't exist. + */ target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id); if (unlikely(!target_vcpu)) - /* guest bug: non existing vCPU is a target of this IPI*/ return 0; target_vcpu->arch.apic->irr_pending = true; -- cgit v1.2.3 From 1d22a597b3e9fd4d0a7e921ce4d32321bcad8576 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:49 +0000 Subject: KVM: SVM: Add helper to perform final AVIC "kick" of single vCPU Add a helper to perform the final kick, two instances of the ICR decoding is one too many. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20230106011306.85230-17-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index eb2ad5b54877..76da9f19272e 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -317,6 +317,16 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) put_cpu(); } + +static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl) +{ + vcpu->arch.apic->irr_pending = true; + svm_complete_interrupt_delivery(vcpu, + icrl & APIC_MODE_MASK, + icrl & APIC_INT_LEVELTRIG, + icrl & APIC_VECTOR_MASK); +} + /* * A fast-path version of avic_kick_target_vcpus(), which attempts to match * destination APIC ID to vCPU without looping through all vCPUs. @@ -415,11 +425,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (unlikely(!target_vcpu)) return 0; - target_vcpu->arch.apic->irr_pending = true; - svm_complete_interrupt_delivery(target_vcpu, - icrl & APIC_MODE_MASK, - icrl & APIC_INT_LEVELTRIG, - icrl & APIC_VECTOR_MASK); + avic_kick_vcpu(target_vcpu, icrl); return 0; } @@ -443,13 +449,8 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, */ kvm_for_each_vcpu(i, vcpu, kvm) { if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, - dest, icrl & APIC_DEST_MASK)) { - vcpu->arch.apic->irr_pending = true; - svm_complete_interrupt_delivery(vcpu, - icrl & APIC_MODE_MASK, - icrl & APIC_INT_LEVELTRIG, - icrl & APIC_VECTOR_MASK); - } + dest, icrl & APIC_DEST_MASK)) + avic_kick_vcpu(vcpu, icrl); } } -- cgit v1.2.3 From 6ea567ca003ab05adef28459bde1495a250dd7b7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:50 +0000 Subject: KVM: x86: Explicitly skip optimized logical map setup if vCPU's LDR==0 Explicitly skip the optimized map setup if the vCPU's LDR is '0', i.e. if the vCPU will never respond to logical mode interrupts. KVM already skips setup in this case, but relies on kvm_apic_map_get_logical_dest() to generate mask==0. KVM still needs the mask=0 check as a non-zero LDR can yield mask==0 depending on the mode, but explicitly handling the LDR will make it simpler to clean up the logical mode tracking in the future. No functional change intended. Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-18-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 355ea688df4a..2aee712e42bb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -286,10 +286,12 @@ void kvm_recalculate_apic_map(struct kvm *kvm) continue; ldr = kvm_lapic_get_reg(apic, APIC_LDR); + if (!ldr) + continue; if (apic_x2apic_mode(apic)) { new->mode |= KVM_APIC_MODE_X2APIC; - } else if (ldr) { + } else { ldr = GET_APIC_LOGICAL_ID(ldr); if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) new->mode |= KVM_APIC_MODE_XAPIC_FLAT; -- cgit v1.2.3 From 35366901017ca20ccf72365d2922361459cb3121 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:51 +0000 Subject: KVM: x86: Explicitly track all possibilities for APIC map's logical modes Track all possibilities for the optimized APIC map's logical modes instead of overloading the pseudo-bitmap and treating any "unknown" value as "invalid". As documented by the now-stale comment above the mode values, the values did have meaning when the optimized map was originally added. That dependent logical was removed by commit e45115b62f9a ("KVM: x86: use physical LAPIC array for logical x2APIC"), but the obfuscated behavior and its comment were left behind. Opportunistically rename "mode" to "logical_mode", partly to make it clear that the "disabled" case applies only to the logical map, but also to prove that there is no lurking code that expects "mode" to be a bitmap. Functionally, this is a glorified nop. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20230106011306.85230-19-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 29 ++++++++++++++++++++--------- arch/x86/kvm/lapic.c | 40 ++++++++++++++++++++++++++++++++-------- 2 files changed, 52 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1d92c148e799..732421a0ad02 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1022,19 +1022,30 @@ struct kvm_arch_memory_slot { }; /* - * We use as the mode the number of bits allocated in the LDR for the - * logical processor ID. It happens that these are all powers of two. - * This makes it is very easy to detect cases where the APICs are - * configured for multiple modes; in that case, we cannot use the map and - * hence cannot use kvm_irq_delivery_to_apic_fast either. + * Track the mode of the optimized logical map, as the rules for decoding the + * destination vary per mode. Enabling the optimized logical map requires all + * software-enabled local APIs to be in the same mode, each addressable APIC to + * be mapped to only one MDA, and each MDA to map to at most one APIC. */ -#define KVM_APIC_MODE_XAPIC_CLUSTER 4 -#define KVM_APIC_MODE_XAPIC_FLAT 8 -#define KVM_APIC_MODE_X2APIC 16 +enum kvm_apic_logical_mode { + /* All local APICs are software disabled. */ + KVM_APIC_MODE_SW_DISABLED, + /* All software enabled local APICs in xAPIC cluster addressing mode. */ + KVM_APIC_MODE_XAPIC_CLUSTER, + /* All software enabled local APICs in xAPIC flat addressing mode. */ + KVM_APIC_MODE_XAPIC_FLAT, + /* All software enabled local APICs in x2APIC mode. */ + KVM_APIC_MODE_X2APIC, + /* + * Optimized map disabled, e.g. not all local APICs in the same logical + * mode, same logical ID assigned to multiple APICs, etc. + */ + KVM_APIC_MODE_MAP_DISABLED, +}; struct kvm_apic_map { struct rcu_head rcu; - u8 mode; + enum kvm_apic_logical_mode logical_mode; u32 max_apic_id; union { struct kvm_lapic *xapic_flat_map[8]; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2aee712e42bb..2567998b692c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -168,7 +168,12 @@ static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu) static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { - switch (map->mode) { + switch (map->logical_mode) { + case KVM_APIC_MODE_SW_DISABLED: + /* Arbitrarily use the flat map so that @cluster isn't NULL. */ + *cluster = map->xapic_flat_map; + *mask = 0; + return true; case KVM_APIC_MODE_X2APIC: { u32 offset = (dest_id >> 16) * 16; u32 max_apic_id = map->max_apic_id; @@ -193,8 +198,10 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, *cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf]; *mask = dest_id & 0xf; return true; + case KVM_APIC_MODE_MAP_DISABLED: + return false; default: - /* Not optimized. */ + WARN_ON_ONCE(1); return false; } } @@ -256,10 +263,12 @@ void kvm_recalculate_apic_map(struct kvm *kvm) goto out; new->max_apic_id = max_id; + new->logical_mode = KVM_APIC_MODE_SW_DISABLED; kvm_for_each_vcpu(i, vcpu, kvm) { struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_lapic **cluster; + enum kvm_apic_logical_mode logical_mode; u16 mask; u32 ldr; u8 xapic_id; @@ -282,7 +291,8 @@ void kvm_recalculate_apic_map(struct kvm *kvm) if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) new->phys_map[xapic_id] = apic; - if (!kvm_apic_sw_enabled(apic)) + if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED || + !kvm_apic_sw_enabled(apic)) continue; ldr = kvm_lapic_get_reg(apic, APIC_LDR); @@ -290,17 +300,31 @@ void kvm_recalculate_apic_map(struct kvm *kvm) continue; if (apic_x2apic_mode(apic)) { - new->mode |= KVM_APIC_MODE_X2APIC; + logical_mode = KVM_APIC_MODE_X2APIC; } else { ldr = GET_APIC_LOGICAL_ID(ldr); if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) - new->mode |= KVM_APIC_MODE_XAPIC_FLAT; + logical_mode = KVM_APIC_MODE_XAPIC_FLAT; else - new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER; + logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER; + } + + /* + * To optimize logical mode delivery, all software-enabled APICs must + * be configured for the same mode. + */ + if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) { + new->logical_mode = logical_mode; + } else if (new->logical_mode != logical_mode) { + new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; + continue; } - if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask)) + if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr, + &cluster, &mask))) { + new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; continue; + } if (mask) cluster[ffs(mask) - 1] = apic; @@ -953,7 +977,7 @@ static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, { if (kvm->arch.x2apic_broadcast_quirk_disabled) { if ((irq->dest_id == APIC_BROADCAST && - map->mode != KVM_APIC_MODE_X2APIC)) + map->logical_mode != KVM_APIC_MODE_X2APIC)) return true; if (irq->dest_id == X2APIC_BROADCAST) return true; -- cgit v1.2.3 From 76e527509d37a15ff1714ddd003384f5f25fd3fc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:52 +0000 Subject: KVM: x86: Skip redundant x2APIC logical mode optimized cluster setup Skip the optimized cluster[] setup for x2APIC logical mode, as KVM reuses the optimized map's phys_map[] and doesn't actually need to insert the target apic into the cluster[]. The LDR is derived from the x2APIC ID, and both are read-only in KVM, thus the vCPU's cluster[ldr] is guaranteed to be the same entry as the vCPU's phys_map[x2apic_id] entry. Skipping the unnecessary setup will allow a future fix for aliased xAPIC logical IDs to simply require that cluster[ldr] is non-NULL, i.e. won't have to special case x2APIC. Alternatively, the future check could allow "cluster[ldr] == apic", but that ends up being terribly confusing because cluster[ldr] is only set at the very end, i.e. it's only possible due to x2APIC's shenanigans. Another alternative would be to send x2APIC down a separate path _after_ the calculation and then assert that all of the above, but the resulting code is rather messy, and it's arguably unnecessary since asserting that the actual LDR matches the expected LDR means that simply testing that interrupts are delivered correctly provides the same guarantees. Reported-by: Suravee Suthikulpanit Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-20-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2567998b692c..fd7726ff95c6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -166,6 +166,11 @@ static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu) return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE; } +static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) +{ + return ((id >> 4) << 16) | (1 << (id & 0xf)); +} + static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { switch (map->logical_mode) { @@ -320,6 +325,18 @@ void kvm_recalculate_apic_map(struct kvm *kvm) continue; } + /* + * In x2APIC mode, the LDR is read-only and derived directly + * from the x2APIC ID, thus is guaranteed to be addressable. + * KVM reuses kvm_apic_map.phys_map to optimize logical mode + * x2APIC interrupts by reversing the LDR calculation to get + * cluster of APICs, i.e. no additional work is required. + */ + if (apic_x2apic_mode(apic)) { + WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(x2apic_id)); + continue; + } + if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))) { new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; @@ -386,11 +403,6 @@ static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val) atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } -static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) -{ - return ((id >> 4) << 16) | (1 << (id & 0xf)); -} - static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = kvm_apic_calc_x2apic_ldr(id); -- cgit v1.2.3 From 2bf934aadcac310810cc4fa120d1b576cae7e9da Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:53 +0000 Subject: KVM: x86: Disable APIC logical map if logical ID covers multiple MDAs Disable the optimized APIC logical map if a logical ID covers multiple MDAs, i.e. if a vCPU has multiple bits set in its ID. In logical mode, events match if "ID & MDA != 0", i.e. creating an entry for only the first bit can cause interrupts to be missed. Note, creating an entry for every bit is also wrong as KVM would generate IPIs for every matching bit. It would be possible to teach KVM to play nice with this edge case, but it is very much an edge case and probably not used in any real world OS, i.e. it's not worth optimizing. Fixes: 1e08ec4a130e ("KVM: optimize apic interrupt delivery") Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20230106011306.85230-21-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fd7726ff95c6..dca87bb6dd1a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -343,8 +343,14 @@ void kvm_recalculate_apic_map(struct kvm *kvm) continue; } - if (mask) - cluster[ffs(mask) - 1] = apic; + if (!mask) + continue; + + if (!is_power_of_2(mask)) { + new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; + continue; + } + cluster[ffs(mask) - 1] = apic; } out: old = rcu_dereference_protected(kvm->arch.apic_map, -- cgit v1.2.3 From 2970052481b9f93e1849f5d4a1065e9fafc8d662 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:54 +0000 Subject: KVM: x86: Disable APIC logical map if vCPUs are aliased in logical mode Disable the optimized APIC logical map if multiple vCPUs are aliased to the same logical ID. Architecturally, all CPUs whose logical ID matches the MDA are supposed to receive the interrupt; overwriting existing map entries can result in missed IPIs. Fixes: 1e08ec4a130e ("KVM: optimize apic interrupt delivery") Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20230106011306.85230-22-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index dca87bb6dd1a..9c0554bae3b1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -346,11 +346,12 @@ void kvm_recalculate_apic_map(struct kvm *kvm) if (!mask) continue; - if (!is_power_of_2(mask)) { + ldr = ffs(mask) - 1; + if (!is_power_of_2(mask) || cluster[ldr]) { new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; continue; } - cluster[ffs(mask) - 1] = apic; + cluster[ldr] = apic; } out: old = rcu_dereference_protected(kvm->arch.apic_map, -- cgit v1.2.3 From 5b84b0291702ea84db2c9e55696cdcdd95f9cf1a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:55 +0000 Subject: KVM: x86: Honor architectural behavior for aliased 8-bit APIC IDs Apply KVM's hotplug hack if and only if userspace has enabled 32-bit IDs for x2APIC. If 32-bit IDs are not enabled, disable the optimized map to honor x86 architectural behavior if multiple vCPUs shared a physical APIC ID. As called out in the changelog that added the hack, all CPUs whose (possibly truncated) APIC ID matches the target are supposed to receive the IPI. KVM intentionally differs from real hardware, because real hardware (Knights Landing) does just "x2apic_id & 0xff" to decide whether to accept the interrupt in xAPIC mode and it can deliver one interrupt to more than one physical destination, e.g. 0x123 to 0x123 and 0x23. Applying the hack even when x2APIC is not fully enabled means KVM doesn't correctly handle scenarios where the guest has aliased xAPIC IDs across multiple vCPUs, as only the vCPU with the lowest vCPU ID will receive any interrupts. It's extremely unlikely any real world guest aliases APIC IDs, or even modifies APIC IDs, but KVM's behavior is arbitrary, e.g. the lowest vCPU ID "wins" regardless of which vCPU is "aliasing" and which vCPU is "normal". Furthermore, the hack is _not_ guaranteed to work! The hack works if and only if the optimized APIC map is successfully allocated. If the map allocation fails (unlikely), KVM will fall back to its unoptimized behavior, which _does_ honor the architectural behavior. Pivot on 32-bit x2APIC IDs being enabled as that is required to take advantage of the hotplug hack (see kvm_apic_state_fixup()), i.e. won't break existing setups unless they are way, way off in the weeds. And an entry in KVM's errata to document the hack. Alternatively, KVM could provide an actual x2APIC quirk and document the hack that way, but there's unlikely to ever be a use case for disabling the quirk. Go the errata route to avoid having to validate a quirk no one cares about. Fixes: 5bd5db385b3e ("KVM: x86: allow hotplug of VCPU with APIC ID over 0xff") Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-23-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/x86/errata.rst | 11 ++++++++ arch/x86/kvm/lapic.c | 50 ++++++++++++++++++++++++++++------- 2 files changed, 52 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/x86/errata.rst b/Documentation/virt/kvm/x86/errata.rst index 410e0aa63493..49a05f24747b 100644 --- a/Documentation/virt/kvm/x86/errata.rst +++ b/Documentation/virt/kvm/x86/errata.rst @@ -37,3 +37,14 @@ Nested virtualization features ------------------------------ TBD + +x2APIC +------ +When KVM_X2APIC_API_USE_32BIT_IDS is enabled, KVM activates a hack/quirk that +allows sending events to a single vCPU using its x2APIC ID even if the target +vCPU has legacy xAPIC enabled, e.g. to bring up hotplugged vCPUs via INIT-SIPI +on VMs with > 255 vCPUs. A side effect of the quirk is that, if multiple vCPUs +have the same physical APIC ID, KVM will deliver events targeting that APIC ID +only to the vCPU with the lowest vCPU ID. If KVM_X2APIC_API_USE_32BIT_IDS is +not enabled, KVM follows x86 architecture when processing interrupts (all vCPUs +matching the target APIC ID receive the interrupt). diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9c0554bae3b1..e9f258de91bd 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -274,10 +274,10 @@ void kvm_recalculate_apic_map(struct kvm *kvm) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_lapic **cluster; enum kvm_apic_logical_mode logical_mode; + u32 x2apic_id, physical_id; u16 mask; u32 ldr; u8 xapic_id; - u32 x2apic_id; if (!kvm_apic_present(vcpu)) continue; @@ -285,16 +285,48 @@ void kvm_recalculate_apic_map(struct kvm *kvm) xapic_id = kvm_xapic_id(apic); x2apic_id = kvm_x2apic_id(apic); - /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */ - if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) && - x2apic_id <= new->max_apic_id) - new->phys_map[x2apic_id] = apic; /* - * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around, - * prevent them from masking VCPUs with APIC ID <= 0xff. + * Apply KVM's hotplug hack if userspace has enable 32-bit APIC + * IDs. Allow sending events to vCPUs by their x2APIC ID even + * if the target vCPU is in legacy xAPIC mode, and silently + * ignore aliased xAPIC IDs (the x2APIC ID is truncated to 8 + * bits, causing IDs > 0xff to wrap and collide). + * + * Honor the architectural (and KVM's non-optimized) behavior + * if userspace has not enabled 32-bit x2APIC IDs. Each APIC + * is supposed to process messages independently. If multiple + * vCPUs have the same effective APIC ID, e.g. due to the + * x2APIC wrap or because the guest manually modified its xAPIC + * IDs, events targeting that ID are supposed to be recognized + * by all vCPUs with said ID. */ - if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) - new->phys_map[xapic_id] = apic; + if (kvm->arch.x2apic_format) { + /* See also kvm_apic_match_physical_addr(). */ + if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) && + x2apic_id <= new->max_apic_id) + new->phys_map[x2apic_id] = apic; + + if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) + new->phys_map[xapic_id] = apic; + } else { + /* + * Disable the optimized map if the physical APIC ID is + * already mapped, i.e. is aliased to multiple vCPUs. + * The optimized map requires a strict 1:1 mapping + * between IDs and vCPUs. + */ + if (apic_x2apic_mode(apic)) + physical_id = x2apic_id; + else + physical_id = xapic_id; + + if (new->phys_map[physical_id]) { + kvfree(new); + new = NULL; + goto out; + } + new->phys_map[physical_id] = apic; + } if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED || !kvm_apic_sw_enabled(apic)) -- cgit v1.2.3 From 5063c41bebac8d18a83ac7b5d54782d73a9ab5f7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:56 +0000 Subject: KVM: x86: Inhibit APICv/AVIC if the optimized physical map is disabled Inhibit APICv/AVIC if the optimized physical map is disabled so that KVM KVM provides consistent APIC behavior if xAPIC IDs are aliased due to vcpu_id being truncated and the x2APIC hotplug hack isn't enabled. If the hotplug hack is disabled, events that are emulated by KVM will follow architectural behavior (all matching vCPUs receive events, even if the "match" is due to truncation), whereas APICv and AVIC will deliver events only to the first matching vCPU, i.e. the vCPU that matches without truncation. Note, the "extra" inhibit is needed because KVM deliberately ignores mismatches due to truncation when applying the APIC_ID_MODIFIED inhibit so that large VMs (>255 vCPUs) can run with APICv/AVIC. Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-24-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/lapic.c | 13 ++++++++++++- arch/x86/kvm/svm/avic.c | 1 + arch/x86/kvm/vmx/vmx.c | 1 + 4 files changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 732421a0ad02..5865bad08a23 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1174,6 +1174,12 @@ enum kvm_apicv_inhibit { */ APICV_INHIBIT_REASON_BLOCKIRQ, + /* + * APICv is disabled because not all vCPUs have a 1:1 mapping between + * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack. + */ + APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED, + /* * For simplicity, the APIC acceleration is inhibited * first time either APIC ID or APIC base are changed by the guest diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e9f258de91bd..297da684c25f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -386,6 +386,16 @@ void kvm_recalculate_apic_map(struct kvm *kvm) cluster[ldr] = apic; } out: + /* + * The optimized map is effectively KVM's internal version of APICv, + * and all unwanted aliasing that results in disabling the optimized + * map also applies to APICv. + */ + if (!new) + kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED); + else + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED); + old = rcu_dereference_protected(kvm->arch.apic_map, lockdep_is_held(&kvm->arch.apic_map_lock)); rcu_assign_pointer(kvm->arch.apic_map, new); @@ -2158,7 +2168,8 @@ static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic) /* * Deliberately truncate the vCPU ID when detecting a modified APIC ID * to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a 32-bit - * value. + * value. If the wrap/truncation results in unwanted aliasing, APICv + * will be inhibited as part of updating KVM's optimized APIC maps. */ if (kvm_xapic_id(apic) == (u8)apic->vcpu->vcpu_id) return; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 76da9f19272e..d1ac19f053ce 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -965,6 +965,7 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) BIT(APICV_INHIBIT_REASON_PIT_REINJ) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | BIT(APICV_INHIBIT_REASON_SEV) | + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8ed14fcfe870..ef84d11a0fe4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8029,6 +8029,7 @@ static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) BIT(APICV_INHIBIT_REASON_ABSENT) | BIT(APICV_INHIBIT_REASON_HYPERV) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); -- cgit v1.2.3 From 9a364857ab4f8d59d417eca88a39ddf9b308237b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:57 +0000 Subject: KVM: SVM: Inhibit AVIC if vCPUs are aliased in logical mode Inhibit SVM's AVIC if multiple vCPUs are aliased to the same logical ID. Architecturally, all CPUs whose logical ID matches the MDA are supposed to receive the interrupt; overwriting existing entries in AVIC's logical=>physical map can result in missed IPIs. Fixes: 18f40c53e10f ("svm: Add VMEXIT handlers for AVIC") Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-25-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/lapic.c | 5 +++++ arch/x86/kvm/svm/avic.c | 3 ++- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5865bad08a23..b41a01b3a4af 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1218,6 +1218,12 @@ enum kvm_apicv_inhibit { * AVIC is disabled because SEV doesn't support it. */ APICV_INHIBIT_REASON_SEV, + + /* + * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1 + * mapping between logical ID and vCPU. + */ + APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED, }; struct kvm_arch { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 297da684c25f..0faf80cdc1be 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -396,6 +396,11 @@ out: else kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED); + if (!new || new->logical_mode == KVM_APIC_MODE_MAP_DISABLED) + kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED); + else + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED); + old = rcu_dereference_protected(kvm->arch.apic_map, lockdep_is_held(&kvm->arch.apic_map_lock)); rcu_assign_pointer(kvm->arch.apic_map, new); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index d1ac19f053ce..1fd473a57159 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -967,7 +967,8 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) BIT(APICV_INHIBIT_REASON_SEV) | BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | - BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | + BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED); return supported & BIT(reason); } -- cgit v1.2.3 From 1ba59a44546797432d4c2fd2854281e3f4514d2d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:58 +0000 Subject: KVM: SVM: Always update local APIC on writes to logical dest register Update the vCPU's local (virtual) APIC on LDR writes even if the write "fails". The APIC needs to recalc the optimized logical map even if the LDR is invalid or zero, e.g. if the guest clears its LDR, the optimized map will be left as is and the vCPU will receive interrupts using its old LDR. Fixes: 18f40c53e10f ("svm: Add VMEXIT handlers for AVIC") Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-26-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 1fd473a57159..144e383a4c5d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -573,7 +573,7 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); } -static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) +static void avic_handle_ldr_update(struct kvm_vcpu *vcpu) { int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); @@ -582,10 +582,10 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) /* AVIC does not support LDR update for x2APIC */ if (apic_x2apic_mode(vcpu->arch.apic)) - return 0; + return; if (ldr == svm->ldr_reg) - return 0; + return; avic_invalidate_logical_id_entry(vcpu); @@ -594,8 +594,6 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) if (!ret) svm->ldr_reg = ldr; - - return ret; } static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) @@ -617,8 +615,7 @@ static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu) switch (offset) { case APIC_LDR: - if (avic_handle_ldr_update(vcpu)) - return 0; + avic_handle_ldr_update(vcpu); break; case APIC_DFR: avic_handle_dfr_update(vcpu); -- cgit v1.2.3 From 4f160b7bd481650b15c46808515c67281492ff1a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:12:59 +0000 Subject: KVM: SVM: Update svm->ldr_reg cache even if LDR is "bad" Update SVM's cache of the LDR even if the new value is "bad". Leaving stale information in the cache can result in KVM missing updates and/or invalidating the wrong entry, e.g. if avic_invalidate_logical_id_entry() is triggered after a different vCPU has "claimed" the old LDR. Fixes: 18f40c53e10f ("svm: Add VMEXIT handlers for AVIC") Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-27-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 144e383a4c5d..5b33f91b701c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -539,7 +539,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) return &logical_apic_id_table[index]; } -static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) +static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) { bool flat; u32 *entry, new_entry; @@ -547,15 +547,13 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT; entry = avic_get_logical_id_entry(vcpu, ldr, flat); if (!entry) - return -EINVAL; + return; new_entry = READ_ONCE(*entry); new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; WRITE_ONCE(*entry, new_entry); - - return 0; } static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) @@ -575,7 +573,6 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) static void avic_handle_ldr_update(struct kvm_vcpu *vcpu) { - int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); u32 id = kvm_xapic_id(vcpu->arch.apic); @@ -589,11 +586,8 @@ static void avic_handle_ldr_update(struct kvm_vcpu *vcpu) avic_invalidate_logical_id_entry(vcpu); - if (ldr) - ret = avic_ldr_write(vcpu, id, ldr); - - if (!ret) - svm->ldr_reg = ldr; + svm->ldr_reg = ldr; + avic_ldr_write(vcpu, id, ldr); } static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 1808c950955dbbf8a0f83c96e720ec700863137e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:13:00 +0000 Subject: KVM: SVM: Require logical ID to be power-of-2 for AVIC entry Do not modify AVIC's logical ID table if the logical ID portion of the LDR is not a power-of-2, i.e. if the LDR has multiple bits set. Taking only the first bit means that KVM will fail to match MDAs that intersect with "higher" bits in the "ID" The "ID" acts as a bitmap, but is referred to as an ID because there's an implicit, unenforced "requirement" that software only set one bit. This edge case is arguably out-of-spec behavior, but KVM cleanly handles it in all other cases, e.g. the optimized logical map (and AVIC!) is also disabled in this scenario. Refactor the code to consolidate the checks, and so that the code looks more like avic_kick_target_vcpus_fast(). Fixes: 18f40c53e10f ("svm: Add VMEXIT handlers for AVIC") Cc: Suravee Suthikulpanit Cc: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-28-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 5b33f91b701c..eb979695e7d8 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -513,26 +513,26 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu) static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) { struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - int index; u32 *logical_apic_id_table; - int dlid = GET_APIC_LOGICAL_ID(ldr); + u32 cluster, index; - if (!dlid) - return NULL; - - if (flat) { /* flat */ - index = ffs(dlid) - 1; - if (index > 7) - return NULL; - } else { /* cluster */ - int cluster = (dlid & 0xf0) >> 4; - int apic = ffs(dlid & 0x0f) - 1; + ldr = GET_APIC_LOGICAL_ID(ldr); - if ((apic < 0) || (apic > 7) || - (cluster >= 0xf)) + if (flat) { + cluster = 0; + } else { + cluster = (ldr >> 4); + if (cluster >= 0xf) return NULL; - index = (cluster << 2) + apic; + ldr &= 0xf; } + if (!ldr || !is_power_of_2(ldr)) + return NULL; + + index = __ffs(ldr); + if (WARN_ON_ONCE(index > 7)) + return NULL; + index += (cluster << 2); logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page); -- cgit v1.2.3 From bbfc7aa62a4a9baff264f1fd91175b6249843984 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:13:01 +0000 Subject: KVM: SVM: Handle multiple logical targets in AVIC kick fastpath Iterate over all target logical IDs in the AVIC kick fastpath instead of bailing if there is more than one target. Now that KVM inhibits AVIC if vCPUs aren't mapped 1:1 with logical IDs, each bit in the destination is guaranteed to match to at most one vCPU, i.e. iterating over the bitmap is guaranteed to kick each valid target exactly once. Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-29-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 110 +++++++++++++++++++++++++++--------------------- 1 file changed, 62 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index eb979695e7d8..2c6737f72bd4 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -327,6 +327,50 @@ static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl) icrl & APIC_VECTOR_MASK); } +static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id, + u32 icrl) +{ + /* + * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID, + * i.e. APIC ID == vCPU ID. + */ + struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id); + + /* Once again, nothing to do if the target vCPU doesn't exist. */ + if (unlikely(!target_vcpu)) + return; + + avic_kick_vcpu(target_vcpu, icrl); +} + +static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table, + u32 logid_index, u32 icrl) +{ + u32 physical_id; + + if (avic_logical_id_table) { + u32 logid_entry = avic_logical_id_table[logid_index]; + + /* Nothing to do if the logical destination is invalid. */ + if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK))) + return; + + physical_id = logid_entry & + AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; + } else { + /* + * For x2APIC, the logical APIC ID is a read-only value that is + * derived from the x2APIC ID, thus the x2APIC ID can be found + * by reversing the calculation (stored in logid_index). Note, + * bits 31:20 of the x2APIC ID aren't propagated to the logical + * ID, but KVM limits the x2APIC ID limited to KVM_MAX_VCPU_IDS. + */ + physical_id = logid_index; + } + + avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl); +} + /* * A fast-path version of avic_kick_target_vcpus(), which attempts to match * destination APIC ID to vCPU without looping through all vCPUs. @@ -334,11 +378,10 @@ static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl) static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source, u32 icrl, u32 icrh, u32 index) { - u32 l1_physical_id, dest; - struct kvm_vcpu *target_vcpu; int dest_mode = icrl & APIC_DEST_MASK; int shorthand = icrl & APIC_SHORT_MASK; struct kvm_svm *kvm_svm = to_kvm_svm(kvm); + u32 dest; if (shorthand != APIC_DEST_NOSHORT) return -EINVAL; @@ -355,14 +398,14 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST) return -EINVAL; - l1_physical_id = dest; - - if (WARN_ON_ONCE(l1_physical_id != index)) + if (WARN_ON_ONCE(dest != index)) return -EINVAL; + avic_kick_vcpu_by_physical_id(kvm, dest, icrl); } else { - u32 bitmap, cluster; - int logid_index; + u32 *avic_logical_id_table; + unsigned long bitmap, i; + u32 cluster; if (apic_x2apic_mode(source)) { /* 16 bit dest mask, 16 bit cluster id */ @@ -382,50 +425,21 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (unlikely(!bitmap)) return 0; - if (!is_power_of_2(bitmap)) - /* multiple logical destinations, use slow path */ - return -EINVAL; - - logid_index = cluster + __ffs(bitmap); - - if (apic_x2apic_mode(source)) { - /* - * For x2APIC, the logical APIC ID is a read-only value - * that is derived from the x2APIC ID, thus the x2APIC - * ID can be found by reversing the calculation (done - * above). Note, bits 31:20 of the x2APIC ID are not - * propagated to the logical ID, but KVM limits the - * x2APIC ID limited to KVM_MAX_VCPU_IDS. - */ - l1_physical_id = logid_index; - } else { - u32 *avic_logical_id_table = - page_address(kvm_svm->avic_logical_id_table_page); - - u32 logid_entry = avic_logical_id_table[logid_index]; - - if (WARN_ON_ONCE(index != logid_index)) - return -EINVAL; - - /* Nothing to do if the logical destination is invalid. */ - if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK))) - return 0; + if (apic_x2apic_mode(source)) + avic_logical_id_table = NULL; + else + avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); - l1_physical_id = logid_entry & - AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; - } + /* + * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical + * IDs, thus each bit in the destination is guaranteed to map + * to at most one vCPU. + */ + for_each_set_bit(i, &bitmap, 16) + avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table, + cluster + i, icrl); } - /* - * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID, - * i.e. APIC ID == vCPU ID. Once again, nothing to do if the target - * vCPU doesn't exist. - */ - target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id); - if (unlikely(!target_vcpu)) - return 0; - - avic_kick_vcpu(target_vcpu, icrl); return 0; } -- cgit v1.2.3 From a790e338c7c4971f83e680e59a91d1b91263c8f7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:13:02 +0000 Subject: KVM: SVM: Ignore writes to Remote Read Data on AVIC write traps Drop writes to APIC_RRR, a.k.a. Remote Read Data Register, on AVIC unaccelerated write traps. The register is read-only and isn't emulated by KVM. Sending the register through kvm_apic_write_nodecode() will result in screaming when x2APIC is enabled due to the unexpected failure to retrieve the MSR (KVM expects that only "legal" accesses will trap). Fixes: 4d1d7942e36a ("KVM: SVM: Introduce logic to (de)activate x2AVIC mode") Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20230106011306.85230-30-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 2c6737f72bd4..ff08732469cb 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -628,6 +628,9 @@ static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu) case APIC_DFR: avic_handle_dfr_update(vcpu); break; + case APIC_RRR: + /* Ignore writes to Read Remote Data, it's read-only. */ + return 1; default: break; } -- cgit v1.2.3 From e2ed3e64a2bd2493469bec0ad205b594a98341a6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:13:03 +0000 Subject: Revert "KVM: SVM: Do not throw warning when calling avic_vcpu_load on a running vcpu" Turns out that some warnings exist for good reasons. Restore the warning in avic_vcpu_load() that guards against calling avic_vcpu_load() on a running vCPU now that KVM avoids doing so when switching between x2APIC and xAPIC. The entire point of the WARN is to highlight that KVM should not be reloading an AVIC. Opportunistically convert the WARN_ON() to WARN_ON_ONCE() to avoid spamming the kernel if it does fire. This reverts commit c0caeee65af3944b7b8abbf566e7cc1fae15c775. Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-31-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index ff08732469cb..80f346b79c62 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1034,6 +1034,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) return; entry = READ_ONCE(*(svm->avic_physical_id_cache)); + WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); -- cgit v1.2.3 From b3f257a846960d06bdaa893dbee100189fbf2234 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:13:04 +0000 Subject: KVM: x86: Track required APICv inhibits with variable, not callback Track the per-vendor required APICv inhibits with a variable instead of calling into vendor code every time KVM wants to query the set of required inhibits. The required inhibits are a property of the vendor's virtualization architecture, i.e. are 100% static. Using a variable allows the compiler to inline the check, e.g. generate a single-uop TEST+Jcc, and thus eliminates any desire to avoid checking inhibits for performance reasons. No functional change intended. Reviewed-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-32-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 1 - arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/avic.c | 19 ------------------- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/svm/svm.h | 16 +++++++++++++++- arch/x86/kvm/vmx/vmx.c | 24 +++++++++++------------- arch/x86/kvm/x86.c | 2 +- 7 files changed, 29 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index abccd51dcfca..84f43caef9b7 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -76,7 +76,6 @@ KVM_X86_OP(set_nmi_mask) KVM_X86_OP(enable_nmi_window) KVM_X86_OP(enable_irq_window) KVM_X86_OP_OPTIONAL(update_cr8_intercept) -KVM_X86_OP(check_apicv_inhibit_reasons) KVM_X86_OP(refresh_apicv_exec_ctrl) KVM_X86_OP_OPTIONAL(hwapic_irr_update) KVM_X86_OP_OPTIONAL(hwapic_isr_update) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b41a01b3a4af..7ca854714ccd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1623,6 +1623,7 @@ struct kvm_x86_ops { void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); + const unsigned long required_apicv_inhibits; bool allow_apicv_in_x2apic_without_x2apic_virtualization; void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 80f346b79c62..14677bc31b83 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -963,25 +963,6 @@ out: return ret; } -bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) -{ - ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | - BIT(APICV_INHIBIT_REASON_ABSENT) | - BIT(APICV_INHIBIT_REASON_HYPERV) | - BIT(APICV_INHIBIT_REASON_NESTED) | - BIT(APICV_INHIBIT_REASON_IRQWIN) | - BIT(APICV_INHIBIT_REASON_PIT_REINJ) | - BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | - BIT(APICV_INHIBIT_REASON_SEV) | - BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | - BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | - BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | - BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED); - - return supported & BIT(reason); -} - - static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9f172f2de195..f2453df77727 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4773,8 +4773,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .update_cr8_intercept = svm_update_cr8_intercept, .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, - .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, .apicv_post_state_restore = avic_apicv_post_state_restore, + .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS, .get_exit_info = svm_get_exit_info, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 546825c82490..41eabb098b13 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -621,6 +621,21 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb); extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ +#define AVIC_REQUIRED_APICV_INHIBITS \ +( \ + BIT(APICV_INHIBIT_REASON_DISABLE) | \ + BIT(APICV_INHIBIT_REASON_ABSENT) | \ + BIT(APICV_INHIBIT_REASON_HYPERV) | \ + BIT(APICV_INHIBIT_REASON_NESTED) | \ + BIT(APICV_INHIBIT_REASON_IRQWIN) | \ + BIT(APICV_INHIBIT_REASON_PIT_REINJ) | \ + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ + BIT(APICV_INHIBIT_REASON_SEV) | \ + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \ + BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \ +) bool avic_hardware_setup(struct kvm_x86_ops *ops); int avic_ga_log_notifier(u32 ga_tag); @@ -634,7 +649,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); -bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason); int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ef84d11a0fe4..ad2ac66ef32e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8023,18 +8023,16 @@ static void vmx_hardware_unsetup(void) free_kvm_area(); } -static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) -{ - ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | - BIT(APICV_INHIBIT_REASON_ABSENT) | - BIT(APICV_INHIBIT_REASON_HYPERV) | - BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | - BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | - BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | - BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); - - return supported & BIT(reason); -} +#define VMX_REQUIRED_APICV_INHIBITS \ +( \ + BIT(APICV_INHIBIT_REASON_DISABLE)| \ + BIT(APICV_INHIBIT_REASON_ABSENT) | \ + BIT(APICV_INHIBIT_REASON_HYPERV) | \ + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \ +) static void vmx_vm_destroy(struct kvm *kvm) { @@ -8118,7 +8116,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, .load_eoi_exitmap = vmx_load_eoi_exitmap, .apicv_post_state_restore = vmx_apicv_post_state_restore, - .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons, + .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1abe3f1e821c..5becce5bd45a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10112,7 +10112,7 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, lockdep_assert_held_write(&kvm->arch.apicv_update_lock); - if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(reason)) + if (!(kvm_x86_ops.required_apicv_inhibits & BIT(reason))) return; old = new = kvm->arch.apicv_inhibit_reasons; -- cgit v1.2.3 From d471bd853d38117c90ec02ecc51b3ca731674745 Mon Sep 17 00:00:00 2001 From: Greg Edwards Date: Fri, 6 Jan 2023 01:13:05 +0000 Subject: KVM: x86: Allow APICv APIC ID inhibit to be cleared Legacy kernels prior to commit 4399c03c6780 ("x86/apic: Remove verify_local_APIC()") write the APIC ID of the boot CPU twice to verify a functioning local APIC. This results in APIC acceleration inhibited on these kernels for reason APICV_INHIBIT_REASON_APIC_ID_MODIFIED. Allow the APICV_INHIBIT_REASON_APIC_ID_MODIFIED inhibit reason to be cleared if/when all APICs in xAPIC mode set their APIC ID back to the expected vcpu_id value. Fold the functionality previously in kvm_lapic_xapic_id_updated() into kvm_recalculate_apic_map(), as this allows examining all APICs in one pass. Fixes: 3743c2f02517 ("KVM: x86: inhibit APICv/AVIC on changes to APIC ID or APIC base") Signed-off-by: Greg Edwards Link: https://lore.kernel.org/r/20221117183247.94314-1-gedwards@ddn.com Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-33-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0faf80cdc1be..856e81a2dc64 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -236,6 +236,7 @@ void kvm_recalculate_apic_map(struct kvm *kvm) struct kvm_vcpu *vcpu; unsigned long i; u32 max_id = 255; /* enough space for any xAPIC ID */ + bool xapic_id_mismatch = false; /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */ if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) @@ -285,6 +286,15 @@ void kvm_recalculate_apic_map(struct kvm *kvm) xapic_id = kvm_xapic_id(apic); x2apic_id = kvm_x2apic_id(apic); + /* + * Deliberately truncate the vCPU ID when detecting a mismatched + * APIC ID to avoid false positives if the vCPU ID, i.e. x2APIC + * ID, is a 32-bit value. Any unwanted aliasing due to + * truncation results will be detected below. + */ + if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id) + xapic_id_mismatch = true; + /* * Apply KVM's hotplug hack if userspace has enable 32-bit APIC * IDs. Allow sending events to vCPUs by their x2APIC ID even @@ -401,6 +411,11 @@ out: else kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED); + if (xapic_id_mismatch) + kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); + else + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); + old = rcu_dereference_protected(kvm->arch.apic_map, lockdep_is_held(&kvm->arch.apic_map_lock)); rcu_assign_pointer(kvm->arch.apic_map, new); @@ -2160,28 +2175,6 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) } } -static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic) -{ - struct kvm *kvm = apic->vcpu->kvm; - - if (!kvm_apic_hw_enabled(apic)) - return; - - if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm)) - return; - - /* - * Deliberately truncate the vCPU ID when detecting a modified APIC ID - * to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a 32-bit - * value. If the wrap/truncation results in unwanted aliasing, APICv - * will be inhibited as part of updating KVM's optimized APIC maps. - */ - if (kvm_xapic_id(apic) == (u8)apic->vcpu->vcpu_id) - return; - - kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); -} - static int get_lvt_index(u32 reg) { if (reg == APIC_LVTCMCI) @@ -2202,7 +2195,6 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) { kvm_apic_set_xapic_id(apic, val >> 24); - kvm_lapic_xapic_id_updated(apic); } else { ret = 1; } @@ -2923,9 +2915,6 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) } memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s)); - if (!apic_x2apic_mode(apic)) - kvm_lapic_xapic_id_updated(apic); - atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); kvm_recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); -- cgit v1.2.3 From 72c70ceeaf59330b1d63380add768bdbdeb24b7c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Jan 2023 01:13:06 +0000 Subject: KVM: x86: Add helpers to recalc physical vs. logical optimized APIC maps Move the guts of kvm_recalculate_apic_map()'s main loop to two separate helpers to handle recalculating the physical and logical pieces of the optimized map. Having 100+ lines of code in the for-loop makes it hard to understand what is being calculated where. No functional change intended. Suggested-by: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20230106011306.85230-34-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 250 +++++++++++++++++++++++++++------------------------ 1 file changed, 133 insertions(+), 117 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 856e81a2dc64..669ea125b7e2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -218,6 +218,134 @@ static void kvm_apic_map_free(struct rcu_head *rcu) kvfree(map); } +static int kvm_recalculate_phys_map(struct kvm_apic_map *new, + struct kvm_vcpu *vcpu, + bool *xapic_id_mismatch) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 x2apic_id = kvm_x2apic_id(apic); + u32 xapic_id = kvm_xapic_id(apic); + u32 physical_id; + + /* + * Deliberately truncate the vCPU ID when detecting a mismatched APIC + * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a + * 32-bit value. Any unwanted aliasing due to truncation results will + * be detected below. + */ + if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id) + *xapic_id_mismatch = true; + + /* + * Apply KVM's hotplug hack if userspace has enable 32-bit APIC IDs. + * Allow sending events to vCPUs by their x2APIC ID even if the target + * vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs + * (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap + * and collide). + * + * Honor the architectural (and KVM's non-optimized) behavior if + * userspace has not enabled 32-bit x2APIC IDs. Each APIC is supposed + * to process messages independently. If multiple vCPUs have the same + * effective APIC ID, e.g. due to the x2APIC wrap or because the guest + * manually modified its xAPIC IDs, events targeting that ID are + * supposed to be recognized by all vCPUs with said ID. + */ + if (vcpu->kvm->arch.x2apic_format) { + /* See also kvm_apic_match_physical_addr(). */ + if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) && + x2apic_id <= new->max_apic_id) + new->phys_map[x2apic_id] = apic; + + if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) + new->phys_map[xapic_id] = apic; + } else { + /* + * Disable the optimized map if the physical APIC ID is already + * mapped, i.e. is aliased to multiple vCPUs. The optimized + * map requires a strict 1:1 mapping between IDs and vCPUs. + */ + if (apic_x2apic_mode(apic)) + physical_id = x2apic_id; + else + physical_id = xapic_id; + + if (new->phys_map[physical_id]) + return -EINVAL; + + new->phys_map[physical_id] = apic; + } + + return 0; +} + +static void kvm_recalculate_logical_map(struct kvm_apic_map *new, + struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + enum kvm_apic_logical_mode logical_mode; + struct kvm_lapic **cluster; + u16 mask; + u32 ldr; + + if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED) + return; + + if (!kvm_apic_sw_enabled(apic)) + return; + + ldr = kvm_lapic_get_reg(apic, APIC_LDR); + if (!ldr) + return; + + if (apic_x2apic_mode(apic)) { + logical_mode = KVM_APIC_MODE_X2APIC; + } else { + ldr = GET_APIC_LOGICAL_ID(ldr); + if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) + logical_mode = KVM_APIC_MODE_XAPIC_FLAT; + else + logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER; + } + + /* + * To optimize logical mode delivery, all software-enabled APICs must + * be configured for the same mode. + */ + if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) { + new->logical_mode = logical_mode; + } else if (new->logical_mode != logical_mode) { + new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; + return; + } + + /* + * In x2APIC mode, the LDR is read-only and derived directly from the + * x2APIC ID, thus is guaranteed to be addressable. KVM reuses + * kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by + * reversing the LDR calculation to get cluster of APICs, i.e. no + * additional work is required. + */ + if (apic_x2apic_mode(apic)) { + WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic))); + return; + } + + if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr, + &cluster, &mask))) { + new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; + return; + } + + if (!mask) + return; + + ldr = ffs(mask) - 1; + if (!is_power_of_2(mask) || cluster[ldr]) + new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; + else + cluster[ldr] = apic; +} + /* * CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock. * @@ -272,128 +400,16 @@ void kvm_recalculate_apic_map(struct kvm *kvm) new->logical_mode = KVM_APIC_MODE_SW_DISABLED; kvm_for_each_vcpu(i, vcpu, kvm) { - struct kvm_lapic *apic = vcpu->arch.apic; - struct kvm_lapic **cluster; - enum kvm_apic_logical_mode logical_mode; - u32 x2apic_id, physical_id; - u16 mask; - u32 ldr; - u8 xapic_id; - if (!kvm_apic_present(vcpu)) continue; - xapic_id = kvm_xapic_id(apic); - x2apic_id = kvm_x2apic_id(apic); - - /* - * Deliberately truncate the vCPU ID when detecting a mismatched - * APIC ID to avoid false positives if the vCPU ID, i.e. x2APIC - * ID, is a 32-bit value. Any unwanted aliasing due to - * truncation results will be detected below. - */ - if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id) - xapic_id_mismatch = true; - - /* - * Apply KVM's hotplug hack if userspace has enable 32-bit APIC - * IDs. Allow sending events to vCPUs by their x2APIC ID even - * if the target vCPU is in legacy xAPIC mode, and silently - * ignore aliased xAPIC IDs (the x2APIC ID is truncated to 8 - * bits, causing IDs > 0xff to wrap and collide). - * - * Honor the architectural (and KVM's non-optimized) behavior - * if userspace has not enabled 32-bit x2APIC IDs. Each APIC - * is supposed to process messages independently. If multiple - * vCPUs have the same effective APIC ID, e.g. due to the - * x2APIC wrap or because the guest manually modified its xAPIC - * IDs, events targeting that ID are supposed to be recognized - * by all vCPUs with said ID. - */ - if (kvm->arch.x2apic_format) { - /* See also kvm_apic_match_physical_addr(). */ - if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) && - x2apic_id <= new->max_apic_id) - new->phys_map[x2apic_id] = apic; - - if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) - new->phys_map[xapic_id] = apic; - } else { - /* - * Disable the optimized map if the physical APIC ID is - * already mapped, i.e. is aliased to multiple vCPUs. - * The optimized map requires a strict 1:1 mapping - * between IDs and vCPUs. - */ - if (apic_x2apic_mode(apic)) - physical_id = x2apic_id; - else - physical_id = xapic_id; - - if (new->phys_map[physical_id]) { - kvfree(new); - new = NULL; - goto out; - } - new->phys_map[physical_id] = apic; + if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) { + kvfree(new); + new = NULL; + goto out; } - if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED || - !kvm_apic_sw_enabled(apic)) - continue; - - ldr = kvm_lapic_get_reg(apic, APIC_LDR); - if (!ldr) - continue; - - if (apic_x2apic_mode(apic)) { - logical_mode = KVM_APIC_MODE_X2APIC; - } else { - ldr = GET_APIC_LOGICAL_ID(ldr); - if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) - logical_mode = KVM_APIC_MODE_XAPIC_FLAT; - else - logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER; - } - - /* - * To optimize logical mode delivery, all software-enabled APICs must - * be configured for the same mode. - */ - if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) { - new->logical_mode = logical_mode; - } else if (new->logical_mode != logical_mode) { - new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; - continue; - } - - /* - * In x2APIC mode, the LDR is read-only and derived directly - * from the x2APIC ID, thus is guaranteed to be addressable. - * KVM reuses kvm_apic_map.phys_map to optimize logical mode - * x2APIC interrupts by reversing the LDR calculation to get - * cluster of APICs, i.e. no additional work is required. - */ - if (apic_x2apic_mode(apic)) { - WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(x2apic_id)); - continue; - } - - if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr, - &cluster, &mask))) { - new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; - continue; - } - - if (!mask) - continue; - - ldr = ffs(mask) - 1; - if (!is_power_of_2(mask) || cluster[ldr]) { - new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; - continue; - } - cluster[ldr] = apic; + kvm_recalculate_logical_map(new, vcpu); } out: /* -- cgit v1.2.3 From cecafc0a830f7ea89bc11c644e2841f603fcc4e6 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Thu, 5 Jan 2023 21:01:27 +0800 Subject: KVM: MMU: Make the definition of 'INVALID_GPA' common KVM already has a 'GPA_INVALID' defined as (~(gpa_t)0) in kvm_types.h, and it is used by ARM code. We do not need another definition of 'INVALID_GPA' for X86 specifically. Instead of using the common 'GPA_INVALID' for X86, replace it with 'INVALID_GPA', and change the users of 'GPA_INVALID' so that the diff can be smaller. Also because the name 'INVALID_GPA' tells the user we are using an invalid GPA, while the name 'GPA_INVALID' is emphasizing the GPA is an invalid one. No functional change intended. Signed-off-by: Yu Zhang Reviewed-by: Paul Durrant Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20230105130127.866171-1-yu.c.zhang@linux.intel.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 4 ++-- arch/arm64/kvm/hypercalls.c | 2 +- arch/arm64/kvm/pvtime.c | 8 ++++---- arch/x86/include/asm/kvm_host.h | 2 -- include/linux/kvm_types.h | 2 +- 5 files changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 35a159d131b5..37766e57c8f2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -916,12 +916,12 @@ void kvm_arm_vmid_clear_active(void); static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch) { - vcpu_arch->steal.base = GPA_INVALID; + vcpu_arch->steal.base = INVALID_GPA; } static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch) { - return (vcpu_arch->steal.base != GPA_INVALID); + return (vcpu_arch->steal.base != INVALID_GPA); } void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome); diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index c9f401fa01a9..64c086c02c60 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -198,7 +198,7 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) break; case ARM_SMCCC_HV_PV_TIME_ST: gpa = kvm_init_stolen_time(vcpu); - if (gpa != GPA_INVALID) + if (gpa != INVALID_GPA) val[0] = gpa; break; case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID: diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index 78a09f7a6637..4ceabaa4c30b 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -19,7 +19,7 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu) u64 steal = 0; int idx; - if (base == GPA_INVALID) + if (base == INVALID_GPA) return; idx = srcu_read_lock(&kvm->srcu); @@ -40,7 +40,7 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu) switch (feature) { case ARM_SMCCC_HV_PV_TIME_FEATURES: case ARM_SMCCC_HV_PV_TIME_ST: - if (vcpu->arch.steal.base != GPA_INVALID) + if (vcpu->arch.steal.base != INVALID_GPA) val = SMCCC_RET_SUCCESS; break; } @@ -54,7 +54,7 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; u64 base = vcpu->arch.steal.base; - if (base == GPA_INVALID) + if (base == INVALID_GPA) return base; /* @@ -89,7 +89,7 @@ int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, return -EFAULT; if (!IS_ALIGNED(ipa, 64)) return -EINVAL; - if (vcpu->arch.steal.base != GPA_INVALID) + if (vcpu->arch.steal.base != INVALID_GPA) return -EEXIST; /* Check the address is in a valid memslot */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f35f1ff4427b..46e50cb6c9ca 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -134,8 +134,6 @@ #define INVALID_PAGE (~(hpa_t)0) #define VALID_PAGE(x) ((x) != INVALID_PAGE) -#define INVALID_GPA (~(gpa_t)0) - /* KVM Hugepage definitions for x86 */ #define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G #define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1) diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 76de36e56cdf..2728d49bbdf6 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -40,7 +40,7 @@ typedef unsigned long gva_t; typedef u64 gpa_t; typedef u64 gfn_t; -#define GPA_INVALID (~(gpa_t)0) +#define INVALID_GPA (~(gpa_t)0) typedef unsigned long hva_t; typedef u64 hpa_t; -- cgit v1.2.3 From 85e64d09f7934166b77dfbe42eec600c93703cb3 Mon Sep 17 00:00:00 2001 From: zhang songyi Date: Wed, 23 Nov 2022 17:04:45 +0800 Subject: KVM: x86: remove redundant ret variable Return value from apic_get_tmcct() directly instead of taking this in another redundant variable. Signed-off-by: zhang songyi Link: https://lore.kernel.org/r/202211231704457807160@zte.com.cn Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7cf4eebc9bcc..08450b3e7040 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1487,7 +1487,6 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) { ktime_t remaining, now; s64 ns; - u32 tmcct; ASSERT(apic != NULL); @@ -1502,10 +1501,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) remaining = 0; ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period); - tmcct = div64_u64(ns, - (APIC_BUS_CYCLE_NS * apic->divide_count)); - - return tmcct; + return div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->divide_count)); } static void __report_tpr_access(struct kvm_lapic *apic, bool write) -- cgit v1.2.3 From ba5838abb05334e4abfdff1490585c7f365e0424 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 7 Jan 2023 01:10:20 +0000 Subject: KVM: x86: Inject #GP if WRMSR sets reserved bits in APIC Self-IPI Inject a #GP if the guest attempts to set reserved bits in the x2APIC-only Self-IPI register. Bits 7:0 hold the vector, all other bits are reserved. Reported-by: Marc Orr Cc: Ben Gardon Cc: Venkatesh Srinivas Cc: stable@vger.kernel.org Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20230107011025.565472-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 08450b3e7040..9aca006b2d22 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2324,10 +2324,14 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_SELF_IPI: - if (apic_x2apic_mode(apic)) - kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0); - else + /* + * Self-IPI exists only when x2APIC is enabled. Bits 7:0 hold + * the vector, everything else is reserved. + */ + if (!apic_x2apic_mode(apic) || (val & ~APIC_VECTOR_MASK)) ret = 1; + else + kvm_apic_send_ipi(apic, APIC_DEST_SELF | val, 0); break; default: ret = 1; -- cgit v1.2.3 From ab52be1b310bcb39e6745d34a8f0e8475d67381a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 7 Jan 2023 01:10:21 +0000 Subject: KVM: x86: Inject #GP on x2APIC WRMSR that sets reserved bits 63:32 Reject attempts to set bits 63:32 for 32-bit x2APIC registers, i.e. all x2APIC registers except ICR. Per Intel's SDM: Non-zero writes (by WRMSR instruction) to reserved bits to these registers will raise a general protection fault exception Opportunistically fix a typo in a nearby comment. Reported-by: Marc Orr Cc: stable@vger.kernel.org Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20230107011025.565472-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9aca006b2d22..814b65106057 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -3114,13 +3114,17 @@ static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data) { /* - * ICR is a 64-bit register in x2APIC mode (and Hyper'v PV vAPIC) and + * ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and * can be written as such, all other registers remain accessible only * through 32-bit reads/writes. */ if (reg == APIC_ICR) return kvm_x2apic_icr_write(apic, data); + /* Bits 63:32 are reserved in all other registers. */ + if (data >> 32) + return 1; + return kvm_lapic_reg_write(apic, reg, (u32)data); } -- cgit v1.2.3 From b223649576fc21bec46260805c2cd70e1cb3b8e8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 7 Jan 2023 01:10:22 +0000 Subject: KVM: x86: Mark x2APIC DFR reg as non-existent for x2APIC Mark APIC_DFR as being invalid/non-existent in x2APIC mode instead of handling it as a one-off check in kvm_x2apic_msr_read(). This will allow reusing "valid_reg_mask" to generate VMX's interception bitmaps for x2APIC. Handling DFR in the common read path may also fix the Hyper-V PV MSR interface, if that can coexist with x2APIC. Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20230107011025.565472-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 814b65106057..91e6f958d4b2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1573,7 +1573,6 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, APIC_REG_MASK(APIC_TASKPRI) | APIC_REG_MASK(APIC_PROCPRI) | APIC_REG_MASK(APIC_LDR) | - APIC_REG_MASK(APIC_DFR) | APIC_REG_MASK(APIC_SPIV) | APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) | APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) | @@ -1594,12 +1593,13 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI); /* - * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR - * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be - * manually handled by the caller. + * ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. WARN if KVM + * reads ICR in x2APIC mode as it's an 8-byte register in x2APIC and + * needs to be manually handled by the caller. */ if (!apic_x2apic_mode(apic)) valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) | + APIC_REG_MASK(APIC_DFR) | APIC_REG_MASK(APIC_ICR2); else WARN_ON_ONCE(offset == APIC_ICR); @@ -3147,9 +3147,6 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; - if (reg == APIC_DFR) - return 1; - return kvm_lapic_msr_read(apic, reg, data); } -- cgit v1.2.3 From b5fcc59be72a76b5cf7bcc6d4aba6cdb14557d44 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 7 Jan 2023 01:10:23 +0000 Subject: KVM: x86: Split out logic to generate "readable" APIC regs mask to helper Move the generation of the readable APIC regs bitmask to a standalone helper so that VMX can use the mask for its MSR interception bitmaps. No functional change intended. Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20230107011025.565472-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 34 +++++++++++++++++++++------------- arch/x86/kvm/lapic.h | 2 ++ 2 files changed, 23 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 91e6f958d4b2..d2ad5e8b63a4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1561,12 +1561,9 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) #define APIC_REGS_MASK(first, count) \ (APIC_REG_MASK(first) * ((1ull << (count)) - 1)) -static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, - void *data) +u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic) { - unsigned char alignment = offset & 0xf; - u32 result; - /* this bitmask has a bit cleared for each reserved register */ + /* Leave bits '0' for reserved and write-only registers. */ u64 valid_reg_mask = APIC_REG_MASK(APIC_ID) | APIC_REG_MASK(APIC_LVR) | @@ -1592,22 +1589,33 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, if (kvm_lapic_lvt_supported(apic, LVT_CMCI)) valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI); - /* - * ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. WARN if KVM - * reads ICR in x2APIC mode as it's an 8-byte register in x2APIC and - * needs to be manually handled by the caller. - */ + /* ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. */ if (!apic_x2apic_mode(apic)) valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) | APIC_REG_MASK(APIC_DFR) | APIC_REG_MASK(APIC_ICR2); - else - WARN_ON_ONCE(offset == APIC_ICR); + + return valid_reg_mask; +} +EXPORT_SYMBOL_GPL(kvm_lapic_readable_reg_mask); + +static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, + void *data) +{ + unsigned char alignment = offset & 0xf; + u32 result; + + /* + * WARN if KVM reads ICR in x2APIC mode, as it's an 8-byte register in + * x2APIC and needs to be manually handled by the caller. + */ + WARN_ON_ONCE(apic_x2apic_mode(apic) && offset == APIC_ICR); if (alignment + len > 4) return 1; - if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) + if (offset > 0x3f0 || + !(kvm_lapic_readable_reg_mask(apic) & APIC_REG_MASK(offset))) return 1; result = __apic_read(apic, offset & ~0xf); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index df316ede7546..0a0ea4b5dd8c 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -146,6 +146,8 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); void kvm_lapic_exit(void); +u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic); + #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) -- cgit v1.2.3 From c39857ce8daaaa429ccae2a393301ffeed67e235 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 7 Jan 2023 01:10:24 +0000 Subject: KVM: VMX: Always intercept accesses to unsupported "extended" x2APIC regs Don't clear the "read" bits for x2APIC registers above SELF_IPI (APIC regs 0x400 - 0xff0, MSRs 0x840 - 0x8ff). KVM doesn't emulate registers in that space (there are a smattering of AMD-only extensions) and so should intercept reads in order to inject #GP. When APICv is fully enabled, Intel hardware doesn't validate the registers on RDMSR and instead blindly retrieves data from the vAPIC page, i.e. it's software's responsibility to intercept reads to non-existent MSRs. Fixes: 8d14695f9542 ("x86, apicv: add virtual x2apic support") Reviewed-by: Maxim Levitsky Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20230107011025.565472-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c788aa382611..82c61c16f8f5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4018,26 +4018,17 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) vmx_set_msr_bitmap_write(msr_bitmap, msr); } -static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) -{ - unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; - unsigned long read_intercept; - int msr; - - read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; - - for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned int read_idx = msr / BITS_PER_LONG; - unsigned int write_idx = read_idx + (0x800 / sizeof(long)); - - msr_bitmap[read_idx] = read_intercept; - msr_bitmap[write_idx] = ~0ul; - } -} - static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) { + /* + * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves + * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0, + * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits. + */ + const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG; + const int write_idx = read_idx + (0x800 / sizeof(u64)); struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap; u8 mode; if (!cpu_has_vmx_msr_bitmap()) @@ -4058,7 +4049,18 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) vmx->x2apic_msr_bitmap_mode = mode; - vmx_reset_x2apic_msrs(vcpu, mode); + /* + * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended + * registers (0x840 and above) intercepted, KVM doesn't support them. + * Intercept all writes by default and poke holes as needed. Pass + * through all reads by default in x2APIC+APICv mode, as all registers + * except the current timer count are passed through for read. + */ + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) + msr_bitmap[read_idx] = 0; + else + msr_bitmap[read_idx] = ~0ull; + msr_bitmap[write_idx] = ~0ull; /* * TPR reads and writes can be virtualized even if virtual interrupt -- cgit v1.2.3 From 02efd818a6c095bcbf422f47fccc4ef27d53f344 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 7 Jan 2023 01:10:25 +0000 Subject: KVM: VMX: Intercept reads to invalid and write-only x2APIC registers Intercept reads to invalid (non-existent) and write-only x2APIC registers when configuring VMX's MSR bitmaps for x2APIC+APICv. When APICv is fully enabled, Intel hardware doesn't validate the registers on RDMSR and instead blindly retrieves data from the vAPIC page, i.e. it's software's responsibility to intercept reads to non-existent and write-only MSRs. Fixes: 8d14695f9542 ("x86, apicv: add virtual x2apic support") Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20230107011025.565472-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 82c61c16f8f5..1be2bc7185be 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4031,7 +4031,7 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap; u8 mode; - if (!cpu_has_vmx_msr_bitmap()) + if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu))) return; if (cpu_has_secondary_exec_ctrls() && @@ -4053,11 +4053,11 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended * registers (0x840 and above) intercepted, KVM doesn't support them. * Intercept all writes by default and poke holes as needed. Pass - * through all reads by default in x2APIC+APICv mode, as all registers - * except the current timer count are passed through for read. + * through reads for all valid registers by default in x2APIC+APICv + * mode, only the current timer count needs on-demand emulation by KVM. */ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) - msr_bitmap[read_idx] = 0; + msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); else msr_bitmap[read_idx] = ~0ull; msr_bitmap[write_idx] = ~0ull; -- cgit v1.2.3 From 8e6ed96cdd5001c55fccc80a17f651741c1ca7d2 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 6 Jan 2023 12:06:25 +0800 Subject: KVM: x86: fire timer when it is migrated and expired, and in oneshot mode when the vCPU was migrated, if its timer is expired, KVM _should_ fire the timer ASAP, zeroing the deadline here will cause the timer to immediately fire on the destination Cc: Sean Christopherson Cc: Peter Shier Cc: Jim Mattson Cc: Wanpeng Li Cc: Paolo Bonzini Signed-off-by: Li RongQing Link: https://lore.kernel.org/r/20230106040625.8404-1-lirongqing@baidu.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d2ad5e8b63a4..c28ba0275580 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1968,8 +1968,12 @@ static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg) if (unlikely(count_reg != APIC_TMICT)) { deadline = tmict_to_ns(apic, kvm_lapic_get_reg(apic, count_reg)); - if (unlikely(deadline <= 0)) - deadline = apic->lapic_timer.period; + if (unlikely(deadline <= 0)) { + if (apic_lvtt_period(apic)) + deadline = apic->lapic_timer.period; + else + deadline = 0; + } else if (unlikely(deadline > apic->lapic_timer.period)) { pr_info_ratelimited( "vcpu %i: requested lapic timer restore with " -- cgit v1.2.3 From f8df91e73a6827a4569bb56cd53e55b4ea2f5b1f Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 1 Sep 2022 14:18:06 -0700 Subject: x86/cpufeatures: Add macros for Intel's new fast rep string features KVM_GET_SUPPORTED_CPUID should reflect these host CPUID bits. The bits are already cached in word 12. Give the bits X86_FEATURE names, so that they can be easily referenced. Hide these bits from /proc/cpuinfo, since the host kernel makes no use of them at present. Signed-off-by: Jim Mattson Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20220901211811.2883855-1-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 61012476d66e..cdb7e1492311 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -312,6 +312,9 @@ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ #define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */ +#define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */ +#define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ +#define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ -- cgit v1.2.3 From 2a4209d6a9cb51082813e3081e1172ae68d27935 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 1 Sep 2022 14:18:07 -0700 Subject: KVM: x86: Advertise fast REP string features inherent to the CPU Fast zero-length REP MOVSB, fast short REP STOSB, and fast short REP {CMPSB,SCASB} are inherent features of the processor that cannot be hidden by the hypervisor. When these features are present on the host, enumerate them in KVM_GET_SUPPORTED_CPUID. Signed-off-by: Jim Mattson Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20220901211811.2883855-2-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2a9f1e200dbc..d37cdab18715 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -664,8 +664,9 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); kvm_cpu_cap_mask(CPUID_7_1_EAX, - F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(AMX_FP16) | - F(AVX_IFMA) + F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | + F(FZRM) | F(FSRS) | F(FSRC) | + F(AMX_FP16) | F(AVX_IFMA) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, -- cgit v1.2.3 From 6213b701a9df047242e69672f56eba62ba0e2d8a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 18 Jan 2023 11:59:09 -0800 Subject: KVM: x86: Replace 0-length arrays with flexible arrays Zero-length arrays are deprecated[1]. Replace struct kvm_nested_state's "data" union 0-length arrays with flexible arrays. (How are the sizes of these arrays verified?) Detected with GCC 13, using -fstrict-flex-arrays=3: arch/x86/kvm/svm/nested.c: In function 'svm_get_nested_state': arch/x86/kvm/svm/nested.c:1536:17: error: array subscript 0 is outside array bounds of 'struct kvm_svm_nested_state_data[0]' [-Werror=array-bounds=] 1536 | &user_kvm_nested_state->data.svm[0]; | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from include/uapi/linux/kvm.h:15, from include/linux/kvm_host.h:40, from arch/x86/kvm/svm/nested.c:18: arch/x86/include/uapi/asm/kvm.h:511:50: note: while referencing 'svm' 511 | struct kvm_svm_nested_state_data svm[0]; | ^~~ [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays Cc: Sean Christopherson Cc: Paolo Bonzini Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "Gustavo A. R. Silva" Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: kvm@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20230105190548.never.323-kees@kernel.org Link: https://lore.kernel.org/r/20230118195905.gonna.693-kees@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/include/uapi/asm/kvm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index e48deab8901d..bde47f3a8c9d 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -9,6 +9,7 @@ #include #include +#include #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 @@ -507,8 +508,8 @@ struct kvm_nested_state { * KVM_{GET,PUT}_NESTED_STATE ioctl values. */ union { - struct kvm_vmx_nested_state_data vmx[0]; - struct kvm_svm_nested_state_data svm[0]; + __DECLARE_FLEX_ARRAY(struct kvm_vmx_nested_state_data, vmx); + __DECLARE_FLEX_ARRAY(struct kvm_svm_nested_state_data, svm); } data; }; -- cgit v1.2.3 From ee661d8ea94e11b3452cd27c301241f2cfa95fca Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 5 Jan 2023 13:43:03 -0800 Subject: KVM: x86: Replace cpu_dirty_logging_count with nr_memslots_dirty_logging Drop cpu_dirty_logging_count in favor of nr_memslots_dirty_logging. Both fields count the number of memslots that have dirty-logging enabled, with the only difference being that cpu_dirty_logging_count is only incremented when using PML. So while nr_memslots_dirty_logging is not a direct replacement for cpu_dirty_logging_count, it can be combined with enable_pml to get the same information. Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20230105214303.2919415-1-dmatlack@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/vmx/vmx.c | 9 ++++++--- arch/x86/kvm/x86.c | 8 +++----- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4d2bc08794e4..960f88ca9ce9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1327,7 +1327,6 @@ struct kvm_arch { u32 bsp_vcpu_id; u64 disabled_quirks; - int cpu_dirty_logging_count; enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c788aa382611..bbf60bda877e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4606,7 +4606,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) * it needs to be set here when dirty logging is already active, e.g. * if this vCPU was created after dirty logging was enabled. */ - if (!vcpu->kvm->arch.cpu_dirty_logging_count) + if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; if (cpu_has_vmx_xsaves()) { @@ -7988,17 +7988,20 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (WARN_ON_ONCE(!enable_pml)) + return; + if (is_guest_mode(vcpu)) { vmx->nested.update_vmcs01_cpu_dirty_logging = true; return; } /* - * Note, cpu_dirty_logging_count can be changed concurrent with this + * Note, nr_memslots_dirty_logging can be changed concurrent with this * code, but in that case another update request will be made and so * the guest will never run with a stale PML value. */ - if (vcpu->kvm->arch.cpu_dirty_logging_count) + if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); else secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 508074e47bc0..fc3ae3ccd005 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12488,16 +12488,14 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) { - struct kvm_arch *ka = &kvm->arch; + int nr_slots; if (!kvm_x86_ops.cpu_dirty_log_size) return; - if ((enable && ++ka->cpu_dirty_logging_count == 1) || - (!enable && --ka->cpu_dirty_logging_count == 0)) + nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging); + if ((enable && nr_slots == 1) || !nr_slots) kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING); - - WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0); } static void kvm_mmu_slot_apply_flags(struct kvm *kvm, -- cgit v1.2.3 From 48639df8a9e3a1dfcb5880f77556681bd1e53657 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Fri, 6 Jan 2023 10:35:59 +0000 Subject: KVM: x86/cpuid: generalize kvm_update_kvm_cpuid_base() and also capture limit A subsequent patch will need to acquire the CPUID leaf range for emulated Xen so explicitly pass the signature of the hypervisor we're interested in to the new function. Also introduce a new kvm_hypervisor_cpuid structure so we can neatly store both the base and limit leaf indices. Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20230106103600.528-2-pdurrant@amazon.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 7 ++++++- arch/x86/kvm/cpuid.c | 24 +++++++++++++----------- 2 files changed, 19 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 960f88ca9ce9..0e5da33999c6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -678,6 +678,11 @@ struct kvm_vcpu_hv { } nested; }; +struct kvm_hypervisor_cpuid { + u32 base; + u32 limit; +}; + /* Xen HVM per vcpu emulation context */ struct kvm_vcpu_xen { u64 hypercall_rip; @@ -826,7 +831,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; - u32 kvm_cpuid_base; + struct kvm_hypervisor_cpuid kvm_cpuid; u64 reserved_gpa_bits; int maxphyaddr; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index d37cdab18715..c5c6eefd15c2 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -181,15 +181,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 return 0; } -static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) +static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, + const char *sig) { - u32 function; + struct kvm_hypervisor_cpuid cpuid = {}; struct kvm_cpuid_entry2 *entry; + u32 base; - vcpu->arch.kvm_cpuid_base = 0; - - for_each_possible_hypervisor_cpuid_base(function) { - entry = kvm_find_cpuid_entry(vcpu, function); + for_each_possible_hypervisor_cpuid_base(base) { + entry = kvm_find_cpuid_entry(vcpu, base); if (entry) { u32 signature[3]; @@ -198,19 +198,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) signature[1] = entry->ecx; signature[2] = entry->edx; - BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE)); - if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) { - vcpu->arch.kvm_cpuid_base = function; + if (!memcmp(signature, sig, sizeof(signature))) { + cpuid.base = base; + cpuid.limit = entry->eax; break; } } } + + return cpuid; } static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, int nent) { - u32 base = vcpu->arch.kvm_cpuid_base; + u32 base = vcpu->arch.kvm_cpuid.base; if (!base) return NULL; @@ -440,7 +442,7 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, vcpu->arch.cpuid_entries = e2; vcpu->arch.cpuid_nent = nent; - kvm_update_kvm_cpuid_base(vcpu); + vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); kvm_vcpu_after_set_cpuid(vcpu); return 0; -- cgit v1.2.3 From f422f853af0369be27d2a9f1b20079f2bc3d1ca2 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Fri, 6 Jan 2023 10:36:00 +0000 Subject: KVM: x86/xen: update Xen CPUID Leaf 4 (tsc info) sub-leaves, if present The scaling information in subleaf 1 should match the values set by KVM in the 'vcpu_info' sub-structure 'time_info' (a.k.a. pvclock_vcpu_time_info) which is shared with the guest, but is not directly available to the VMM. The offset values are not set since a TSC offset is already applied. The TSC frequency should also be set in sub-leaf 2. Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20230106103600.528-3-pdurrant@amazon.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/xen/hypervisor.h | 4 +++- arch/x86/kvm/cpuid.c | 2 ++ arch/x86/kvm/x86.c | 1 + arch/x86/kvm/xen.c | 26 ++++++++++++++++++++++++++ arch/x86/kvm/xen.h | 7 +++++++ 6 files changed, 40 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0e5da33999c6..8b38a4cb2e29 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -703,6 +703,7 @@ struct kvm_vcpu_xen { struct hrtimer timer; int poll_evtchn; struct timer_list poll_timer; + struct kvm_hypervisor_cpuid cpuid; }; struct kvm_queued_exception { diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 16f548a661cf..5fc35f889cd1 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -38,9 +38,11 @@ extern struct start_info *xen_start_info; #include +#define XEN_SIGNATURE "XenVMMXenVMM" + static inline uint32_t xen_cpuid_base(void) { - return hypervisor_cpuid_base("XenVMMXenVMM", 2); + return hypervisor_cpuid_base(XEN_SIGNATURE, 2); } struct pci_dev; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c5c6eefd15c2..8f8edeaf8177 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -26,6 +26,7 @@ #include "mmu.h" #include "trace.h" #include "pmu.h" +#include "xen.h" /* * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be @@ -443,6 +444,7 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, vcpu->arch.cpuid_nent = nent; vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); + vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE); kvm_vcpu_after_set_cpuid(vcpu); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fc3ae3ccd005..65b401e94cec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3161,6 +3161,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) &vcpu->hv_clock.tsc_shift, &vcpu->hv_clock.tsc_to_system_mul); vcpu->hw_tsc_khz = tgt_tsc_khz; + kvm_xen_update_tsc_info(v); } vcpu->hv_clock.tsc_timestamp = tsc_timestamp; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 2681e2007e39..40edf4d1974c 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -23,6 +23,9 @@ #include #include +#include + +#include "cpuid.h" #include "trace.h" static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm); @@ -2077,6 +2080,29 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) del_timer_sync(&vcpu->arch.xen.poll_timer); } +void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *entry; + u32 function; + + if (!vcpu->arch.xen.cpuid.base) + return; + + function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3); + if (function > vcpu->arch.xen.cpuid.limit) + return; + + entry = kvm_find_cpuid_entry_index(vcpu, function, 1); + if (entry) { + entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul; + entry->edx = vcpu->arch.hv_clock.tsc_shift; + } + + entry = kvm_find_cpuid_entry_index(vcpu, function, 2); + if (entry) + entry->eax = vcpu->arch.hw_tsc_khz; +} + void kvm_xen_init_vm(struct kvm *kvm) { mutex_init(&kvm->arch.xen.xen_lock); diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index ea33d80a0c51..f8f1fe22d090 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -9,6 +9,8 @@ #ifndef __ARCH_X86_KVM_XEN_H__ #define __ARCH_X86_KVM_XEN_H__ +#include + #ifdef CONFIG_KVM_XEN #include @@ -32,6 +34,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, int kvm_xen_setup_evtchn(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); +void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu); static inline bool kvm_xen_msr_enabled(struct kvm *kvm) { @@ -135,6 +138,10 @@ static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu) { return false; } + +static inline void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) +{ +} #endif int kvm_xen_hypercall(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 26044aff37a5455b19a91785086914fd33053ef4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:36:47 +0000 Subject: x86/crash: Disable virt in core NMI crash handler to avoid double shootdown Disable virtualization in crash_nmi_callback() and rework the emergency_vmx_disable_all() path to do an NMI shootdown if and only if a shootdown has not already occurred. NMI crash shootdown fundamentally can't support multiple invocations as responding CPUs are deliberately put into halt state without unblocking NMIs. But, the emergency reboot path doesn't have any work of its own, it simply cares about disabling virtualization, i.e. so long as a shootdown occurred, emergency reboot doesn't care who initiated the shootdown, or when. If "crash_kexec_post_notifiers" is specified on the kernel command line, panic() will invoke crash_smp_send_stop() and result in a second call to nmi_shootdown_cpus() during native_machine_emergency_restart(). Invoke the callback _before_ disabling virtualization, as the current VMCS needs to be cleared before doing VMXOFF. Note, this results in a subtle change in ordering between disabling virtualization and stopping Intel PT on the responding CPUs. While VMX and Intel PT do interact, VMXOFF and writes to MSR_IA32_RTIT_CTL do not induce faults between one another, which is all that matters when panicking. Harden nmi_shootdown_cpus() against multiple invocations to try and capture any such kernel bugs via a WARN instead of hanging the system during a crash/dump, e.g. prior to the recent hardening of register_nmi_handler(), re-registering the NMI handler would trigger a double list_add() and hang the system if CONFIG_BUG_ON_DATA_CORRUPTION=y. list_add double add: new=ffffffff82220800, prev=ffffffff8221cfe8, next=ffffffff82220800. WARNING: CPU: 2 PID: 1319 at lib/list_debug.c:29 __list_add_valid+0x67/0x70 Call Trace: __register_nmi_handler+0xcf/0x130 nmi_shootdown_cpus+0x39/0x90 native_machine_emergency_restart+0x1c9/0x1d0 panic+0x237/0x29b Extract the disabling logic to a common helper to deduplicate code, and to prepare for doing the shootdown in the emergency reboot path if SVM is supported. Note, prior to commit ed72736183c4 ("x86/reboot: Force all cpus to exit VMX root if VMX is supported"), nmi_shootdown_cpus() was subtly protected against a second invocation by a cpu_vmx_enabled() check as the kdump handler would disable VMX if it ran first. Fixes: ed72736183c4 ("x86/reboot: Force all cpus to exit VMX root if VMX is supported") Cc: stable@vger.kernel.org Reported-by: Guilherme G. Piccoli Cc: Vitaly Kuznetsov Cc: Paolo Bonzini Link: https://lore.kernel.org/all/20220427224924.592546-2-gpiccoli@igalia.com Tested-by: Guilherme G. Piccoli Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20221130233650.1404148-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/reboot.h | 2 ++ arch/x86/kernel/crash.c | 17 +---------- arch/x86/kernel/reboot.c | 65 +++++++++++++++++++++++++++++++++++-------- 3 files changed, 56 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 04c17be9b5fd..bc5b4d788c08 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,6 +25,8 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 +void cpu_emergency_disable_virtualization(void); + typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_panic_self_stop(struct pt_regs *regs); void nmi_shootdown_cpus(nmi_shootdown_cb callback); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 305514431f26..cdd92ab43cda 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -81,15 +80,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) */ cpu_crash_vmclear_loaded_vmcss(); - /* Disable VMX or SVM if needed. - * - * We need to disable virtualization on all CPUs. - * Having VMX or SVM enabled on any CPU may break rebooting - * after the kdump kernel has finished its task. - */ - cpu_emergency_vmxoff(); - cpu_emergency_svm_disable(); - /* * Disable Intel PT to stop its logging */ @@ -148,12 +138,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) */ cpu_crash_vmclear_loaded_vmcss(); - /* Booting kdump kernel with VMX or SVM enabled won't work, - * because (among other limitations) we can't disable paging - * with the virt flags. - */ - cpu_emergency_vmxoff(); - cpu_emergency_svm_disable(); + cpu_emergency_disable_virtualization(); /* * Disable Intel PT to stop its logging diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index c3636ea4aa71..374c14b3a9bf 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -528,10 +528,7 @@ static inline void kb_wait(void) } } -static void vmxoff_nmi(int cpu, struct pt_regs *regs) -{ - cpu_emergency_vmxoff(); -} +static inline void nmi_shootdown_cpus_on_restart(void); /* Use NMIs as IPIs to tell all CPUs to disable virtualization */ static void emergency_vmx_disable_all(void) @@ -554,7 +551,7 @@ static void emergency_vmx_disable_all(void) __cpu_emergency_vmxoff(); /* Halt and exit VMX root operation on the other CPUs. */ - nmi_shootdown_cpus(vmxoff_nmi); + nmi_shootdown_cpus_on_restart(); } } @@ -795,6 +792,17 @@ void machine_crash_shutdown(struct pt_regs *regs) /* This is the CPU performing the emergency shutdown work. */ int crashing_cpu = -1; +/* + * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during + * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if + * GIF=0, i.e. if the crash occurred between CLGI and STGI. + */ +void cpu_emergency_disable_virtualization(void) +{ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); +} + #if defined(CONFIG_SMP) static nmi_shootdown_cb shootdown_callback; @@ -817,7 +825,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) return NMI_HANDLED; local_irq_disable(); - shootdown_callback(cpu, regs); + if (shootdown_callback) + shootdown_callback(cpu, regs); + + /* + * Prepare the CPU for reboot _after_ invoking the callback so that the + * callback can safely use virtualization instructions, e.g. VMCLEAR. + */ + cpu_emergency_disable_virtualization(); atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ @@ -828,18 +843,32 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) return NMI_HANDLED; } -/* - * Halt all other CPUs, calling the specified function on each of them +/** + * nmi_shootdown_cpus - Stop other CPUs via NMI + * @callback: Optional callback to be invoked from the NMI handler + * + * The NMI handler on the remote CPUs invokes @callback, if not + * NULL, first and then disables virtualization to ensure that + * INIT is recognized during reboot. * - * This function can be used to halt all other CPUs on crash - * or emergency reboot time. The function passed as parameter - * will be called inside a NMI handler on all CPUs. + * nmi_shootdown_cpus() can only be invoked once. After the first + * invocation all other CPUs are stuck in crash_nmi_callback() and + * cannot respond to a second NMI. */ void nmi_shootdown_cpus(nmi_shootdown_cb callback) { unsigned long msecs; + local_irq_disable(); + /* + * Avoid certain doom if a shootdown already occurred; re-registering + * the NMI handler will cause list corruption, modifying the callback + * will do who knows what, etc... + */ + if (WARN_ON_ONCE(crash_ipi_issued)) + return; + /* Make a note of crashing cpu. Will be used in NMI callback. */ crashing_cpu = safe_smp_processor_id(); @@ -867,7 +896,17 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) msecs--; } - /* Leave the nmi callback set */ + /* + * Leave the nmi callback set, shootdown is a one-time thing. Clearing + * the callback could result in a NULL pointer dereference if a CPU + * (finally) responds after the timeout expires. + */ +} + +static inline void nmi_shootdown_cpus_on_restart(void) +{ + if (!crash_ipi_issued) + nmi_shootdown_cpus(NULL); } /* @@ -897,6 +936,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* No other CPUs to shoot down */ } +static inline void nmi_shootdown_cpus_on_restart(void) { } + void run_crash_ipi_callback(struct pt_regs *regs) { } -- cgit v1.2.3 From 6a3236580b0b1accc3976345e723104f74f6f8e6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:36:48 +0000 Subject: x86/virt: Force GIF=1 prior to disabling SVM (for reboot flows) Set GIF=1 prior to disabling SVM to ensure that INIT is recognized if the kernel is disabling SVM in an emergency, e.g. if the kernel is about to jump into a crash kernel or may reboot without doing a full CPU RESET. If GIF is left cleared, the new kernel (or firmware) will be unabled to awaken APs. Eat faults on STGI (due to EFER.SVME=0) as it's possible that SVM could be disabled via NMI shootdown between reading EFER.SVME and executing STGI. Link: https://lore.kernel.org/all/cbcb6f35-e5d7-c1c9-4db9-fe5cc4de579a@amd.com Cc: stable@vger.kernel.org Cc: Andrew Cooper Cc: Tom Lendacky Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20221130233650.1404148-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/virtext.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 8757078d4442..3b12e6b99412 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -126,7 +126,21 @@ static inline void cpu_svm_disable(void) wrmsrl(MSR_VM_HSAVE_PA, 0); rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer & ~EFER_SVME); + if (efer & EFER_SVME) { + /* + * Force GIF=1 prior to disabling SVM to ensure INIT and NMI + * aren't blocked, e.g. if a fatal error occurred between CLGI + * and STGI. Note, STGI may #UD if SVM is disabled from NMI + * context between reading EFER and executing STGI. In that + * case, GIF must already be set, otherwise the NMI would have + * been blocked, so just eat the fault. + */ + asm_volatile_goto("1: stgi\n\t" + _ASM_EXTABLE(1b, %l[fault]) + ::: "memory" : fault); +fault: + wrmsrl(MSR_EFER, efer & ~EFER_SVME); + } } /** Makes sure SVM is disabled, if it is supported on the CPU -- cgit v1.2.3 From d81f952aa657b76cea381384bef1fea35c5fd266 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:36:49 +0000 Subject: x86/reboot: Disable virtualization in an emergency if SVM is supported Disable SVM on all CPUs via NMI shootdown during an emergency reboot. Like VMX, SVM can block INIT, e.g. if the emergency reboot is triggered between CLGI and STGI, and thus can prevent bringing up other CPUs via INIT-SIPI-SIPI. Cc: stable@vger.kernel.org Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20221130233650.1404148-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kernel/reboot.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 374c14b3a9bf..d03c551defcc 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -530,27 +530,26 @@ static inline void kb_wait(void) static inline void nmi_shootdown_cpus_on_restart(void); -/* Use NMIs as IPIs to tell all CPUs to disable virtualization */ -static void emergency_vmx_disable_all(void) +static void emergency_reboot_disable_virtualization(void) { /* Just make sure we won't change CPUs while doing this */ local_irq_disable(); /* - * Disable VMX on all CPUs before rebooting, otherwise we risk hanging - * the machine, because the CPU blocks INIT when it's in VMX root. + * Disable virtualization on all CPUs before rebooting to avoid hanging + * the system, as VMX and SVM block INIT when running in the host. * * We can't take any locks and we may be on an inconsistent state, so - * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt. + * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt. * - * Do the NMI shootdown even if VMX if off on _this_ CPU, as that - * doesn't prevent a different CPU from being in VMX root operation. + * Do the NMI shootdown even if virtualization is off on _this_ CPU, as + * other CPUs may have virtualization enabled. */ - if (cpu_has_vmx()) { - /* Safely force _this_ CPU out of VMX root operation. */ - __cpu_emergency_vmxoff(); + if (cpu_has_vmx() || cpu_has_svm(NULL)) { + /* Safely force _this_ CPU out of VMX/SVM operation. */ + cpu_emergency_disable_virtualization(); - /* Halt and exit VMX root operation on the other CPUs. */ + /* Disable VMX/SVM and halt on other CPUs. */ nmi_shootdown_cpus_on_restart(); } } @@ -587,7 +586,7 @@ static void native_machine_emergency_restart(void) unsigned short mode; if (reboot_emergency) - emergency_vmx_disable_all(); + emergency_reboot_disable_virtualization(); tboot_shutdown(TB_SHUTDOWN_REBOOT); -- cgit v1.2.3 From a2b07fa7b93321c059af0c6d492cc9a4f1e390aa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:36:50 +0000 Subject: x86/reboot: Disable SVM, not just VMX, when stopping CPUs Disable SVM and more importantly force GIF=1 when halting a CPU or rebooting the machine. Similar to VMX, SVM allows software to block INITs via CLGI, and thus can be problematic for a crash/reboot. The window for failure is smaller with SVM as INIT is only blocked while GIF=0, i.e. between CLGI and STGI, but the window does exist. Fixes: fba4f472b33a ("x86/reboot: Turn off KVM when halting a CPU") Cc: stable@vger.kernel.org Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20221130233650.1404148-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kernel/smp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 06db901fabe8..375b33ecafa2 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include /* * Some notes on x86 processor bugs affecting SMP operation: @@ -122,7 +122,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) return NMI_HANDLED; - cpu_emergency_vmxoff(); + cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); return NMI_HANDLED; @@ -134,7 +134,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { ack_APIC_irq(); - cpu_emergency_vmxoff(); + cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); } -- cgit v1.2.3 From dc1ae59fc4310bdb54d495bdad180b46f646a9ea Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 5 Jan 2023 18:02:03 +0800 Subject: kvm: x86/mmu: Rename SPTE_TDP_AD_ENABLED_MASK to SPTE_TDP_AD_ENABLED SPTE_TDP_AD_ENABLED_MASK, SPTE_TDP_AD_DISABLED_MASK and SPTE_TDP_AD_WRPROT_ONLY_MASK are actual value, not mask. Remove "MASK" from their names. Signed-off-by: Lai Jiangshan Link: https://lore.kernel.org/r/20230105100204.6521-1-jiangshanlai@gmail.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/spte.c | 6 +++--- arch/x86/kvm/mmu/spte.h | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index fce6f047399f..c15bfca3ed15 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -147,9 +147,9 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, WARN_ON_ONCE(!pte_access && !shadow_present_mask); if (sp->role.ad_disabled) - spte |= SPTE_TDP_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED; else if (kvm_mmu_page_ad_need_write_protect(sp)) - spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; + spte |= SPTE_TDP_AD_WRPROT_ONLY; /* * For the EPT case, shadow_present_mask is 0 if hardware @@ -317,7 +317,7 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) shadow_user_mask | shadow_x_mask | shadow_me_value; if (ad_disabled) - spte |= SPTE_TDP_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED; else spte |= shadow_accessed_mask; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 0d8deefee66c..1279db2eab44 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -28,10 +28,10 @@ */ #define SPTE_TDP_AD_SHIFT 52 #define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT) -#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT) -#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT) -#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT) -static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); +#define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_WRPROT_ONLY (2ULL << SPTE_TDP_AD_SHIFT) +static_assert(SPTE_TDP_AD_ENABLED == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK #define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) @@ -164,7 +164,7 @@ extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; /* - * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK; + * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED; * shadow_acc_track_mask is the set of bits to be cleared in non-accessed * pages. */ @@ -266,18 +266,18 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { MMU_WARN_ON(!is_shadow_present_pte(spte)); - return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK; + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED; } static inline bool spte_ad_need_write_protect(u64 spte) { MMU_WARN_ON(!is_shadow_present_pte(spte)); /* - * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0', + * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0', * and non-TDP SPTEs will never set these bits. Optimize for 64-bit * TDP and do the A/D type check unconditionally. */ - return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK; + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED; } static inline u64 spte_shadow_accessed_mask(u64 spte) -- cgit v1.2.3 From 9e3fbdfd9b894f15c52965bbc1696a65d377f3cd Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 5 Jan 2023 18:03:10 +0800 Subject: kvm: x86/mmu: Don't clear write flooding for direct SP Although there is no harm, but there is no point to clear write flooding for direct SP. Signed-off-by: Lai Jiangshan Link: https://lore.kernel.org/r/20230105100310.6700-1-jiangshanlai@gmail.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/paging_tmpl.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index e5662dbd519c..0160b502784a 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -642,12 +642,12 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) goto out_gpte_changed; - for (shadow_walk_init(&it, vcpu, fault->addr); - shadow_walk_okay(&it) && it.level > gw->level; - shadow_walk_next(&it)) { + for_each_shadow_entry(vcpu, fault->addr, it) { gfn_t table_gfn; clear_sp_write_flooding_count(it.sptep); + if (it.level == gw->level) + break; table_gfn = gw->table_gfn[it.level - 2]; access = gw->pt_access[it.level - 2]; @@ -692,8 +692,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, trace_kvm_mmu_spte_requested(fault); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { - clear_sp_write_flooding_count(it.sptep); - /* * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. -- cgit v1.2.3 From a7e48ef77ff27ba2647edec271b302804a0b14c3 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 28 Nov 2022 21:47:09 +0000 Subject: KVM: x86/mmu: fix an incorrect comment in kvm_mmu_new_pgd() There is no function named kvm_mmu_ensure_valid_pgd(). Fix the comment and remove the pair of braces to conform to Linux kernel coding style. Signed-off-by: Wei Liu Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221128214709.224710-1-wei.liu@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index aeb240b339f5..d43952fb05a6 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4556,10 +4556,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) struct kvm_mmu *mmu = vcpu->arch.mmu; union kvm_mmu_page_role new_role = mmu->root_role; - if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) { - /* kvm_mmu_ensure_valid_pgd will set up a new root. */ + /* + * Return immediately if no usable root was found, kvm_mmu_reload() + * will establish a valid root prior to the next VM-Enter. + */ + if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) return; - } /* * It's possible that the cached previous root page is obsolete because -- cgit v1.2.3 From c667a3baeddcc982bf512c126aec8d0f04adecfe Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Mon, 10 Oct 2022 20:19:12 +0800 Subject: KVM: x86/mmu: Move round_gfn_for_level() helper into mmu_internal.h Rounding down the GFN to a huge page size is a common pattern throughout KVM, so move round_gfn_for_level() helper in tdp_iter.c to mmu_internal.h for common usage. Also rename it as gfn_round_for_level() to use gfn_* prefix and clean up the other call sites. Signed-off-by: Hou Wenlong Link: https://lore.kernel.org/r/415c64782f27444898db650e21cf28eeb6441dfa.1665214747.git.houwenlong.hwl@antgroup.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 5 +++-- arch/x86/kvm/mmu/mmu_internal.h | 5 +++++ arch/x86/kvm/mmu/paging_tmpl.h | 2 +- arch/x86/kvm/mmu/tdp_iter.c | 11 +++-------- 4 files changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d43952fb05a6..f3bef56a89a3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3169,7 +3169,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + base_gfn = gfn_round_for_level(fault->gfn, it.level); if (it.level == fault->goal_level) break; @@ -4440,7 +4440,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) { for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = fault->gfn & ~(page_num - 1); + gfn_t base = gfn_round_for_level(fault->gfn, + fault->max_level); if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) break; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index ac00bfbf32f6..486be0f9fe16 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -156,6 +156,11 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; } +static inline gfn_t gfn_round_for_level(gfn_t gfn, int level) +{ + return gfn & -KVM_PAGES_PER_HPAGE(level); +} + int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, bool can_unsync, bool prefetch); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 0160b502784a..faf193e97061 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -699,7 +699,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + base_gfn = gfn_round_for_level(fault->gfn, it.level); if (it.level == fault->goal_level) break; diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index e26e744df1d1..d2eb0d4f8710 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -16,11 +16,6 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); } -static gfn_t round_gfn_for_level(gfn_t gfn, int level) -{ - return gfn & -KVM_PAGES_PER_HPAGE(level); -} - /* * Return the TDP iterator to the root PT and allow it to continue its * traversal over the paging structure from there. @@ -31,7 +26,7 @@ void tdp_iter_restart(struct tdp_iter *iter) iter->yielded_gfn = iter->next_last_level_gfn; iter->level = iter->root_level; - iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); iter->valid = true; @@ -98,7 +93,7 @@ static bool try_step_down(struct tdp_iter *iter) iter->level--; iter->pt_path[iter->level - 1] = child_pt; - iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); return true; @@ -140,7 +135,7 @@ static bool try_step_up(struct tdp_iter *iter) return false; iter->level++; - iter->gfn = round_gfn_for_level(iter->gfn, iter->level); + iter->gfn = gfn_round_for_level(iter->gfn, iter->level); tdp_iter_refresh_sptep(iter); return true; -- cgit v1.2.3 From 9ffe9265375cbaf6c01647e31ae9fee8595b698c Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Mon, 10 Oct 2022 20:19:13 +0800 Subject: KVM: x86/mmu: Fix wrong gfn range of tlb flushing in kvm_set_pte_rmapp() When the spte of hupe page is dropped in kvm_set_pte_rmapp(), the whole gfn range covered by the spte should be flushed. However, rmap_walk_init_level() doesn't align down the gfn for new level like tdp iterator does, then the gfn used in kvm_set_pte_rmapp() is not the base gfn of huge page. And the size of gfn range is wrong too for huge page. Use the base gfn of huge page and the size of huge page for flushing tlbs for huge page. Also introduce a helper function to flush the given page (huge or not) of guest memory, which would help prevent future buggy use of kvm_flush_remote_tlbs_with_address() in such case. Fixes: c3134ce240eed ("KVM: Replace old tlb flush function with new one to flush a specified range.") Signed-off-by: Hou Wenlong Link: https://lore.kernel.org/r/0ce24d7078fa5f1f8d64b0c59826c50f32f8065e.1665214747.git.houwenlong.hwl@antgroup.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/mmu_internal.h | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f3bef56a89a3..52be98e0102a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1469,7 +1469,7 @@ restart: } if (need_flush && kvm_available_flush_tlb_with_range()) { - kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + kvm_flush_remote_tlbs_gfn(kvm, gfn, level); return false; } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 486be0f9fe16..cc58631e2336 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -169,8 +169,17 @@ void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); + void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); + +/* Flush the given page (huge or not) of guest memory. */ +static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level) +{ + kvm_flush_remote_tlbs_with_address(kvm, gfn_round_for_level(gfn, level), + KVM_PAGES_PER_HPAGE(level)); +} + unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); extern int nx_huge_pages; -- cgit v1.2.3 From 1e203847aa9245bd782d6dc904ece124ca1b89cb Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Mon, 10 Oct 2022 20:19:14 +0800 Subject: KVM: x86/mmu: Reduce gfn range of tlb flushing in tdp_mmu_map_handle_target_level() Since the children SP is zapped, the gfn range of tlb flushing should be the range covered by children SP not parent SP. Replace sp->gfn which is the base gfn of parent SP with iter->gfn and use the correct size of gfn range for children SP to reduce tlb flushing range. Fixes: bb95dfb9e2df ("KVM: x86/mmu: Defer TLB flush to caller when freeing TDP MMU shadow pages") Signed-off-by: Hou Wenlong Reviewed-by: David Matlack Link: https://lore.kernel.org/r/528ab9c784a486e9ce05f61462ad9260796a8732.1665214747.git.houwenlong.hwl@antgroup.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index bba33aea0fb0..ce2f86f8a3a5 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1080,8 +1080,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, return RET_PF_RETRY; else if (is_shadow_present_pte(iter->old_spte) && !is_last_spte(iter->old_spte, iter->level)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(iter->level + 1)); + kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level); /* * If the page fault was caused by a write but the page is write -- cgit v1.2.3 From 1b2dc7360463932d2e7cac9461de16ce6fa11cb7 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Mon, 10 Oct 2022 20:19:15 +0800 Subject: KVM: x86/mmu: Fix wrong start gfn of tlb flushing with range When a spte is dropped, the start gfn of tlb flushing should be the gfn of spte not the base gfn of SP which contains the spte. Also introduce a helper function to do range-based flushing when a spte is dropped, which would help prevent future buggy use of kvm_flush_remote_tlbs_with_address() in such case. Fixes: c3134ce240eed ("KVM: Replace old tlb flush function with new one to flush a specified range.") Suggested-by: David Matlack Signed-off-by: Hou Wenlong Link: https://lore.kernel.org/r/72ac2169a261976f00c1703e88cda676dfb960f5.1665214747.git.houwenlong.hwl@antgroup.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 20 ++++++++++++++------ arch/x86/kvm/mmu/paging_tmpl.h | 3 +-- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 52be98e0102a..5e34f07e46ba 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -269,6 +269,17 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } +static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index); + +/* Flush the range of guest memory mapped by the given SPTE. */ +static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep) +{ + struct kvm_mmu_page *sp = sptep_to_sp(sptep); + gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep)); + + kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); +} + static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { @@ -1187,8 +1198,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) drop_spte(kvm, sptep); if (flush) - kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm_flush_remote_tlbs_sptep(kvm, sptep); } /* @@ -1639,8 +1649,7 @@ static void __rmap_add(struct kvm *kvm, kvm->stat.max_mmu_rmap_size = rmap_count; if (rmap_count > RMAP_RECYCLE_THRESHOLD) { kvm_zap_all_rmap_sptes(kvm, rmap_head); - kvm_flush_remote_tlbs_with_address( - kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); } } @@ -6521,8 +6530,7 @@ restart: kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) - kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm_flush_remote_tlbs_sptep(kvm, sptep); else need_tlb_flush = 1; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index faf193e97061..57f0b75c80f9 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -927,8 +927,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); if (is_shadow_present_pte(old_spte)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, - sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); if (!rmap_can_add(vcpu)) break; -- cgit v1.2.3 From 3cdf93746f9a9a8a81b044219dcf51ce98bd0abf Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Mon, 10 Oct 2022 20:19:16 +0800 Subject: KVM: x86/mmu: Fix wrong gfn range of tlb flushing in validate_direct_spte() The spte pointing to the children SP is dropped, so the whole gfn range covered by the children SP should be flushed. Although, Hyper-V may treat a 1-page flush the same if the address points to a huge page, it still would be better to use the correct size of huge page. Fixes: c3134ce240eed ("KVM: Replace old tlb flush function with new one to flush a specified range.") Suggested-by: Sean Christopherson Signed-off-by: Hou Wenlong Link: https://lore.kernel.org/r/5f297c566f7d7ff2ea6da3c66d050f69ce1b8ede.1665214747.git.houwenlong.hwl@antgroup.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5e34f07e46ba..47b15c146ba5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2414,7 +2414,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, return; drop_parent_pte(child, sptep); - kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1); + kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); } } -- cgit v1.2.3 From 4ad980aea7f58632638d036ac5697fccf8234dde Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Mon, 10 Oct 2022 20:19:17 +0800 Subject: KVM: x86/mmu: Cleanup range-based flushing for given page Use the new kvm_flush_remote_tlbs_gfn() helper to cleanup the call sites of range-based flushing for given page, which makes the code clear. Signed-off-by: Hou Wenlong Link: https://lore.kernel.org/r/593ee1a876ece0e819191c0b23f56b940d6686db.1665214747.git.houwenlong.hwl@antgroup.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 5 ++--- arch/x86/kvm/mmu/tdp_mmu.c | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 47b15c146ba5..936cee46779c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -824,7 +824,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) - kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K); } void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -2898,8 +2898,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } if (flush) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, - KVM_PAGES_PER_HPAGE(level)); + kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); pgprintk("%s: setting spte %llx\n", __func__, *sptep); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index ce2f86f8a3a5..7c25dbf32ecc 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -680,8 +680,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, if (ret) return ret; - kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, - KVM_PAGES_PER_HPAGE(iter->level)); + kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level); /* * No other thread can overwrite the removed SPTE as they must either -- cgit v1.2.3 From 11b36fe7d4500c8ef73677c087f302fd713101c2 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 14 Jan 2023 10:39:11 +0100 Subject: KVM: x86/mmu: Use kstrtobool() instead of strtobool() strtobool() is the same as kstrtobool(). However, the latter is more used within the kernel. In order to remove strtobool() and slightly simplify kstrtox.h, switch to the other function name. While at it, include the corresponding header file () Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/670882aa04dbdd171b46d3b20ffab87158454616.1673689135.git.christophe.jaillet@wanadoo.fr Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 936cee46779c..c91ee2927dd7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -6762,7 +6763,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) new_val = 1; else if (sysfs_streq(val, "auto")) new_val = get_nx_auto_mode(); - else if (strtobool(val, &new_val) < 0) + else if (kstrtobool(val, &new_val) < 0) return -EINVAL; __set_nx_huge_pages(new_val); -- cgit v1.2.3 From 6a5cba7bed35580effda9fb1872b274da47e6b23 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Tue, 20 Dec 2022 16:12:30 +0000 Subject: KVM: x86/pmu: Correct the mask used in a pmu event filter lookup When checking if a pmu event the guest is attempting to program should be filtered, only consider the event select + unit mask in that decision. Use an architecture specific mask to mask out all other bits, including bits 35:32 on Intel. Those bits are not part of the event select and should not be considered in that decision. Fixes: 66bb8a065f5a ("KVM: x86: PMU Event Filter") Signed-off-by: Aaron Lewis Link: https://lore.kernel.org/r/20221220161236.555143-2-aaronlewis@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 3 ++- arch/x86/kvm/pmu.h | 2 ++ arch/x86/kvm/svm/pmu.c | 1 + arch/x86/kvm/vmx/pmu_intel.c | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index d939d3b84e6f..f5b933eeb549 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -279,7 +279,8 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) goto out; if (pmc_is_gp(pmc)) { - key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB; + key = pmc->eventsel & (kvm_pmu_ops.EVENTSEL_EVENT | + ARCH_PERFMON_EVENTSEL_UMASK); if (bsearch(&key, filter->events, filter->nevents, sizeof(__u64), cmp_u64)) allow_event = filter->action == KVM_PMU_EVENT_ALLOW; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index cdb91009701d..30bfccc6df60 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -40,6 +40,8 @@ struct kvm_pmu_ops { void (*reset)(struct kvm_vcpu *vcpu); void (*deliver_pmi)(struct kvm_vcpu *vcpu); void (*cleanup)(struct kvm_vcpu *vcpu); + + const u64 EVENTSEL_EVENT; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 1ff068f23841..5da8c292e3e3 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -231,4 +231,5 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .refresh = amd_pmu_refresh, .init = amd_pmu_init, .reset = amd_pmu_reset, + .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, }; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index efce9ad70e4e..7980fda3978d 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -811,4 +811,5 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .reset = intel_pmu_reset, .deliver_pmi = intel_pmu_deliver_pmi, .cleanup = intel_pmu_cleanup, + .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, }; -- cgit v1.2.3 From 8589827fd5342b0ae3703e3093a946d708c44f3a Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Tue, 20 Dec 2022 16:12:31 +0000 Subject: KVM: x86/pmu: Remove impossible events from the pmu event filter If it's not possible for an event in the pmu event filter to match a pmu event being programmed by the guest, it's pointless to have it in the list. Opt for a shorter list by removing those events. Because this is established uAPI the pmu event filter can't outright rejected these events as garbage and return an error. Instead, play nice and remove them from the list. Also, opportunistically rewrite the comment when the filter is set to clarify that it guards against *all* TOCTOU attacks on the verified data. Signed-off-by: Aaron Lewis Link: https://lore.kernel.org/r/20221220161236.555143-3-aaronlewis@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index f5b933eeb549..d29f6393c07e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -594,6 +594,21 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) } EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); +static void remove_impossible_events(struct kvm_pmu_event_filter *filter) +{ + int i, j; + + for (i = 0, j = 0; i < filter->nevents; i++) { + if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT | + ARCH_PERFMON_EVENTSEL_UMASK)) + continue; + + filter->events[j++] = filter->events[i]; + } + + filter->nevents = j; +} + int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) { struct kvm_pmu_event_filter tmp, *filter; @@ -624,9 +639,11 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) if (copy_from_user(filter, argp, size)) goto cleanup; - /* Ensure nevents can't be changed between the user copies. */ + /* Restore the verified state to guard against TOCTOU attacks. */ *filter = tmp; + remove_impossible_events(filter); + /* * Sort the in-kernel list so that we can search it with bsearch. */ -- cgit v1.2.3 From c5a287fa0dccd3e43a6ea5602191f9ac09a68889 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Tue, 20 Dec 2022 16:12:32 +0000 Subject: KVM: x86/pmu: prepare the pmu event filter for masked events Refactor check_pmu_event_filter() in preparation for masked events. No functional changes intended Signed-off-by: Aaron Lewis Link: https://lore.kernel.org/r/20221220161236.555143-4-aaronlewis@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 56 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index d29f6393c07e..8a734f4343bb 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -263,41 +263,51 @@ static int cmp_u64(const void *pa, const void *pb) return (a > b) - (a < b); } +static u64 *find_filter_entry(struct kvm_pmu_event_filter *filter, u64 key) +{ + return bsearch(&key, filter->events, filter->nevents, + sizeof(filter->events[0]), cmp_u64); +} + +static bool is_gp_event_allowed(struct kvm_pmu_event_filter *filter, u64 eventsel) +{ + if (find_filter_entry(filter, eventsel & (kvm_pmu_ops.EVENTSEL_EVENT | + ARCH_PERFMON_EVENTSEL_UMASK))) + return filter->action == KVM_PMU_EVENT_ALLOW; + + return filter->action == KVM_PMU_EVENT_DENY; +} + +static bool is_fixed_event_allowed(struct kvm_pmu_event_filter *filter, int idx) +{ + int fixed_idx = idx - INTEL_PMC_IDX_FIXED; + + if (filter->action == KVM_PMU_EVENT_DENY && + test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap)) + return false; + if (filter->action == KVM_PMU_EVENT_ALLOW && + !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap)) + return false; + + return true; +} + static bool check_pmu_event_filter(struct kvm_pmc *pmc) { struct kvm_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; - bool allow_event = true; - __u64 key; - int idx; if (!static_call(kvm_x86_pmu_hw_event_available)(pmc)) return false; filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); if (!filter) - goto out; + return true; - if (pmc_is_gp(pmc)) { - key = pmc->eventsel & (kvm_pmu_ops.EVENTSEL_EVENT | - ARCH_PERFMON_EVENTSEL_UMASK); - if (bsearch(&key, filter->events, filter->nevents, - sizeof(__u64), cmp_u64)) - allow_event = filter->action == KVM_PMU_EVENT_ALLOW; - else - allow_event = filter->action == KVM_PMU_EVENT_DENY; - } else { - idx = pmc->idx - INTEL_PMC_IDX_FIXED; - if (filter->action == KVM_PMU_EVENT_DENY && - test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - allow_event = false; - if (filter->action == KVM_PMU_EVENT_ALLOW && - !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - allow_event = false; - } + if (pmc_is_gp(pmc)) + return is_gp_event_allowed(filter, pmc->eventsel); -out: - return allow_event; + return is_fixed_event_allowed(filter, pmc->idx); } static void reprogram_counter(struct kvm_pmc *pmc) -- cgit v1.2.3 From 14329b825ffb7f2710c13fdcc37fc2e7c67b6781 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Tue, 20 Dec 2022 16:12:33 +0000 Subject: KVM: x86/pmu: Introduce masked events to the pmu event filter When building a list of filter events, it can sometimes be a challenge to fit all the events needed to adequately restrict the guest into the limited space available in the pmu event filter. This stems from the fact that the pmu event filter requires each event (i.e. event select + unit mask) be listed, when the intention might be to restrict the event select all together, regardless of it's unit mask. Instead of increasing the number of filter events in the pmu event filter, add a new encoding that is able to do a more generalized match on the unit mask. Introduce masked events as another encoding the pmu event filter understands. Masked events has the fields: mask, match, and exclude. When filtering based on these events, the mask is applied to the guest's unit mask to see if it matches the match value (i.e. umask & mask == match). The exclude bit can then be used to exclude events from that match. E.g. for a given event select, if it's easier to say which unit mask values shouldn't be filtered, a masked event can be set up to match all possible unit mask values, then another masked event can be set up to match the unit mask values that shouldn't be filtered. Userspace can query to see if this feature exists by looking for the capability, KVM_CAP_PMU_EVENT_MASKED_EVENTS. This feature is enabled by setting the flags field in the pmu event filter to KVM_PMU_EVENT_FLAG_MASKED_EVENTS. Events can be encoded by using KVM_PMU_ENCODE_MASKED_ENTRY(). It is an error to have a bit set outside the valid bits for a masked event, and calls to KVM_SET_PMU_EVENT_FILTER will return -EINVAL in such cases, including the high bits of the event select (35:32) if called on Intel. With these updates the filter matching code has been updated to match on a common event. Masked events were flexible enough to handle both event types, so they were used as the common event. This changes how guest events get filtered because regardless of the type of event used in the uAPI, they will be converted to masked events. Because of this there could be a slight performance hit because instead of matching the filter event with a lookup on event select + unit mask, it does a lookup on event select then walks the unit masks to find the match. This shouldn't be a big problem because I would expect the set of common event selects to be small, and if they aren't the set can likely be reduced by using masked events to generalize the unit mask. Using one type of event when filtering guest events allows for a common code path to be used. Signed-off-by: Aaron Lewis Link: https://lore.kernel.org/r/20221220161236.555143-5-aaronlewis@google.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 78 ++++++++++++++-- arch/x86/include/asm/kvm_host.h | 14 ++- arch/x86/include/uapi/asm/kvm.h | 29 ++++++ arch/x86/kvm/pmu.c | 197 ++++++++++++++++++++++++++++++++++------ arch/x86/kvm/x86.c | 1 + include/uapi/linux/kvm.h | 1 + 6 files changed, 282 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9807b05a1b57..83e3acc9e321 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5005,6 +5005,15 @@ using this ioctl. :Parameters: struct kvm_pmu_event_filter (in) :Returns: 0 on success, -1 on error +Errors: + + ====== ============================================================ + EFAULT args[0] cannot be accessed + EINVAL args[0] contains invalid data in the filter or filter events + E2BIG nevents is too large + EBUSY not enough memory to allocate the filter + ====== ============================================================ + :: struct kvm_pmu_event_filter { @@ -5016,14 +5025,69 @@ using this ioctl. __u64 events[0]; }; -This ioctl restricts the set of PMU events that the guest can program. -The argument holds a list of events which will be allowed or denied. -The eventsel+umask of each event the guest attempts to program is compared -against the events field to determine whether the guest should have access. -The events field only controls general purpose counters; fixed purpose -counters are controlled by the fixed_counter_bitmap. +This ioctl restricts the set of PMU events the guest can program by limiting +which event select and unit mask combinations are permitted. + +The argument holds a list of filter events which will be allowed or denied. + +Filter events only control general purpose counters; fixed purpose counters +are controlled by the fixed_counter_bitmap. + +Valid values for 'flags':: + +``0`` + +To use this mode, clear the 'flags' field. + +In this mode each event will contain an event select + unit mask. + +When the guest attempts to program the PMU the guest's event select + +unit mask is compared against the filter events to determine whether the +guest should have access. + +``KVM_PMU_EVENT_FLAG_MASKED_EVENTS`` +:Capability: KVM_CAP_PMU_EVENT_MASKED_EVENTS + +In this mode each filter event will contain an event select, mask, match, and +exclude value. To encode a masked event use:: + + KVM_PMU_ENCODE_MASKED_ENTRY() + +An encoded event will follow this layout:: + + Bits Description + ---- ----------- + 7:0 event select (low bits) + 15:8 umask match + 31:16 unused + 35:32 event select (high bits) + 36:54 unused + 55 exclude bit + 63:56 umask mask + +When the guest attempts to program the PMU, these steps are followed in +determining if the guest should have access: + + 1. Match the event select from the guest against the filter events. + 2. If a match is found, match the guest's unit mask to the mask and match + values of the included filter events. + I.e. (unit mask & mask) == match && !exclude. + 3. If a match is found, match the guest's unit mask to the mask and match + values of the excluded filter events. + I.e. (unit mask & mask) == match && exclude. + 4. + a. If an included match is found and an excluded match is not found, filter + the event. + b. For everything else, do not filter the event. + 5. + a. If the event is filtered and it's an allow list, allow the guest to + program the event. + b. If the event is filtered and it's a deny list, do not allow the guest to + program the event. -No flags are defined yet, the field must be zero. +When setting a new pmu event filter, -EINVAL will be returned if any of the +unused fields are set or if any of the high bits (35:32) in the event +select are set when called on Intel. Valid values for 'action':: diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4d2bc08794e4..cd0151e6af62 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1145,6 +1145,18 @@ struct kvm_x86_msr_filter { struct msr_bitmap_range ranges[16]; }; +struct kvm_x86_pmu_event_filter { + __u32 action; + __u32 nevents; + __u32 fixed_counter_bitmap; + __u32 flags; + __u32 nr_includes; + __u32 nr_excludes; + __u64 *includes; + __u64 *excludes; + __u64 events[]; +}; + enum kvm_apicv_inhibit { /********************************************************************/ @@ -1363,7 +1375,7 @@ struct kvm_arch { /* Guest can access the SGX PROVISIONKEY. */ bool sgx_provisioning_allowed; - struct kvm_pmu_event_filter __rcu *pmu_event_filter; + struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter; struct task_struct *nx_huge_page_recovery_thread; #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index e48deab8901d..f142f3ebf4e4 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -525,6 +525,35 @@ struct kvm_pmu_event_filter { #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 +#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0) +#define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS) + +/* + * Masked event layout. + * Bits Description + * ---- ----------- + * 7:0 event select (low bits) + * 15:8 umask match + * 31:16 unused + * 35:32 event select (high bits) + * 36:54 unused + * 55 exclude bit + * 63:56 umask mask + */ + +#define KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, exclude) \ + (((event_select) & 0xFFULL) | (((event_select) & 0XF00ULL) << 24) | \ + (((mask) & 0xFFULL) << 56) | \ + (((match) & 0xFFULL) << 8) | \ + ((__u64)(!!(exclude)) << 55)) + +#define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \ + (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8)) +#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56) + /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 8a734f4343bb..3264f8e0e8ef 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -255,30 +255,99 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } -static int cmp_u64(const void *pa, const void *pb) +static int filter_cmp(const void *pa, const void *pb, u64 mask) { - u64 a = *(u64 *)pa; - u64 b = *(u64 *)pb; + u64 a = *(u64 *)pa & mask; + u64 b = *(u64 *)pb & mask; return (a > b) - (a < b); } -static u64 *find_filter_entry(struct kvm_pmu_event_filter *filter, u64 key) + +static int filter_sort_cmp(const void *pa, const void *pb) +{ + return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT | + KVM_PMU_MASKED_ENTRY_EXCLUDE)); +} + +/* + * For the event filter, searching is done on the 'includes' list and + * 'excludes' list separately rather than on the 'events' list (which + * has both). As a result the exclude bit can be ignored. + */ +static int filter_event_cmp(const void *pa, const void *pb) +{ + return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT)); +} + +static int find_filter_index(u64 *events, u64 nevents, u64 key) +{ + u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]), + filter_event_cmp); + + if (!fe) + return -1; + + return fe - events; +} + +static bool is_filter_entry_match(u64 filter_event, u64 umask) +{ + u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8); + u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH; + + BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >> + (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) != + ARCH_PERFMON_EVENTSEL_UMASK); + + return (umask & mask) == match; +} + +static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel) { - return bsearch(&key, filter->events, filter->nevents, - sizeof(filter->events[0]), cmp_u64); + u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT; + u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK; + int i, index; + + index = find_filter_index(events, nevents, event_select); + if (index < 0) + return false; + + /* + * Entries are sorted by the event select. Walk the list in both + * directions to process all entries with the targeted event select. + */ + for (i = index; i < nevents; i++) { + if (filter_event_cmp(&events[i], &event_select)) + break; + + if (is_filter_entry_match(events[i], umask)) + return true; + } + + for (i = index - 1; i >= 0; i--) { + if (filter_event_cmp(&events[i], &event_select)) + break; + + if (is_filter_entry_match(events[i], umask)) + return true; + } + + return false; } -static bool is_gp_event_allowed(struct kvm_pmu_event_filter *filter, u64 eventsel) +static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f, + u64 eventsel) { - if (find_filter_entry(filter, eventsel & (kvm_pmu_ops.EVENTSEL_EVENT | - ARCH_PERFMON_EVENTSEL_UMASK))) - return filter->action == KVM_PMU_EVENT_ALLOW; + if (filter_contains_match(f->includes, f->nr_includes, eventsel) && + !filter_contains_match(f->excludes, f->nr_excludes, eventsel)) + return f->action == KVM_PMU_EVENT_ALLOW; - return filter->action == KVM_PMU_EVENT_DENY; + return f->action == KVM_PMU_EVENT_DENY; } -static bool is_fixed_event_allowed(struct kvm_pmu_event_filter *filter, int idx) +static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter, + int idx) { int fixed_idx = idx - INTEL_PMC_IDX_FIXED; @@ -294,7 +363,7 @@ static bool is_fixed_event_allowed(struct kvm_pmu_event_filter *filter, int idx) static bool check_pmu_event_filter(struct kvm_pmc *pmc) { - struct kvm_pmu_event_filter *filter; + struct kvm_x86_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; if (!static_call(kvm_x86_pmu_hw_event_available)(pmc)) @@ -604,60 +673,128 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) } EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); -static void remove_impossible_events(struct kvm_pmu_event_filter *filter) +static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter) +{ + u64 mask = kvm_pmu_ops.EVENTSEL_EVENT | + KVM_PMU_MASKED_ENTRY_UMASK_MASK | + KVM_PMU_MASKED_ENTRY_UMASK_MATCH | + KVM_PMU_MASKED_ENTRY_EXCLUDE; + int i; + + for (i = 0; i < filter->nevents; i++) { + if (filter->events[i] & ~mask) + return false; + } + + return true; +} + +static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter) { int i, j; for (i = 0, j = 0; i < filter->nevents; i++) { + /* + * Skip events that are impossible to match against a guest + * event. When filtering, only the event select + unit mask + * of the guest event is used. To maintain backwards + * compatibility, impossible filters can't be rejected :-( + */ if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK)) continue; - - filter->events[j++] = filter->events[i]; + /* + * Convert userspace events to a common in-kernel event so + * only one code path is needed to support both events. For + * the in-kernel events use masked events because they are + * flexible enough to handle both cases. To convert to masked + * events all that's needed is to add an "all ones" umask_mask, + * (unmasked filter events don't support EXCLUDE). + */ + filter->events[j++] = filter->events[i] | + (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT); } filter->nevents = j; } +static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter) +{ + int i; + + if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS)) + convert_to_masked_filter(filter); + else if (!is_masked_filter_valid(filter)) + return -EINVAL; + + /* + * Sort entries by event select and includes vs. excludes so that all + * entries for a given event select can be processed efficiently during + * filtering. The EXCLUDE flag uses a more significant bit than the + * event select, and so the sorted list is also effectively split into + * includes and excludes sub-lists. + */ + sort(&filter->events, filter->nevents, sizeof(filter->events[0]), + filter_sort_cmp, NULL); + + i = filter->nevents; + /* Find the first EXCLUDE event (only supported for masked events). */ + if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) { + for (i = 0; i < filter->nevents; i++) { + if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE) + break; + } + } + + filter->nr_includes = i; + filter->nr_excludes = filter->nevents - filter->nr_includes; + filter->includes = filter->events; + filter->excludes = filter->events + filter->nr_includes; + + return 0; +} + int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) { - struct kvm_pmu_event_filter tmp, *filter; + struct kvm_pmu_event_filter __user *user_filter = argp; + struct kvm_x86_pmu_event_filter *filter; + struct kvm_pmu_event_filter tmp; struct kvm_vcpu *vcpu; unsigned long i; size_t size; int r; - if (copy_from_user(&tmp, argp, sizeof(tmp))) + if (copy_from_user(&tmp, user_filter, sizeof(tmp))) return -EFAULT; if (tmp.action != KVM_PMU_EVENT_ALLOW && tmp.action != KVM_PMU_EVENT_DENY) return -EINVAL; - if (tmp.flags != 0) + if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK) return -EINVAL; if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS) return -E2BIG; size = struct_size(filter, events, tmp.nevents); - filter = kmalloc(size, GFP_KERNEL_ACCOUNT); + filter = kzalloc(size, GFP_KERNEL_ACCOUNT); if (!filter) return -ENOMEM; + filter->action = tmp.action; + filter->nevents = tmp.nevents; + filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap; + filter->flags = tmp.flags; + r = -EFAULT; - if (copy_from_user(filter, argp, size)) + if (copy_from_user(filter->events, user_filter->events, + sizeof(filter->events[0]) * filter->nevents)) goto cleanup; - /* Restore the verified state to guard against TOCTOU attacks. */ - *filter = tmp; - - remove_impossible_events(filter); - - /* - * Sort the in-kernel list so that we can search it with bsearch. - */ - sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL); + r = prepare_filter_lists(filter); + if (r) + goto cleanup; mutex_lock(&kvm->lock); filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 508074e47bc0..da02a08e21b5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4406,6 +4406,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_PMU_EVENT_FILTER: + case KVM_CAP_PMU_EVENT_MASKED_EVENTS: case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 55155e262646..76156e372f9c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1175,6 +1175,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224 #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225 +#define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 5f6015b26ba2e0352c17b9034f1f62c653170713 Mon Sep 17 00:00:00 2001 From: zhang songyi Date: Mon, 28 Nov 2022 20:03:38 +0800 Subject: KVM: SVM: remove redundant ret variable Return value from svm_nmi_blocked() directly instead of taking this in another redundant variable. Signed-off-by: zhang songyi Link: https://lore.kernel.org/r/202211282003389362484@zte.com.cn Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d13cf53e7390..f5fd44fa113f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3583,7 +3583,6 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - bool ret; if (!gif_set(svm)) return true; @@ -3591,10 +3590,8 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) return false; - ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || - (vcpu->arch.hflags & HF_NMI_MASK); - - return ret; + return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || + (vcpu->arch.hflags & HF_NMI_MASK); } static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) -- cgit v1.2.3 From 36b0256789a72f4d9d173f1a37d6fcb5ded4c244 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 9 Nov 2022 19:59:52 +0800 Subject: KVM: svm/avic: Drop "struct kvm_x86_ops" for avic_hardware_setup() Even in commit 4bdec12aa8d6 ("KVM: SVM: Detect X2APIC virtualization (x2AVIC) support"), where avic_hardware_setup() was first introduced, its only pass-in parameter "struct kvm_x86_ops *ops" is not used at all. Clean it up a bit to avoid compiler ranting from LLVM toolchain. Signed-off-by: Like Xu Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221109115952.92816-1-likexu@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 2 +- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/svm/svm.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index b3928150a37c..ca684979e90d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1120,7 +1120,7 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) * - Hypervisor can support both xAVIC and x2AVIC in the same guest. * - The mode can be switched at run-time. */ -bool avic_hardware_setup(struct kvm_x86_ops *x86_ops) +bool avic_hardware_setup(void) { if (!npt_enabled) return false; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f5fd44fa113f..b103fe7cbc82 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5017,7 +5017,7 @@ static __init int svm_hardware_setup(void) nrips = false; } - enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops); + enable_apicv = avic = avic && avic_hardware_setup(); if (!enable_apicv) { svm_x86_ops.vcpu_blocking = NULL; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 41eabb098b13..a7073802450f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -637,7 +637,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \ ) -bool avic_hardware_setup(struct kvm_x86_ops *ops); +bool avic_hardware_setup(void); int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); -- cgit v1.2.3 From a31b531cd2fa2e6b2a736833a94b990d5cf56b40 Mon Sep 17 00:00:00 2001 From: Anish Ghulati Date: Fri, 13 Jan 2023 22:09:23 +0000 Subject: KVM: SVM: Account scratch allocations used to decrypt SEV guest memory Account the temp/scratch allocation used to decrypt unaligned debug accesses to SEV guest memory, the allocation is very much tied to the target VM. Reported-by: Mingwei Zhang Signed-off-by: Anish Ghulati Link: https://lore.kernel.org/r/20230113220923.2834699-1-aghulati@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 273cba809328..a5e4c5ef7c9e 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -813,7 +813,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, if (!IS_ALIGNED(dst_paddr, 16) || !IS_ALIGNED(paddr, 16) || !IS_ALIGNED(size, 16)) { - tpage = (void *)alloc_page(GFP_KERNEL | __GFP_ZERO); + tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!tpage) return -ENOMEM; -- cgit v1.2.3 From 5a7a64779e7ad21f71824faee234b33654df57bb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 19 Nov 2022 00:37:47 +0000 Subject: KVM: VMX: Access @flags as a 32-bit value in __vmx_vcpu_run() Access @flags using 32-bit operands when saving and testing @flags for VMX_RUN_VMRESUME, as using 8-bit operands is unnecessarily fragile due to relying on VMX_RUN_VMRESUME being in bits 0-7. The behavior of treating @flags a single byte is a holdover from when the param was "bool launched", i.e. is not deliberate. Cc: Alexey Dobriyan Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20221119003747.2615229-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmenter.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 766c6b3ef5ed..cd2f75360bf3 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -69,8 +69,8 @@ SYM_FUNC_START(__vmx_vcpu_run) */ push %_ASM_ARG2 - /* Copy @flags to BL, _ASM_ARG3 is volatile. */ - mov %_ASM_ARG3B, %bl + /* Copy @flags to EBX, _ASM_ARG3 is volatile. */ + mov %_ASM_ARG3L, %ebx lea (%_ASM_SP), %_ASM_ARG2 call vmx_update_host_rsp @@ -106,7 +106,7 @@ SYM_FUNC_START(__vmx_vcpu_run) mov (%_ASM_SP), %_ASM_AX /* Check if vmlaunch or vmresume is needed */ - testb $VMX_RUN_VMRESUME, %bl + test $VMX_RUN_VMRESUME, %ebx /* Load guest registers. Don't clobber flags. */ mov VCPU_RCX(%_ASM_AX), %_ASM_CX @@ -128,7 +128,7 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX - /* Check EFLAGS.ZF from 'testb' above */ + /* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */ jz .Lvmlaunch /* -- cgit v1.2.3 From e8733482f59e23a8bf79526b7f83ed0d0ff5a9b2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 18 Nov 2022 20:05:21 +0300 Subject: KVM: VMX: don't use "unsigned long" in vmx_vcpu_enter_exit() __vmx_vcpu_run_flags() returns "unsigned int" and uses only 2 bits of it so using "unsigned long" is very much pointless. Furthermore, __vmx_vcpu_run() and vmx_spec_ctrl_restore_host() take an "unsigned int", i.e. actually relying on an "unsigned long" value won't work. Signed-off-by: Alexey Dobriyan Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/Y3e7UW0WNV2AZmsZ@p183 Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c788aa382611..506c324d46b1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7172,7 +7172,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx, - unsigned long flags) + unsigned int flags) { guest_state_enter_irqoff(); -- cgit v1.2.3 From fc9465be8aad2042978590d44c01350534c1ac11 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Dec 2022 06:09:06 +0000 Subject: KVM: x86: Make vmx_get_exit_qual() and vmx_get_intr_info() noinstr-friendly Add an extra special noinstr-friendly helper to test+mark a "register" available and use it when caching vmcs.EXIT_QUALIFICATION and vmcs.VM_EXIT_INTR_INFO. Make the caching helpers __always_inline too so that they can be used in noinstr functions. A future fix will move VMX's handling of NMI exits into the noinstr vmx_vcpu_enter_exit() so that the NMI is processed before any kind of instrumentation can trigger a fault and thus IRET, i.e. so that KVM doesn't invoke the NMI handler with NMIs enabled. Cc: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221213060912.654668-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/kvm_cache_regs.h | 12 ++++++++++++ arch/x86/kvm/vmx/vmx.h | 14 ++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index c09174f73a34..4c91f626c058 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -75,6 +75,18 @@ static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); } +/* + * kvm_register_test_and_mark_available() is a special snowflake that uses an + * arch bitop directly to avoid the explicit instrumentation that comes with + * the generic bitops. This allows code that cannot be instrumented (noinstr + * functions), e.g. the low level VM-Enter/VM-Exit paths, to cache registers. + */ +static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + return arch___test_and_set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + /* * The "raw" register helpers are only for cases where the full 64 bits of a * register are read/written irrespective of current vCPU mode. In other words, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a3da84f4ea45..bb720a2f11ab 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -669,25 +669,23 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu); -static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) +static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) { - kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - } + return vmx->exit_qualification; } -static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) +static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) { - kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - } + return vmx->exit_intr_info; } -- cgit v1.2.3 From 8578f59657c505982e1d05232272c6bf304cf8aa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Dec 2022 06:09:07 +0000 Subject: KVM: VMX: Allow VM-Fail path of VMREAD helper to be instrumented Allow instrumentation in the VM-Fail path of __vmcs_readl() so that the helper can be used in noinstr functions, e.g. to get the exit reason in vmx_vcpu_enter_exit() in order to handle NMI VM-Exits in the noinstr section. While allowing instrumentation isn't technically safe, KVM has much bigger problems if VMREAD fails in a noinstr section. Note, all other VMX instructions also allow instrumentation in their VM-Fail paths for similar reasons, VMREAD was simply omitted by commit 3ebccdf373c2 ("x86/kvm/vmx: Move guest enter/exit into .noinstr.text") because VMREAD wasn't used in a noinstr section at the time. Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221213060912.654668-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx_ops.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index a5282014616c..db95bde52998 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -100,8 +100,10 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) return value; do_fail: + instrumentation_begin(); WARN_ONCE(1, KBUILD_MODNAME ": vmread failed: field=%lx\n", field); pr_warn_ratelimited(KBUILD_MODNAME ": vmread failed: field=%lx\n", field); + instrumentation_end(); return 0; do_exception: -- cgit v1.2.3 From 11633f69506d038120925691626f2851203af241 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Dec 2022 06:09:08 +0000 Subject: KVM: VMX: Always inline eVMCS read/write helpers Tag all evmcs_{read,write}() helpers __always_inline so that they can be freely used in noinstr sections, e.g. to get the VM-Exit reason in vcpu_vmx_enter_exit() (in a future patch). For consistency and to avoid more spot fixes in the future, e.g. see commit 010050a86393 ("x86/kvm: Always inline evmcs_write64()"), tag all accessors even though evmcs_read32() is the only anticipated use case in the near future. In practice, non-KASAN builds are all but guaranteed to inline the helpers anyways. vmlinux.o: warning: objtool: vmx_vcpu_enter_exit+0x107: call to evmcs_read32() leaves .noinstr.text section Reported-by: kernel test robot Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221213060912.654668-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/hyperv.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index ab08a9b9ab7d..caf658726169 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -196,7 +196,7 @@ static __always_inline void evmcs_write64(unsigned long field, u64 value) current_evmcs->hv_clean_fields &= ~clean_field; } -static inline void evmcs_write32(unsigned long field, u32 value) +static __always_inline void evmcs_write32(unsigned long field, u32 value) { u16 clean_field; int offset = get_evmcs_offset(field, &clean_field); @@ -208,7 +208,7 @@ static inline void evmcs_write32(unsigned long field, u32 value) current_evmcs->hv_clean_fields &= ~clean_field; } -static inline void evmcs_write16(unsigned long field, u16 value) +static __always_inline void evmcs_write16(unsigned long field, u16 value) { u16 clean_field; int offset = get_evmcs_offset(field, &clean_field); @@ -220,7 +220,7 @@ static inline void evmcs_write16(unsigned long field, u16 value) current_evmcs->hv_clean_fields &= ~clean_field; } -static inline u64 evmcs_read64(unsigned long field) +static __always_inline u64 evmcs_read64(unsigned long field) { int offset = get_evmcs_offset(field, NULL); @@ -230,7 +230,7 @@ static inline u64 evmcs_read64(unsigned long field) return *(u64 *)((char *)current_evmcs + offset); } -static inline u32 evmcs_read32(unsigned long field) +static __always_inline u32 evmcs_read32(unsigned long field) { int offset = get_evmcs_offset(field, NULL); @@ -240,7 +240,7 @@ static inline u32 evmcs_read32(unsigned long field) return *(u32 *)((char *)current_evmcs + offset); } -static inline u16 evmcs_read16(unsigned long field) +static __always_inline u16 evmcs_read16(unsigned long field) { int offset = get_evmcs_offset(field, NULL); @@ -274,11 +274,11 @@ static inline void evmcs_load(u64 phys_addr) void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ static __always_inline void evmcs_write64(unsigned long field, u64 value) {} -static inline void evmcs_write32(unsigned long field, u32 value) {} -static inline void evmcs_write16(unsigned long field, u16 value) {} -static inline u64 evmcs_read64(unsigned long field) { return 0; } -static inline u32 evmcs_read32(unsigned long field) { return 0; } -static inline u16 evmcs_read16(unsigned long field) { return 0; } +static __always_inline void evmcs_write32(unsigned long field, u32 value) {} +static __always_inline void evmcs_write16(unsigned long field, u16 value) {} +static __always_inline u64 evmcs_read64(unsigned long field) { return 0; } +static __always_inline u32 evmcs_read32(unsigned long field) { return 0; } +static __always_inline u16 evmcs_read16(unsigned long field) { return 0; } static inline void evmcs_load(u64 phys_addr) {} static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ -- cgit v1.2.3 From 432727f1cb6e35cb3416d5aeea3dd281e460cbab Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Dec 2022 06:09:09 +0000 Subject: KVM: VMX: Always inline to_vmx() and to_kvm_vmx() Tag to_vmx() and to_kvm_vmx() __always_inline as they both just reflect the passed in pointer (the embedded struct is the first field in the container), and drop the @vmx param from vmx_vcpu_enter_exit(), which likely existed purely to make noinstr validation happy. Amusingly, when the compiler decides to not inline the helpers, e.g. for KASAN builds, to_vmx() and to_kvm_vmx() may end up pointing at the same symbol, which generates very confusing objtool warnings. E.g. the use of to_vmx() in a future patch led to objtool complaining about to_kvm_vmx(), and only once all use of to_kvm_vmx() was commented out did to_vmx() pop up in the obj tool report. vmlinux.o: warning: objtool: vmx_vcpu_enter_exit+0x160: call to to_kvm_vmx() leaves .noinstr.text section Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221213060912.654668-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 5 +++-- arch/x86/kvm/vmx/vmx.h | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 506c324d46b1..59a051350f8f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7171,9 +7171,10 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) } static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, - struct vcpu_vmx *vmx, unsigned int flags) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + guest_state_enter_irqoff(); /* L1D Flush includes CPU buffer clear to mitigate MDS */ @@ -7291,7 +7292,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_wait_lapic_expire(vcpu); /* The actual VMENTER/EXIT is in the .noinstr.text section. */ - vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx)); + vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); /* All fields are clean at this point */ if (static_branch_unlikely(&enable_evmcs)) { diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index bb720a2f11ab..2acdc54bc34b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -640,12 +640,12 @@ BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) (1 << VCPU_EXREG_EXIT_INFO_1) | \ (1 << VCPU_EXREG_EXIT_INFO_2)) -static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) +static __always_inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { return container_of(kvm, struct kvm_vmx, kvm); } -static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) +static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_vmx, vcpu); } -- cgit v1.2.3 From 54a3b70a75dcde8173e3d8ccc60f9ecd7af7b5f2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Dec 2022 06:09:10 +0000 Subject: x86/entry: KVM: Use dedicated VMX NMI entry for 32-bit kernels too Use a dedicated entry for invoking the NMI handler from KVM VMX's VM-Exit path for 32-bit even though using a dedicated entry for 32-bit isn't strictly necessary. Exposing a single symbol will allow KVM to reference the entry point in assembly code without having to resort to more #ifdefs (or #defines). identry.h is intended to be included from asm files only once, and so simply including idtentry.h in KVM assembly isn't an option. Bypassing the ESP fixup and CR3 switching in the standard NMI entry code is safe as KVM always handles NMIs that occur in the guest on a kernel stack, with a kernel CR3. Cc: Andy Lutomirski Cc: Thomas Gleixner Reviewed-by: Lai Jiangshan Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221213060912.654668-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/idtentry.h | 16 ++++++---------- arch/x86/kernel/nmi.c | 8 ++++---- arch/x86/kvm/vmx/vmx.c | 4 ++-- 3 files changed, 12 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 72184b0b2219..b241af4ce9b4 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -582,18 +582,14 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check); /* NMI */ -#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) +#if IS_ENABLED(CONFIG_KVM_INTEL) /* - * Special NOIST entry point for VMX which invokes this on the kernel - * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI - * 'executing' marker. - * - * On 32bit this just uses the regular NMI entry point because 32-bit does - * not have ISTs. + * Special entry point for VMX which invokes this on the kernel stack, even for + * 64-bit, i.e. without using an IST. asm_exc_nmi() requires an IST to work + * correctly vs. the NMI 'executing' marker. Used for 32-bit kernels as well + * to avoid more ifdeffery. */ -DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist); -#else -#define asm_exc_nmi_noist asm_exc_nmi +DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_kvm_vmx); #endif DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index cec0bfa3bc04..e37faba95bb5 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -527,14 +527,14 @@ nmi_restart: mds_user_clear_cpu_buffers(); } -#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) -DEFINE_IDTENTRY_RAW(exc_nmi_noist) +#if IS_ENABLED(CONFIG_KVM_INTEL) +DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx) { exc_nmi(regs); } -#endif #if IS_MODULE(CONFIG_KVM_INTEL) -EXPORT_SYMBOL_GPL(asm_exc_nmi_noist); +EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx); +#endif #endif void stop_nmi(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 59a051350f8f..84e808af892a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6866,7 +6866,7 @@ void vmx_do_interrupt_nmi_irqoff(unsigned long entry); static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, unsigned long entry) { - bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist; + bool is_nmi = entry == (unsigned long)asm_exc_nmi_kvm_vmx; kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ); vmx_do_interrupt_nmi_irqoff(entry); @@ -6895,7 +6895,7 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { - const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist; + const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_kvm_vmx; u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ -- cgit v1.2.3 From 4f76e86f7e0dc33af14256d30177bf65de2f9cab Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Dec 2022 06:09:11 +0000 Subject: KVM: VMX: Provide separate subroutines for invoking NMI vs. IRQ handlers Split the asm subroutines for handling NMIs versus IRQs that occur in the guest so that the NMI handler can be called from a noinstr section. As a bonus, the NMI path doesn't need an indirect branch. Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221213060912.654668-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmenter.S | 70 ++++++++++++++++++++++++++-------------------- arch/x86/kvm/vmx/vmx.c | 26 ++++++++--------- 2 files changed, 50 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index cd2f75360bf3..f0519f0f738c 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -31,6 +31,39 @@ #define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE #endif +.macro VMX_DO_EVENT_IRQOFF call_insn call_target + /* + * Unconditionally create a stack frame, getting the correct RSP on the + * stack (for x86-64) would take two instructions anyways, and RBP can + * be used to restore RSP to make objtool happy (see below). + */ + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + +#ifdef CONFIG_X86_64 + /* + * Align RSP to a 16-byte boundary (to emulate CPU behavior) before + * creating the synthetic interrupt stack frame for the IRQ/NMI. + */ + and $-16, %rsp + push $__KERNEL_DS + push %rbp +#endif + pushf + push $__KERNEL_CS + \call_insn \call_target + + /* + * "Restore" RSP from RBP, even though IRET has already unwound RSP to + * the correct value. objtool doesn't know the callee will IRET and, + * without the explicit restore, thinks the stack is getting walloped. + * Using an unwind hint is problematic due to x86-64's dynamic alignment. + */ + mov %_ASM_BP, %_ASM_SP + pop %_ASM_BP + RET +.endm + .section .noinstr.text, "ax" /** @@ -320,35 +353,10 @@ SYM_FUNC_START(vmread_error_trampoline) SYM_FUNC_END(vmread_error_trampoline) #endif -SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) - /* - * Unconditionally create a stack frame, getting the correct RSP on the - * stack (for x86-64) would take two instructions anyways, and RBP can - * be used to restore RSP to make objtool happy (see below). - */ - push %_ASM_BP - mov %_ASM_SP, %_ASM_BP +SYM_FUNC_START(vmx_do_nmi_irqoff) + VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx +SYM_FUNC_END(vmx_do_nmi_irqoff) -#ifdef CONFIG_X86_64 - /* - * Align RSP to a 16-byte boundary (to emulate CPU behavior) before - * creating the synthetic interrupt stack frame for the IRQ/NMI. - */ - and $-16, %rsp - push $__KERNEL_DS - push %rbp -#endif - pushf - push $__KERNEL_CS - CALL_NOSPEC _ASM_ARG1 - - /* - * "Restore" RSP from RBP, even though IRET has already unwound RSP to - * the correct value. objtool doesn't know the callee will IRET and, - * without the explicit restore, thinks the stack is getting walloped. - * Using an unwind hint is problematic due to x86-64's dynamic alignment. - */ - mov %_ASM_BP, %_ASM_SP - pop %_ASM_BP - RET -SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff) +SYM_FUNC_START(vmx_do_interrupt_irqoff) + VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1 +SYM_FUNC_END(vmx_do_interrupt_irqoff) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 84e808af892a..1d9e3fd6584e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6861,17 +6861,8 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); } -void vmx_do_interrupt_nmi_irqoff(unsigned long entry); - -static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, - unsigned long entry) -{ - bool is_nmi = entry == (unsigned long)asm_exc_nmi_kvm_vmx; - - kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ); - vmx_do_interrupt_nmi_irqoff(entry); - kvm_after_interrupt(vcpu); -} +void vmx_do_interrupt_irqoff(unsigned long entry); +void vmx_do_nmi_irqoff(void); static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) { @@ -6895,7 +6886,6 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { - const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_kvm_vmx; u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ @@ -6908,8 +6898,11 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) else if (is_machine_check(intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - else if (is_nmi(intr_info)) - handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry); + else if (is_nmi(intr_info)) { + kvm_before_interrupt(&vmx->vcpu, KVM_HANDLING_NMI); + vmx_do_nmi_irqoff(); + kvm_after_interrupt(&vmx->vcpu); + } } static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) @@ -6922,7 +6915,10 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) "unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; - handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); + kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); + vmx_do_interrupt_irqoff(gate_offset(desc)); + kvm_after_interrupt(vcpu); + vcpu->arch.at_instruction_boundary = true; } -- cgit v1.2.3 From 11df586d774f4aab1835144fd2a8dc3cb2add8d4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Dec 2022 06:09:12 +0000 Subject: KVM: VMX: Handle NMI VM-Exits in noinstr region Move VMX's handling of NMI VM-Exits into vmx_vcpu_enter_exit() so that the NMI is handled prior to leaving the safety of noinstr. Handling the NMI after leaving noinstr exposes the kernel to potential ordering problems as an instrumentation-induced fault, e.g. #DB, #BP, #PF, etc. will unblock NMIs when IRETing back to the faulting instruction. Reported-by: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221213060912.654668-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmcs.h | 4 ++-- arch/x86/kvm/vmx/vmenter.S | 8 ++++---- arch/x86/kvm/vmx/vmx.c | 34 +++++++++++++++++++++------------- arch/x86/kvm/x86.h | 6 +++--- 4 files changed, 30 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index ac290a44a693..7c1996b433e2 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -75,7 +75,7 @@ struct loaded_vmcs { struct vmcs_controls_shadow controls_shadow; }; -static inline bool is_intr_type(u32 intr_info, u32 type) +static __always_inline bool is_intr_type(u32 intr_info, u32 type) { const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK; @@ -146,7 +146,7 @@ static inline bool is_icebp(u32 intr_info) return is_intr_type(intr_info, INTR_TYPE_PRIV_SW_EXCEPTION); } -static inline bool is_nmi(u32 intr_info) +static __always_inline bool is_nmi(u32 intr_info) { return is_intr_type(intr_info, INTR_TYPE_NMI_INTR); } diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index f0519f0f738c..f550540ed54e 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -299,6 +299,10 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) SYM_FUNC_END(__vmx_vcpu_run) +SYM_FUNC_START(vmx_do_nmi_irqoff) + VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx +SYM_FUNC_END(vmx_do_nmi_irqoff) + .section .text, "ax" @@ -353,10 +357,6 @@ SYM_FUNC_START(vmread_error_trampoline) SYM_FUNC_END(vmread_error_trampoline) #endif -SYM_FUNC_START(vmx_do_nmi_irqoff) - VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx -SYM_FUNC_END(vmx_do_nmi_irqoff) - SYM_FUNC_START(vmx_do_interrupt_irqoff) VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1 SYM_FUNC_END(vmx_do_interrupt_irqoff) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1d9e3fd6584e..664994e3e909 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5170,8 +5170,13 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) vect_info = vmx->idt_vectoring_info; intr_info = vmx_get_intr_info(vcpu); + /* + * Machine checks are handled by handle_exception_irqoff(), or by + * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by + * vmx_vcpu_enter_exit(). + */ if (is_machine_check(intr_info) || is_nmi(intr_info)) - return 1; /* handled by handle_exception_nmi_irqoff() */ + return 1; /* * Queue the exception here instead of in handle_nm_fault_irqoff(). @@ -6884,7 +6889,7 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } -static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) +static void handle_exception_irqoff(struct vcpu_vmx *vmx) { u32 intr_info = vmx_get_intr_info(&vmx->vcpu); @@ -6897,12 +6902,6 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) /* Handle machine checks before interrupts are enabled */ else if (is_machine_check(intr_info)) kvm_machine_check(); - /* We need to handle NMIs before interrupts are enabled */ - else if (is_nmi(intr_info)) { - kvm_before_interrupt(&vmx->vcpu, KVM_HANDLING_NMI); - vmx_do_nmi_irqoff(); - kvm_after_interrupt(&vmx->vcpu); - } } static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) @@ -6932,7 +6931,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) - handle_exception_nmi_irqoff(vmx); + handle_exception_irqoff(vmx); } /* @@ -7194,6 +7193,18 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vmx_enable_fb_clear(vmx); + if (unlikely(vmx->fail)) + vmx->exit_reason.full = 0xdead; + else + vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); + + if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && + is_nmi(vmx_get_intr_info(vcpu))) { + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + vmx_do_nmi_irqoff(); + kvm_after_interrupt(vcpu); + } + guest_state_exit_irqoff(); } @@ -7335,12 +7346,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->idt_vectoring_info = 0; - if (unlikely(vmx->fail)) { - vmx->exit_reason.full = 0xdead; + if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; - } - vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 9de72586f406..44d1827f0a30 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -382,13 +382,13 @@ enum kvm_intr_type { KVM_HANDLING_NMI, }; -static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu, - enum kvm_intr_type intr) +static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu, + enum kvm_intr_type intr) { WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr); } -static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) +static __always_inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) { WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0); } -- cgit v1.2.3 From 2eb398df77a1bae6839df39193f6bca35ea65f87 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Wed, 16 Nov 2022 17:18:43 +0800 Subject: KVM: x86: Replace IS_ERR() with IS_ERR_VALUE() Avoid type casts that are needed for IS_ERR() and use IS_ERR_VALUE() instead. Signed-off-by: ye xingchen Link: https://lore.kernel.org/r/202211161718436948912@zte.com.cn Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 65b401e94cec..d7d5bca00294 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12274,7 +12274,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, */ hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0); - if (IS_ERR((void *)hva)) + if (IS_ERR_VALUE(hva)) return (void __user *)hva; } else { if (!slot || !slot->npages) -- cgit v1.2.3 From 2a3003e9507c0315e0642247230899485c488ff8 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 5 Dec 2022 20:20:48 +0800 Subject: KVM: x86/pmu: Drop event_type and rename "struct kvm_event_hw_type_mapping" After commit ("02791a5c362b KVM: x86/pmu: Use PERF_TYPE_RAW to merge reprogram_{gp,fixed}counter()"), vPMU starts to directly use the hardware event eventsel and unit_mask to reprogram perf_event, and the event_type field in the "struct kvm_event_hw_type_mapping" is simply no longer being used. Convert the struct into an anonymous struct as the current name is obsolete as the structure no longer has any mapping semantics, and placing the struct definition directly above its sole user makes its easier to understand what the array is filling in. Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20221205122048.16023-1-likexu@tencent.com [sean: drop new comment, use anonymous struct] Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.h | 6 ------ arch/x86/kvm/vmx/pmu_intel.c | 21 ++++++++++++--------- 2 files changed, 12 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 30bfccc6df60..080a3bbbeda3 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -18,12 +18,6 @@ #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 -struct kvm_event_hw_type_mapping { - u8 eventsel; - u8 unit_mask; - unsigned event_type; -}; - struct kvm_pmu_ops { bool (*hw_event_available)(struct kvm_pmc *pmc); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 7980fda3978d..ef24d254cf06 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -22,16 +22,19 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) -static struct kvm_event_hw_type_mapping intel_arch_events[] = { - [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, - [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, - [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, - [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, +static struct { + u8 eventsel; + u8 unit_mask; +} const intel_arch_events[] = { + [0] = { 0x3c, 0x00 }, + [1] = { 0xc0, 0x00 }, + [2] = { 0x3c, 0x01 }, + [3] = { 0x2e, 0x4f }, + [4] = { 0x2e, 0x41 }, + [5] = { 0xc4, 0x00 }, + [6] = { 0xc5, 0x00 }, /* The above index must match CPUID 0x0A.EBX bit vector */ - [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, + [7] = { 0x00, 0x03 }, }; /* mapping between fixed pmc index and intel_arch_events array */ -- cgit v1.2.3 From 8911ce66697e04ce7e92753dbf5645f748e27e12 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 24 Jan 2023 23:49:00 +0000 Subject: KVM: x86/pmu: Cap kvm_pmu_cap.num_counters_gp at KVM's internal max Limit kvm_pmu_cap.num_counters_gp during kvm_init_pmu_capability() based on the vendor PMU capabilities so that consuming num_counters_gp naturally does the right thing. This fixes a mostly theoretical bug where KVM could over-report its PMU support in KVM_GET_SUPPORTED_CPUID for leaf 0xA, e.g. if the number of counters reported by perf is greater than KVM's hardcoded internal limit. Incorporating input from the AMD PMU also avoids over-reporting MSRs to save when running on AMD. Link: https://lore.kernel.org/r/20230124234905.3774678-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.h | 5 ++++- arch/x86/kvm/svm/pmu.c | 1 + arch/x86/kvm/vmx/pmu_intel.c | 1 + arch/x86/kvm/x86.c | 6 +++--- 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 080a3bbbeda3..79988dafb15b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -36,6 +36,7 @@ struct kvm_pmu_ops { void (*cleanup)(struct kvm_vcpu *vcpu); const u64 EVENTSEL_EVENT; + const int MAX_NR_GP_COUNTERS; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); @@ -157,7 +158,7 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) extern struct x86_pmu_capability kvm_pmu_cap; -static inline void kvm_init_pmu_capability(void) +static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) { bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; @@ -176,6 +177,8 @@ static inline void kvm_init_pmu_capability(void) } kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, + pmu_ops->MAX_NR_GP_COUNTERS); kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, KVM_PMC_MAX_FIXED); } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 5da8c292e3e3..cc77a0681800 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -232,4 +232,5 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .init = amd_pmu_init, .reset = amd_pmu_reset, .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, + .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, }; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index ef24d254cf06..e8a3be0b9df9 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -815,4 +815,5 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .deliver_pmi = intel_pmu_deliver_pmi, .cleanup = intel_pmu_cleanup, .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, + .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index da02a08e21b5..ad95ce92a154 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7061,12 +7061,12 @@ static void kvm_init_msr_list(void) break; case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) + kvm_pmu_cap.num_counters_gp) continue; break; case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) + kvm_pmu_cap.num_counters_gp) continue; break; case MSR_IA32_XFD: @@ -9386,7 +9386,7 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); - kvm_init_pmu_capability(); + kvm_init_pmu_capability(ops->pmu_ops); r = ops->hardware_setup(); if (r != 0) -- cgit v1.2.3 From e76ae52747a82a548742107b4100e90da41a624d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 24 Jan 2023 23:49:01 +0000 Subject: KVM: x86/pmu: Gate all "unimplemented MSR" prints on report_ignored_msrs Add helpers to print unimplemented MSR accesses and condition all such prints on report_ignored_msrs, i.e. honor userspace's request to not print unimplemented MSRs. Even though vcpu_unimpl() is ratelimited, printing can still be problematic, e.g. if a print gets stalled when host userspace is writing MSRs during live migration, an effective stall can result in very noticeable disruption in the guest. E.g. the profile below was taken while calling KVM_SET_MSRS on the PMU counters while the PMU was disabled in KVM. - 99.75% 0.00% [.] __ioctl - __ioctl - 99.74% entry_SYSCALL_64_after_hwframe do_syscall_64 sys_ioctl - do_vfs_ioctl - 92.48% kvm_vcpu_ioctl - kvm_arch_vcpu_ioctl - 85.12% kvm_set_msr_ignored_check svm_set_msr kvm_set_msr_common printk vprintk_func vprintk_default vprintk_emit console_unlock call_console_drivers univ8250_console_write serial8250_console_write uart_console_write Reported-by: Aaron Lewis Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20230124234905.3774678-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/hyperv.c | 10 ++++------ arch/x86/kvm/svm/svm.c | 5 ++--- arch/x86/kvm/vmx/vmx.c | 4 +--- arch/x86/kvm/x86.c | 18 +++++------------- arch/x86/kvm/x86.h | 12 ++++++++++++ 5 files changed, 24 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 71aff0edc0ed..3eb8caf87ee4 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1430,8 +1430,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_set_msr(vcpu, msr, data, host); default: - vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } return 0; @@ -1552,8 +1551,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; break; default: - vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } @@ -1608,7 +1606,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_get_msr(vcpu, msr, pdata, host); default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } @@ -1673,7 +1671,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, data = APIC_BUS_FREQUENCY; break; default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } *pdata = data; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d13cf53e7390..dd21e8b1a259 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3015,8 +3015,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_DEBUGCTLMSR: if (!lbrv) { - vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", - __func__, data); + kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; } if (data & DEBUGCTL_RESERVED_BITS) @@ -3045,7 +3044,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_CR: return svm_set_vm_cr(vcpu, data); case MSR_VM_IGNNE: - vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); + kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; case MSR_AMD64_DE_CFG: { struct kvm_msr_entry msr_entry; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c788aa382611..8f0f67c75f35 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2206,9 +2206,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); + kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ad95ce92a154..d4a610ffe2b8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3560,7 +3560,6 @@ static void record_steal_time(struct kvm_vcpu *vcpu) int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - bool pr = false; u32 msr = msr_info->index; u64 data = msr_info->data; @@ -3606,15 +3605,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data == BIT_ULL(18)) { vcpu->arch.msr_hwcr = data; } else if (data != 0) { - vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", - data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { - vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " - "0x%llx\n", data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } break; @@ -3794,16 +3791,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: - pr = true; - fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - if (pr || data != 0) - vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + if (data) + kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_K7_CLK_CTL: /* @@ -3831,9 +3825,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 9de72586f406..f3554bf05201 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -331,6 +331,18 @@ extern bool report_ignored_msrs; extern bool eager_page_split; +static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data); +} + +static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr) +{ + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr); +} + static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, -- cgit v1.2.3 From 2374b7310b662e29e3468d510bfaded60fbae99c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 24 Jan 2023 23:49:02 +0000 Subject: KVM: x86/pmu: Use separate array for defining "PMU MSRs to save" Move all potential to-be-saved PMU MSRs into a separate array so that a future patch can easily omit all PMU MSRs from the list when the PMU is disabled. No functional change intended. Link: https://lore.kernel.org/r/20230124234905.3774678-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 153 ++++++++++++++++++++++++++++------------------------- 1 file changed, 82 insertions(+), 71 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d4a610ffe2b8..9b6e1af63531 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1419,7 +1419,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); * may depend on host virtualization features rather than host cpu features. */ -static const u32 msrs_to_save_all[] = { +static const u32 msrs_to_save_base[] = { MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1436,6 +1436,10 @@ static const u32 msrs_to_save_all[] = { MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, MSR_IA32_UMWAIT_CONTROL, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, +}; + +static const u32 msrs_to_save_pmu[] = { MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, @@ -1460,11 +1464,10 @@ static const u32 msrs_to_save_all[] = { MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, - - MSR_IA32_XFD, MSR_IA32_XFD_ERR, }; -static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + + ARRAY_SIZE(msrs_to_save_pmu)]; static unsigned num_msrs_to_save; static const u32 emulated_msrs_all[] = { @@ -6994,84 +6997,92 @@ out: return r; } -static void kvm_init_msr_list(void) +static void kvm_probe_msr_to_save(u32 msr_index) { u32 dummy[2]; + + if (rdmsr_safe(msr_index, &dummy[0], &dummy[1])) + return; + + /* + * Even MSRs that are valid in the host may not be exposed to guests in + * some cases. + */ + switch (msr_index) { + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported()) + return; + break; + case MSR_TSC_AUX: + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && + !kvm_cpu_cap_has(X86_FEATURE_RDPID)) + return; + break; + case MSR_IA32_UMWAIT_CONTROL: + if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) + return; + break; + case MSR_IA32_RTIT_CTL: + case MSR_IA32_RTIT_STATUS: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) + return; + break; + case MSR_IA32_RTIT_CR3_MATCH: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) + return; + break; + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && + !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) + return; + break; + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (msr_index - MSR_IA32_RTIT_ADDR0_A >= + intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)) + return; + break; + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: + if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: + if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + return; + break; + default: + break; + } + + msrs_to_save[num_msrs_to_save++] = msr_index; +} + +static void kvm_init_msr_list(void) +{ unsigned i; BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, - "Please update the fixed PMCs in msrs_to_saved_all[]"); + "Please update the fixed PMCs in msrs_to_save_pmu[]"); num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; - for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { - if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) - continue; - - /* - * Even MSRs that are valid in the host may not be exposed - * to the guests in some cases. - */ - switch (msrs_to_save_all[i]) { - case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) - continue; - break; - case MSR_TSC_AUX: - if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && - !kvm_cpu_cap_has(X86_FEATURE_RDPID)) - continue; - break; - case MSR_IA32_UMWAIT_CONTROL: - if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) - continue; - break; - case MSR_IA32_RTIT_CTL: - case MSR_IA32_RTIT_STATUS: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) - continue; - break; - case MSR_IA32_RTIT_CR3_MATCH: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) - continue; - break; - case MSR_IA32_RTIT_OUTPUT_BASE: - case MSR_IA32_RTIT_OUTPUT_MASK: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && - !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) - continue; - break; - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= - intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) - continue; - break; - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - kvm_pmu_cap.num_counters_gp) - continue; - break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - kvm_pmu_cap.num_counters_gp) - continue; - break; - case MSR_IA32_XFD: - case MSR_IA32_XFD_ERR: - if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) - continue; - break; - default: - break; - } + for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++) + kvm_probe_msr_to_save(msrs_to_save_base[i]); - msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; - } + for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++) + kvm_probe_msr_to_save(msrs_to_save_pmu[i]); for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i])) -- cgit v1.2.3 From c3531edc79a7adbe4451d34084e8bf227c022872 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 24 Jan 2023 23:49:03 +0000 Subject: KVM: x86/pmu: Don't tell userspace to save PMU MSRs if PMU is disabled Omit all PMU MSRs from the "MSRs to save" list if the PMU is disabled so that userspace doesn't waste time saving and restoring dummy values. KVM provides "error" semantics (read zeros, drop writes) for such known-but- unsupported MSRs, i.e. has fudged around this issue for quite some time. Keep the "error" semantics as-is for now, the logic will be cleaned up in a separate patch. Cc: Aaron Lewis Cc: Weijiang Yang Link: https://lore.kernel.org/r/20230124234905.3774678-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9b6e1af63531..25da2cc09e55 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7081,8 +7081,10 @@ static void kvm_init_msr_list(void) for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++) kvm_probe_msr_to_save(msrs_to_save_base[i]); - for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++) - kvm_probe_msr_to_save(msrs_to_save_pmu[i]); + if (enable_pmu) { + for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++) + kvm_probe_msr_to_save(msrs_to_save_pmu[i]); + } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i])) -- cgit v1.2.3 From e33b6d79acac169bfe1a9682fc5b4f8202fa4c41 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 24 Jan 2023 23:49:04 +0000 Subject: KVM: x86/pmu: Don't tell userspace to save MSRs for non-existent fixed PMCs Limit the set of MSRs for fixed PMU counters based on the number of fixed counters actually supported by the host so that userspace doesn't waste time saving and restoring dummy values. Signed-off-by: Like Xu [sean: split for !enable_pmu logic, drop min(), write changelog] Link: https://lore.kernel.org/r/20230124234905.3774678-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cd0151e6af62..adb92fc4d7c9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -514,6 +514,7 @@ struct kvm_pmc { #define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define KVM_PMC_MAX_FIXED 3 +#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1) #define KVM_AMD_PMC_MAX_GENERIC 6 struct kvm_pmu { unsigned nr_arch_gp_counters; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 25da2cc09e55..3c49c86b973d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7055,6 +7055,11 @@ static void kvm_probe_msr_to_save(u32 msr_index) kvm_pmu_cap.num_counters_gp) return; break; + case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX: + if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >= + kvm_pmu_cap.num_counters_fixed) + return; + break; case MSR_IA32_XFD: case MSR_IA32_XFD_ERR: if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) -- cgit v1.2.3 From 2de154f541fc5b9f2aed3fe06e218130718ce320 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 24 Jan 2023 23:49:05 +0000 Subject: KVM: x86/pmu: Provide "error" semantics for unsupported-but-known PMU MSRs Provide "error" semantics (read zeros, drop writes) for userspace accesses to MSRs that are ultimately unsupported for whatever reason, but for which KVM told userspace to save and restore the MSR, i.e. for MSRs that KVM included in KVM_GET_MSR_INDEX_LIST. Previously, KVM special cased a few PMU MSRs that were problematic at one point or another. Extend the treatment to all PMU MSRs, e.g. to avoid spurious unsupported accesses. Note, the logic can also be used for non-PMU MSRs, but as of today only PMU MSRs can end up being unsupported after KVM told userspace to save and restore them. Link: https://lore.kernel.org/r/20230124234905.3774678-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 51 +++++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3c49c86b973d..64c567a1b32b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3561,6 +3561,18 @@ static void record_steal_time(struct kvm_vcpu *vcpu) mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } +static bool kvm_is_msr_to_save(u32 msr_index) +{ + unsigned int i; + + for (i = 0; i < num_msrs_to_save; i++) { + if (msrs_to_save[i] == msr_index) + return true; + } + + return false; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u32 msr = msr_info->index; @@ -3876,20 +3888,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif - case MSR_IA32_PEBS_ENABLE: - case MSR_IA32_DS_AREA: - case MSR_PEBS_DATA_CFG: - case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); + /* * Userspace is allowed to write '0' to MSRs that KVM reports * as to-be-saved, even if an MSRs isn't fully supported. */ - return !msr_info->host_initiated || data; - default: - if (kvm_pmu_is_valid_msr(vcpu, msr)) - return kvm_pmu_set_msr(vcpu, msr_info); + if (msr_info->host_initiated && !data && + kvm_is_msr_to_save(msr)) + break; + return KVM_MSR_RET_INVALID; } return 0; @@ -3979,20 +3989,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; - case MSR_IA32_PEBS_ENABLE: - case MSR_IA32_DS_AREA: - case MSR_PEBS_DATA_CFG: - case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info); - /* - * Userspace is allowed to read MSRs that KVM reports as - * to-be-saved, even if an MSR isn't fully supported. - */ - if (!msr_info->host_initiated) - return 1; - msr_info->data = 0; - break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: @@ -4248,6 +4244,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); + + /* + * Userspace is allowed to read MSRs that KVM reports as + * to-be-saved, even if an MSR isn't fully supported. + */ + if (msr_info->host_initiated && + kvm_is_msr_to_save(msr_info->index)) { + msr_info->data = 0; + break; + } + return KVM_MSR_RET_INVALID; } return 0; -- cgit v1.2.3 From 8957cbcfed0a7e423695d90600b2643dda31fc7b Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 29 Nov 2022 21:37:07 +0200 Subject: KVM: nSVM: Don't sync tlb_ctl back to vmcb12 on nested VM-Exit Don't sync the TLB control field from vmcb02 to vmcs12 on nested VM-Exit. Per AMD's APM, the field is not modified by hardware: The VMRUN instruction reads, but does not change, the value of the TLB_CONTROL field Signed-off-by: Maxim Levitsky Tested-by: Santosh Shukla Link: https://lore.kernel.org/r/20221129193717.513824-2-mlevitsk@redhat.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 700df66d23c7..05d38944a6c0 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1008,7 +1008,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.next_rip = vmcb02->control.next_rip; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; - vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl; vmcb12->control.event_inj = svm->nested.ctl.event_inj; vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; -- cgit v1.2.3 From c760e86f27fed4aeb46e206a6476b82d8e2d1762 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 30 Jan 2023 16:32:53 -0800 Subject: KVM: x86: Move HF_GIF_MASK into "struct vcpu_svm" as "guest_gif" Move HF_GIF_MASK out of the common "hflags" and into vcpu_svm.guest_gif. GIF is an SVM-only concept and has should never be consulted outside of SVM-specific code. No functional change is intended. Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Tested-by: Santosh Shukla Link: https://lore.kernel.org/r/20221129193717.513824-5-mlevitsk@redhat.com [sean: split to separate patch] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/svm/svm.h | 9 ++++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4d2bc08794e4..e743220b10c3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2074,7 +2074,6 @@ enum { TASK_SWITCH_GATE = 3, }; -#define HF_GIF_MASK (1 << 0) #define HF_NMI_MASK (1 << 3) #define HF_IRET_MASK (1 << 4) #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a7073802450f..45875c961df8 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -273,6 +273,9 @@ struct vcpu_svm { bool guest_state_loaded; bool x2avic_msrs_intercepted; + + /* Guest GIF value, used when vGIF is not enabled */ + bool guest_gif; }; struct svm_cpu_data { @@ -490,7 +493,7 @@ static inline void enable_gif(struct vcpu_svm *svm) if (vmcb) vmcb->control.int_ctl |= V_GIF_MASK; else - svm->vcpu.arch.hflags |= HF_GIF_MASK; + svm->guest_gif = true; } static inline void disable_gif(struct vcpu_svm *svm) @@ -500,7 +503,7 @@ static inline void disable_gif(struct vcpu_svm *svm) if (vmcb) vmcb->control.int_ctl &= ~V_GIF_MASK; else - svm->vcpu.arch.hflags &= ~HF_GIF_MASK; + svm->guest_gif = false; } static inline bool gif_set(struct vcpu_svm *svm) @@ -510,7 +513,7 @@ static inline bool gif_set(struct vcpu_svm *svm) if (vmcb) return !!(vmcb->control.int_ctl & V_GIF_MASK); else - return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); + return svm->guest_gif; } static inline bool nested_npt_enabled(struct vcpu_svm *svm) -- cgit v1.2.3 From 916b54a7688b0b9a1c48c708b848e4348c3ae2ab Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 30 Jan 2023 17:20:03 -0800 Subject: KVM: x86: Move HF_NMI_MASK and HF_IRET_MASK into "struct vcpu_svm" Move HF_NMI_MASK and HF_IRET_MASK (a.k.a. "waiting for IRET") out of the common "hflags" and into dedicated flags in "struct vcpu_svm". The flags are used only for the SVM and thus should not be in hflags. Tracking NMI masking in software isn't SVM specific, e.g. VMX has a similar flag (soft_vnmi_blocked), but that's much more of a hack as VMX can't intercept IRET, is useful only for ancient CPUs, i.e. will hopefully be removed at some point, and again the exact behavior is vendor specific and shouldn't ever be referenced in common code. converting VMX No functional change is intended. Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Tested-by: Santosh Shukla Link: https://lore.kernel.org/r/20221129193717.513824-5-mlevitsk@redhat.com [sean: split from HF_GIF_MASK patch] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/kvm/svm/svm.c | 22 +++++++++++++--------- arch/x86/kvm/svm/svm.h | 18 ++++++++++++++++++ 3 files changed, 31 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e743220b10c3..e653b0b7ff77 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2074,8 +2074,6 @@ enum { TASK_SWITCH_GATE = 3, }; -#define HF_NMI_MASK (1 << 3) -#define HF_IRET_MASK (1 << 4) #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ #ifdef CONFIG_KVM_SMM diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b103fe7cbc82..e1061d077b3d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1338,6 +1338,9 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.microcode_version = 0x01000065; svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; + svm->nmi_masked = false; + svm->awaiting_iret_completion = false; + if (sev_es_guest(vcpu->kvm)) sev_es_vcpu_reset(svm); } @@ -2482,7 +2485,7 @@ static int iret_interception(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); ++vcpu->stat.nmi_window_exits; - vcpu->arch.hflags |= HF_IRET_MASK; + svm->awaiting_iret_completion = true; if (!sev_es_guest(vcpu->kvm)) { svm_clr_intercept(svm, INTERCEPT_IRET); svm->nmi_iret_rip = kvm_rip_read(vcpu); @@ -3478,7 +3481,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) if (svm->nmi_l1_to_l2) return; - vcpu->arch.hflags |= HF_NMI_MASK; + svm->nmi_masked = true; if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; @@ -3591,7 +3594,7 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) return false; return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || - (vcpu->arch.hflags & HF_NMI_MASK); + svm->nmi_masked; } static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) @@ -3611,7 +3614,7 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) { - return !!(vcpu->arch.hflags & HF_NMI_MASK); + return to_svm(vcpu)->nmi_masked; } static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) @@ -3619,11 +3622,11 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) struct vcpu_svm *svm = to_svm(vcpu); if (masked) { - vcpu->arch.hflags |= HF_NMI_MASK; + svm->nmi_masked = true; if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); } else { - vcpu->arch.hflags &= ~HF_NMI_MASK; + svm->nmi_masked = false; if (!sev_es_guest(vcpu->kvm)) svm_clr_intercept(svm, INTERCEPT_IRET); } @@ -3709,7 +3712,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK) + if (svm->nmi_masked && !svm->awaiting_iret_completion) return; /* IRET will cause a vm exit */ if (!gif_set(svm)) { @@ -3833,10 +3836,11 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) * If we've made progress since setting HF_IRET_MASK, we've * executed an IRET and can allow NMI injection. */ - if ((vcpu->arch.hflags & HF_IRET_MASK) && + if (svm->awaiting_iret_completion && (sev_es_guest(vcpu->kvm) || kvm_rip_read(vcpu) != svm->nmi_iret_rip)) { - vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); + svm->awaiting_iret_completion = false; + svm->nmi_masked = false; kvm_make_request(KVM_REQ_EVENT, vcpu); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 45875c961df8..839809972da1 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -230,8 +230,26 @@ struct vcpu_svm { struct svm_nested_state nested; + /* NMI mask value, used when vNMI is not enabled */ + bool nmi_masked; + + /* + * True when NMIs are still masked but guest IRET was just intercepted + * and KVM is waiting for RIP to change, which will signal that the + * intercepted IRET was retired and thus NMI can be unmasked. + */ + bool awaiting_iret_completion; + + /* + * Set when KVM is awaiting IRET completion and needs to inject NMIs as + * soon as the IRET completes (e.g. NMI is pending injection). KVM + * temporarily steals RFLAGS.TF to single-step the guest in this case + * in order to regain control as soon as the NMI-blocking condition + * goes away. + */ bool nmi_singlestep; u64 nmi_singlestep_guest_rflags; + bool nmi_l1_to_l2; unsigned long soft_int_csbase; -- cgit v1.2.3 From 32e69f232db4ca11f26e5961daeff93906ce232f Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 29 Nov 2022 21:37:11 +0200 Subject: KVM: x86: Use emulator callbacks instead of duplicating "host flags" Instead of re-defining the "host flags" bits, just expose dedicated helpers for each of the two remaining flags that are consumed by the emulator. The emulator never consumes both "is guest" and "is SMM" in close proximity, so there is no motivation to avoid additional indirect branches. Also while at it, garbage collect the recently removed host flags. No functional change is intended. Signed-off-by: Maxim Levitsky Tested-by: Santosh Shukla Link: https://lore.kernel.org/r/20221129193717.513824-6-mlevitsk@redhat.com [sean: fix CONFIG_KVM_SMM=n builds, tweak names of wrappers] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 6 +++--- arch/x86/kvm/emulate.c | 11 +++++------ arch/x86/kvm/kvm_emulate.h | 7 ++----- arch/x86/kvm/smm.c | 2 -- arch/x86/kvm/x86.c | 14 +++++++++----- 5 files changed, 19 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e653b0b7ff77..3b0436b1f93c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2074,11 +2074,11 @@ enum { TASK_SWITCH_GATE = 3, }; -#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ +#define HF_GUEST_MASK (1 << 0) /* VCPU is in guest-mode */ #ifdef CONFIG_KVM_SMM -#define HF_SMM_MASK (1 << 6) -#define HF_SMM_INSIDE_NMI_MASK (1 << 7) +#define HF_SMM_MASK (1 << 1) +#define HF_SMM_INSIDE_NMI_MASK (1 << 2) # define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE # define KVM_ADDRESS_SPACE_NUM 2 diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c3443045cd93..5f7d9082c835 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2310,7 +2310,7 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) static int em_rsm(struct x86_emulate_ctxt *ctxt) { - if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) + if (!ctxt->ops->is_smm(ctxt)) return emulate_ud(ctxt); if (ctxt->ops->leave_smm(ctxt)) @@ -5133,7 +5133,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) const struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; - unsigned emul_flags; + bool is_guest_mode = ctxt->ops->is_guest_mode(ctxt); ctxt->mem_read.pos = 0; @@ -5148,7 +5148,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - emul_flags = ctxt->ops->get_hflags(ctxt); if (unlikely(ctxt->d & (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || @@ -5182,7 +5181,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(&ctxt->dst); } - if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { + if (unlikely(is_guest_mode) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5211,7 +5210,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5265,7 +5264,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 2d9662be8333..ab65f3a47dfd 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -220,7 +220,8 @@ struct x86_emulate_ops { void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); - unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); + bool (*is_smm)(struct x86_emulate_ctxt *ctxt); + bool (*is_guest_mode)(struct x86_emulate_ctxt *ctxt); int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); @@ -275,10 +276,6 @@ enum x86emul_mode { X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */ }; -/* These match some of the HF_* flags defined in kvm_host.h */ -#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ -#define X86EMUL_SMM_MASK (1 << 6) - /* * fastop functions are declared as taking a never-defined fastop parameter, * so they can't be called from C directly. diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index cc43638d48a3..b42111a24cc2 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -111,8 +111,6 @@ static void check_smram_offsets(void) void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) { - BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); - trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); if (entering_smm) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 508074e47bc0..cf9e3d213c83 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8150,9 +8150,14 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked); } -static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) +static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt) { - return emul_to_vcpu(ctxt)->arch.hflags; + return is_smm(emul_to_vcpu(ctxt)); +} + +static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt) +{ + return is_guest_mode(emul_to_vcpu(ctxt)); } #ifndef CONFIG_KVM_SMM @@ -8221,7 +8226,8 @@ static const struct x86_emulate_ops emulate_ops = { .guest_has_fxsr = emulator_guest_has_fxsr, .guest_has_rdpid = emulator_guest_has_rdpid, .set_nmi_mask = emulator_set_nmi_mask, - .get_hflags = emulator_get_hflags, + .is_smm = emulator_is_smm, + .is_guest_mode = emulator_is_guest_mode, .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, @@ -8293,8 +8299,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); - ctxt->interruptibility = 0; ctxt->have_exception = false; ctxt->exception.vector = -1; -- cgit v1.2.3 From 1a9df3262a632020071327abf56e9243e4dd7bde Mon Sep 17 00:00:00 2001 From: Vipin Sharma Date: Mon, 12 Dec 2022 10:37:15 -0800 Subject: KVM: x86: hyper-v: Use common code for hypercall userspace exit Remove duplicate code to exit to userspace for hyper-v hypercalls and use a common place to exit. No functional change intended. Signed-off-by: Vipin Sharma Suggested-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20221212183720.4062037-9-vipinsh@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/hyperv.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 71aff0edc0ed..c3d1c5212b0b 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2531,14 +2531,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - vcpu->run->exit_reason = KVM_EXIT_HYPERV; - vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; - vcpu->run->hyperv.u.hcall.input = hc.param; - vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; - vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; - vcpu->arch.complete_userspace_io = - kvm_hv_hypercall_complete_userspace; - return 0; + goto hypercall_userspace_exit; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; @@ -2597,14 +2590,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) ret = HV_STATUS_OPERATION_DENIED; break; } - vcpu->run->exit_reason = KVM_EXIT_HYPERV; - vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; - vcpu->run->hyperv.u.hcall.input = hc.param; - vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; - vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; - vcpu->arch.complete_userspace_io = - kvm_hv_hypercall_complete_userspace; - return 0; + goto hypercall_userspace_exit; } default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; @@ -2613,6 +2599,15 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) hypercall_complete: return kvm_hv_hypercall_complete(vcpu, ret); + +hypercall_userspace_exit: + vcpu->run->exit_reason = KVM_EXIT_HYPERV; + vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; + vcpu->run->hyperv.u.hcall.input = hc.param; + vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; + vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; + vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; + return 0; } void kvm_hv_init_vm(struct kvm *kvm) -- cgit v1.2.3 From db9cf24cea69773410f0049bdfa795d7c2bd0ea9 Mon Sep 17 00:00:00 2001 From: Vipin Sharma Date: Mon, 12 Dec 2022 10:37:16 -0800 Subject: KVM: x86: hyper-v: Add extended hypercall support in Hyper-v Add support for extended hypercall in Hyper-v. Hyper-v TLFS 6.0b describes hypercalls above call code 0x8000 as extended hypercalls. A Hyper-v hypervisor's guest VM finds availability of extended hypercalls via CPUID.0x40000003.EBX BIT(20). If the bit is set then the guest can call extended hypercalls. All extended hypercalls will exit to userspace by default. This allows for easy support of future hypercalls without being dependent on KVM releases. If there will be need to process the hypercall in KVM instead of userspace then KVM can create a capability which userspace can query to know which hypercalls can be handled by the KVM and enable handling of those hypercalls. Signed-off-by: Vipin Sharma Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20221212183720.4062037-10-vipinsh@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/hyperv.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index c3d1c5212b0b..f610b07ddcf4 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -44,6 +44,24 @@ #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK) +/* + * As per Hyper-V TLFS, extended hypercalls start from 0x8001 + * (HvExtCallQueryCapabilities). Response of this hypercalls is a 64 bit value + * where each bit tells which extended hypercall is available besides + * HvExtCallQueryCapabilities. + * + * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit + * assigned. + * + * 0x8002 - Bit 0 + * 0x8003 - Bit 1 + * .. + * 0x8041 - Bit 63 + * + * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64 + */ +#define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64) + static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick); @@ -2439,6 +2457,9 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) case HVCALL_SEND_IPI: return hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_CLUSTER_IPI_RECOMMENDED; + case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: + return hv_vcpu->cpuid_cache.features_ebx & + HV_ENABLE_EXTENDED_HYPERCALLS; default: break; } @@ -2592,6 +2613,12 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } goto hypercall_userspace_exit; } + case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: + if (unlikely(hc.fast)) { + ret = HV_STATUS_INVALID_PARAMETER; + break; + } + goto hypercall_userspace_exit; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; @@ -2751,6 +2778,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->ebx |= HV_POST_MESSAGES; ent->ebx |= HV_SIGNAL_EVENTS; + ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS; ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; -- cgit v1.2.3 From 052c3b99cbc8d227f8cb8edf1519197808d1d653 Mon Sep 17 00:00:00 2001 From: Emanuele Giuseppe Esposito Date: Tue, 10 Jan 2023 10:40:33 -0800 Subject: KVM: x86: Reinitialize xAPIC ID when userspace forces x2APIC => xAPIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reinitialize the xAPIC ID to the vCPU ID when userspace forces the APIC to transition directly from x2APIC to xAPIC mode, e.g. to emulate RESET. KVM already stuffs the xAPIC ID when the APIC is transitioned from DISABLED to xAPIC (commit 49bd29ba1dbd ("KVM: x86: reset APIC ID when enabling LAPIC")), i.e. userspace is conditioned to expect KVM to update the xAPIC ID, but KVM doesn't handle the architecturally-impossible case where userspace forces x2APIC=>xAPIC via KVM_SET_MSRS. On its own, the "bug" is benign, as userspace emulation of RESET will also stuff APIC registers via KVM_SET_LAPIC, i.e. will manually set the xAPIC ID. However, commit 3743c2f02517 ("KVM: x86: inhibit APICv/AVIC on changes to APIC ID or APIC base") introduced a bug, fixed by commit commit ef40757743b4 ("KVM: x86: fix APICv/x2AVIC disabled when vm reboot by itself"), that caused KVM to fail to properly update the xAPIC ID when handling KVM_SET_LAPIC. Refresh the xAPIC ID even though it's not strictly necessary so that KVM provides consistent behavior. Note, KVM follows Intel architecture with regard to handling the xAPIC ID and x2APIC IDs across mode transitions. For the APIC DISABLED case (commit 49bd29ba1dbd), Intel's SDM says the xAPIC ID _may_ be reinitialized 10.4.3 Enabling or Disabling the Local APIC When IA32_APIC_BASE[11] is set to 0, prior initialization to the APIC may be lost and the APIC may return to the state described in Section 10.4.7.1, “Local APIC State After Power-Up or Reset.” 10.4.7.1 Local APIC State After Power-Up or Reset ... The local APIC ID register is set to a unique APIC ID. ... i.e. KVM's behavior is legal as per Intel's architecture. In practice, Intel's behavior is N/A as modern Intel CPUs (since at least Haswell) make the xAPIC ID fully read-only. And for xAPIC => x2APIC transitions (commit 257b9a5faab5 ("KVM: x86: use correct APIC ID on x2APIC transition")), Intel's SDM says: Any APIC ID value written to the memory-mapped local APIC ID register is not preserved. AMD's APM says nothing (that I could find) about the xAPIC ID when the APIC is DISABLED, but testing on bare metal (Rome) shows that the xAPIC ID is preserved when the APIC is DISABLED and re-enabled in xAPIC mode. AMD also preserves the xAPIC ID when the APIC is transitioned from xAPIC to x2APIC, i.e. allows a backdoor write of the x2APIC ID, which is again not emulated by KVM. Signed-off-by: Emanuele Giuseppe Esposito Link: https://lore.kernel.org/all/20230109130605.2013555-2-eesposit@redhat.com [sean: rewrite changelog, set xAPIC ID iff APIC is enabled] Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c28ba0275580..e542cf285b51 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2510,8 +2510,12 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) } } - if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) - kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); + if ((old_value ^ value) & X2APIC_ENABLE) { + if (value & X2APIC_ENABLE) + kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); + else if (value & MSR_IA32_APICBASE_ENABLE) + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); + } if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) { kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); -- cgit v1.2.3 From 974850be012583fb8e7f1bd5ecf55763efb2f94a Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 9 Nov 2022 16:28:01 +0800 Subject: KVM: x86/pmu: Add PRIR++ and PDist support for SPR and later models The pebs capability on the SPR is basically the same as Ice Lake Server with the exception of two special facilities that have been enhanced and require special handling. Upon triggering a PEBS assist, there will be a finite delay between the time the counter overflows and when the microcode starts to carry out its data collection obligations. Even if the delay is constant in core clock space, it invariably manifest as variable "skids" in instruction address space. On the Ice Lake Server, the Precise Distribution of Instructions Retire (PDIR) facility mitigates the "skid" problem by providing an early indication of when the counter is about to overflow. On SPR, the PDIR counter available (Fixed 0) is unchanged, but the capability is enhanced to Instruction-Accurate PDIR (PDIR++), where PEBS is taken on the next instruction after the one that caused the overflow. SPR also introduces a new Precise Distribution (PDist) facility only on general programmable counter 0. Per Intel SDM, PDist eliminates any skid or shadowing effects from PEBS. With PDist, the PEBS record will be generated precisely upon completion of the instruction or operation that causes the counter to overflow (there is no "wait for next occurrence" by default). In terms of KVM handling, when guest accesses those special counters, the KVM needs to request the same index counters via the perf_event kernel subsystem to ensure that the guest uses the correct pebs hardware counter (PRIR++ or PDist). This is mainly achieved by adjusting the event precise level to the maximum, where the semantics of this magic number is mainly defined by the internal software context of perf_event and it's also backwards compatible as part of the user space interface. Opportunistically, refine confusing comments on TNT+, as the only ones that currently support pebs_ept are Ice Lake server and SPR (GLC+). Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20221109082802.27543-3-likexu@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 45 +++++++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3264f8e0e8ef..7b6c3ba2c8e1 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -29,9 +29,18 @@ struct x86_pmu_capability __read_mostly kvm_pmu_cap; EXPORT_SYMBOL_GPL(kvm_pmu_cap); -static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { +/* Precise Distribution of Instructions Retired (PDIR) */ +static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), + /* Instruction-Accurate PDIR (PDIR++) */ + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), + {} +}; + +/* Precise Distribution (PDist) */ +static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), {} }; @@ -156,6 +165,28 @@ static void kvm_perf_overflow(struct perf_event *perf_event, kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } +static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc) +{ + /* + * For some model specific pebs counters with special capabilities + * (PDIR, PDIR++, PDIST), KVM needs to raise the event precise + * level to the maximum value (currently 3, backwards compatible) + * so that the perf subsystem would assign specific hardware counter + * with that capability for vPMC. + */ + if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) || + (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu))) + return 3; + + /* + * The non-zero precision level of guest event makes the ordinary + * guest event becomes a guest PEBS event and triggers the host + * PEBS PMI handler to determine whether the PEBS overflow PMI + * comes from the host counters or the guest. + */ + return 1; +} + static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, bool exclude_kernel, bool intr) @@ -187,22 +218,12 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, } if (pebs) { /* - * The non-zero precision level of guest event makes the ordinary - * guest event becomes a guest PEBS event and triggers the host - * PEBS PMI handler to determine whether the PEBS overflow PMI - * comes from the host counters or the guest. - * * For most PEBS hardware events, the difference in the software * precision levels of guest and host PEBS events will not affect * the accuracy of the PEBS profiling result, because the "event IP" * in the PEBS record is calibrated on the guest side. - * - * On Icelake everything is fine. Other hardware (GLC+, TNT+) that - * could possibly care here is unsupported and needs changes. */ - attr.precise_ip = 1; - if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32) - attr.precise_ip = 3; + attr.precise_ip = pmc_get_pebs_precise_level(pmc); } event = perf_event_create_kernel_counter(&attr, -1, current, -- cgit v1.2.3 From 13738a3647368f7f600b30d241779bcd2a3ebbfd Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 9 Nov 2022 16:28:02 +0800 Subject: perf/x86/intel: Expose EPT-friendly PEBS for SPR and future models According to Intel SDM, the EPT-friendly PEBS is supported by all the platforms after ICX, ADL and the future platforms with PEBS format 5. Currently the only in-kernel user of this capability is KVM, which has very limited support for hybrid core pmu, so ADL and its successors do not currently expose this capability. When both hybrid core and PEBS format 5 are present, KVM will decide on its own merits. Cc: Peter Zijlstra Cc: linux-perf-users@vger.kernel.org Suggested-by: Kan Liang Signed-off-by: Like Xu Reviewed-by: Kan Liang Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221109082802.27543-4-likexu@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/events/intel/core.c | 1 + arch/x86/events/intel/ds.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index dfd2c124cdf8..aa53d042b943 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6348,6 +6348,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; x86_pmu.extra_regs = intel_spr_extra_regs; x86_pmu.limit_period = spr_limit_period; + x86_pmu.pebs_ept = 1; x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 88e58b6ee73c..d8a404b91b7e 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2303,8 +2303,10 @@ void __init intel_ds_init(void) x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; break; - case 4: case 5: + x86_pmu.pebs_ept = 1; + fallthrough; + case 4: x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl; x86_pmu.pebs_record_size = sizeof(struct pebs_basic); if (x86_pmu.intel_cap.pebs_baseline) { -- cgit v1.2.3 From 0735d1c34e49bc79d4a9860651d10c00b0692276 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 26 Jan 2023 02:34:03 +0100 Subject: KVM: x86/emulator: Fix segment load privilege level validation Intel SDM describes what steps are taken by the CPU to verify if a memory segment can actually be used at a given privilege level. Loading DS/ES/FS/GS involves checking segment's type as well as making sure that neither selector's RPL nor caller's CPL are greater than segment's DPL. Emulator implements Intel's pseudocode in __load_segment_descriptor(), even quoting the pseudocode in the comments. Although the pseudocode is correctly translated, the implementation is incorrect. This is most likely due to SDM, at the time, being wrong. Patch fixes emulator's logic and updates the pseudocode in the comment. Below are historical notes. Emulator code for handling segment descriptors appears to have been introduced in March 2010 in commit 38ba30ba51a0 ("KVM: x86 emulator: Emulate task switch in emulator.c"). Intel SDM Vol 2A: Instruction Set Reference, A-M (Order Number: 253666-034US, _March 2010_) lists the steps for loading segment registers in section related to MOV instruction: IF DS, ES, FS, or GS is loaded with non-NULL selector THEN IF segment selector index is outside descriptor table limits or segment is not a data or readable code segment or ((segment is a data or nonconforming code segment) and (both RPL and CPL > DPL)) <--- THEN #GP(selector); FI; This is precisely what __load_segment_descriptor() quotes and implements. But there's a twist; a few SDM revisions later (253667-044US), in August 2012, the snippet above becomes: IF DS, ES, FS, or GS is loaded with non-NULL selector THEN IF segment selector index is outside descriptor table limits or segment is not a data or readable code segment or ((segment is a data or nonconforming code segment) [note: missing or superfluous parenthesis?] or ((RPL > DPL) and (CPL > DPL)) <--- THEN #GP(selector); FI; Many SDMs later (253667-065US), in December 2017, pseudocode reaches what seems to be its final form: IF DS, ES, FS, or GS is loaded with non-NULL selector THEN IF segment selector index is outside descriptor table limits OR segment is not a data or readable code segment OR ((segment is a data or nonconforming code segment) AND ((RPL > DPL) or (CPL > DPL))) <--- THEN #GP(selector); FI; which also matches the behavior described in AMD's APM, which states that a #GP occurs if: The DS, ES, FS, or GS register was loaded and the segment pointed to was a data or non-conforming code segment, but the RPL or CPL was greater than the DPL. Signed-off-by: Michal Luczaj Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20230126013405.2967156-2-mhal@rbox.co [sean: add blurb to changelog calling out AMD agrees] Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c3443045cd93..43df65ef5c29 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1696,11 +1696,11 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, /* * segment is not a data or readable code segment or * ((segment is a data or nonconforming code segment) - * and (both RPL and CPL > DPL)) + * and ((RPL > DPL) or (CPL > DPL))) */ if ((seg_desc.type & 0xa) == 0x8 || (((seg_desc.type & 0xc) != 0xc) && - (rpl > dpl && cpl > dpl))) + (rpl > dpl || cpl > dpl))) goto exception; break; } -- cgit v1.2.3 From 096691e0d2a1a97ce5a0d68cffdc84ce97bce304 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 26 Jan 2023 02:34:04 +0100 Subject: KVM: x86/emulator: Fix comment in __load_segment_descriptor() The comment refers to the same condition twice. Make it reflect what the code actually does. No functional change intended. Signed-off-by: Michal Luczaj Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20230126013405.2967156-3-mhal@rbox.co Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 43df65ef5c29..a630c5db971c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1634,7 +1634,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, case VCPU_SREG_SS: /* * segment is not a writable data segment or segment - * selector's RPL != CPL or segment selector's RPL != CPL + * selector's RPL != CPL or DPL != CPL */ if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl) goto exception; -- cgit v1.2.3 From 95744a90db18437410aa94620b8a330311bd9cf6 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Sat, 7 Jan 2023 01:12:51 +0100 Subject: KVM: x86: Optimize kvm->lock and SRCU interaction (KVM_SET_PMU_EVENT_FILTER) Reduce time spent holding kvm->lock: unlock mutex before calling synchronize_srcu_expedited(). There is no need to hold kvm->lock until all vCPUs have been kicked, KVM only needs to guarantee that all vCPUs will switch to the new filter before exiting to userspace. Protecting the write to __reprogram_pmi is also unnecessary as a vCPU may process a set bit before receiving the final KVM_REQ_PMU, but the per-vCPU writes are guaranteed to occur after all vCPUs have switched to the new filter. Suggested-by: Paolo Bonzini Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20230107001256.2365304-2-mhal@rbox.co [sean: expand changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index d939d3b84e6f..58e5a456273a 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -634,6 +634,7 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) mutex_lock(&kvm->lock); filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, mutex_is_locked(&kvm->lock)); + mutex_unlock(&kvm->lock); synchronize_srcu_expedited(&kvm->srcu); BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) > @@ -644,8 +645,6 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) kvm_make_all_cpus_request(kvm, KVM_REQ_PMU); - mutex_unlock(&kvm->lock); - r = 0; cleanup: kfree(filter); -- cgit v1.2.3 From 708f799d22fe6b69d2e6e3e0625c30666d5e798a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Sat, 7 Jan 2023 01:12:52 +0100 Subject: KVM: x86: Optimize kvm->lock and SRCU interaction (KVM_X86_SET_MSR_FILTER) Reduce time spent holding kvm->lock: unlock mutex before calling synchronize_srcu(). There is no need to hold kvm->lock until all vCPUs have been kicked, KVM only needs to guarantee that all vCPUs will switch to the new filter before exiting to userspace. Suggested-by: Paolo Bonzini Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20230107001256.2365304-3-mhal@rbox.co [sean: expand changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d7d5bca00294..4f8e78d49585 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6497,12 +6497,12 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1); rcu_assign_pointer(kvm->arch.msr_filter, new_filter); + mutex_unlock(&kvm->lock); synchronize_srcu(&kvm->srcu); kvm_free_msr_filter(old_filter); kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); - mutex_unlock(&kvm->lock); return 0; } -- cgit v1.2.3 From 4d85cfcaa82f0ceb5dcb7ad369a2cc7efb32c65c Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Sat, 7 Jan 2023 01:12:53 +0100 Subject: KVM: x86: Simplify msr_filter update Replace srcu_dereference()+rcu_assign_pointer() sequence with a single rcu_replace_pointer(). Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20230107001256.2365304-4-mhal@rbox.co Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4f8e78d49585..9c66cb3657b4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6492,11 +6492,8 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, } mutex_lock(&kvm->lock); - /* The per-VM filter is protected by kvm->lock... */ - old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1); - - rcu_assign_pointer(kvm->arch.msr_filter, new_filter); + old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter, 1); mutex_unlock(&kvm->lock); synchronize_srcu(&kvm->srcu); -- cgit v1.2.3 From 1fdefb8bd862d7c17fc2526ec9fdfb080c15da45 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Sat, 7 Jan 2023 01:12:54 +0100 Subject: KVM: x86: Explicitly state lockdep condition of msr_filter update Replace `1` with the actual mutex_is_locked() check. Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20230107001256.2365304-5-mhal@rbox.co [sean: delete the comment that explained the hardocded '1'] Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9c66cb3657b4..00e4bf16bbfd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6492,8 +6492,8 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, } mutex_lock(&kvm->lock); - /* The per-VM filter is protected by kvm->lock... */ - old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter, 1); + old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter, + mutex_is_locked(&kvm->lock)); mutex_unlock(&kvm->lock); synchronize_srcu(&kvm->srcu); -- cgit v1.2.3 From 4559e6cf45b555e7f2d73dd6a09a23afcbac9ec1 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Sat, 7 Jan 2023 01:12:55 +0100 Subject: KVM: x86: Remove unnecessary initialization in kvm_vm_ioctl_set_msr_filter() Do not initialize the value of `r`, as it will be overwritten. Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20230107001256.2365304-6-mhal@rbox.co Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00e4bf16bbfd..6a28f1347ff5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6466,7 +6466,7 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, struct kvm_x86_msr_filter *new_filter, *old_filter; bool default_allow; bool empty = true; - int r = 0; + int r; u32 i; if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK) -- cgit v1.2.3 From e73ba25fdc241c06ab48a1f708a30305d6036e66 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Sat, 7 Jan 2023 01:12:56 +0100 Subject: KVM: x86: Simplify msr_io() As of commit bccf2150fe62 ("KVM: Per-vcpu inodes"), __msr_io() doesn't return a negative value. Remove unnecessary checks. Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20230107001256.2365304-7-mhal@rbox.co [sean: call out commit which left behind the unnecessary check] Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6a28f1347ff5..810cbe3b3ba6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4291,8 +4291,8 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, { struct kvm_msrs msrs; struct kvm_msr_entry *entries; - int r, n; unsigned size; + int r; r = -EFAULT; if (copy_from_user(&msrs, user_msrs, sizeof(msrs))) @@ -4309,17 +4309,11 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, goto out; } - r = n = __msr_io(vcpu, &msrs, entries, do_msr); - if (r < 0) - goto out_free; + r = __msr_io(vcpu, &msrs, entries, do_msr); - r = -EFAULT; if (writeback && copy_to_user(user_msrs->entries, entries, size)) - goto out_free; - - r = n; + r = -EFAULT; -out_free: kfree(entries); out: return r; -- cgit v1.2.3 From 41acdd41973548aec573381e1166b5a388708d5b Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Wed, 9 Nov 2022 15:54:12 +0800 Subject: KVM: VMX: Do not trap VMFUNC instructions for L1 guests. Explicitly disable VMFUNC in vmcs01 to document that KVM doesn't support any VM-Functions for L1. WARN in the dedicated VMFUNC handler if an exit occurs while L1 is active, but keep the existing handlers as fallbacks to avoid killing the VM as an unexpected VMFUNC VM-Exit isn't fatal Signed-off-by: Yu Zhang Link: https://lore.kernel.org/r/20221109075413.1405803-2-yu.c.zhang@linux.intel.com [sean: don't kill the VM on an unexpected VMFUNC from L1, reword changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 7 +++---- arch/x86/kvm/vmx/vmx.c | 6 ++++++ 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 557b9c468734..3c226de4b562 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5864,11 +5864,10 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) u32 function = kvm_rax_read(vcpu); /* - * VMFUNC is only supported for nested guests, but we always enable the - * secondary control for simplicity; for non-nested mode, fake that we - * didn't by injecting #UD. + * VMFUNC should never execute cleanly while L1 is active; KVM supports + * VMFUNC for nested VMs, but not for L1. */ - if (!is_guest_mode(vcpu)) { + if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 664994e3e909..8a9911ae1240 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4590,6 +4590,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + /* + * KVM doesn't support VMFUNC for L1, but the control is set in KVM's + * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2. + */ + exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; + /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, * in vmx_set_cr4. */ exec_control &= ~SECONDARY_EXEC_DESC; -- cgit v1.2.3 From 496c917b0989a7a20f9804de14ab2b3cbd6747a1 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Wed, 9 Nov 2022 15:54:13 +0800 Subject: KVM: nVMX: Simplify the setting of SECONDARY_EXEC_ENABLE_VMFUNC for nested. Values of base settings for nested proc-based VM-Execution control MSR come from the ones for non-nested. And for SECONDARY_EXEC_ENABLE_VMFUNC flag, KVM currently a) first mask off it from vmcs_conf->cpu_based_2nd_exec_ctrl; b) then check it against the same source; c) and reset it again if host has it. So just simplify this, by not masking off SECONDARY_EXEC_ENABLE_VMFUNC in the first place. No functional change. Signed-off-by: Yu Zhang Link: https://lore.kernel.org/r/20221109075413.1405803-3-yu.c.zhang@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 3c226de4b562..7c4f5ca405c7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6880,6 +6880,7 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_RDSEED_EXITING | SECONDARY_EXEC_XSAVES | SECONDARY_EXEC_TSC_SCALING | @@ -6912,18 +6913,13 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) SECONDARY_EXEC_ENABLE_PML; msrs->ept_caps |= VMX_EPT_AD_BIT; } - } - if (cpu_has_vmx_vmfunc()) { - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_VMFUNC; /* - * Advertise EPTP switching unconditionally - * since we emulate it + * Advertise EPTP switching irrespective of hardware support, + * KVM emulates it in software so long as VMFUNC is supported. */ - if (enable_ept) - msrs->vmfunc_controls = - VMX_VMFUNC_EPTP_SWITCHING; + if (cpu_has_vmx_vmfunc()) + msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; } /* -- cgit v1.2.3 From 93827a0a36396f2fd6368a54a020f420c8916e9b Mon Sep 17 00:00:00 2001 From: Alexandru Matei Date: Tue, 24 Jan 2023 00:12:08 +0200 Subject: KVM: VMX: Fix crash due to uninitialized current_vmcs KVM enables 'Enlightened VMCS' and 'Enlightened MSR Bitmap' when running as a nested hypervisor on top of Hyper-V. When MSR bitmap is updated, evmcs_touch_msr_bitmap function uses current_vmcs per-cpu variable to mark that the msr bitmap was changed. vmx_vcpu_create() modifies the msr bitmap via vmx_disable_intercept_for_msr -> vmx_msr_bitmap_l01_changed which in the end calls this function. The function checks for current_vmcs if it is null but the check is insufficient because current_vmcs is not initialized. Because of this, the code might incorrectly write to the structure pointed by current_vmcs value left by another task. Preemption is not disabled, the current task can be preempted and moved to another CPU while current_vmcs is accessed multiple times from evmcs_touch_msr_bitmap() which leads to crash. The manipulation of MSR bitmaps by callers happens only for vmcs01 so the solution is to use vmx->vmcs01.vmcs instead of current_vmcs. BUG: kernel NULL pointer dereference, address: 0000000000000338 PGD 4e1775067 P4D 0 Oops: 0002 [#1] PREEMPT SMP NOPTI ... RIP: 0010:vmx_msr_bitmap_l01_changed+0x39/0x50 [kvm_intel] ... Call Trace: vmx_disable_intercept_for_msr+0x36/0x260 [kvm_intel] vmx_vcpu_create+0xe6/0x540 [kvm_intel] kvm_arch_vcpu_create+0x1d1/0x2e0 [kvm] kvm_vm_ioctl_create_vcpu+0x178/0x430 [kvm] kvm_vm_ioctl+0x53f/0x790 [kvm] __x64_sys_ioctl+0x8a/0xc0 do_syscall_64+0x5c/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: ceef7d10dfb6 ("KVM: x86: VMX: hyper-v: Enlightened MSR-Bitmap support") Cc: stable@vger.kernel.org Suggested-by: Sean Christopherson Signed-off-by: Alexandru Matei Link: https://lore.kernel.org/r/20230123221208.4964-1-alexandru.matei@uipath.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/hyperv.h | 11 ----------- arch/x86/kvm/vmx/vmx.c | 9 +++++++-- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index caf658726169..78d17667e7ec 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -250,16 +250,6 @@ static __always_inline u16 evmcs_read16(unsigned long field) return *(u16 *)((char *)current_evmcs + offset); } -static inline void evmcs_touch_msr_bitmap(void) -{ - if (unlikely(!current_evmcs)) - return; - - if (current_evmcs->hv_enlightenments_control.msr_bitmap) - current_evmcs->hv_clean_fields &= - ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; -} - static inline void evmcs_load(u64 phys_addr) { struct hv_vp_assist_page *vp_ap = @@ -280,7 +270,6 @@ static __always_inline u64 evmcs_read64(unsigned long field) { return 0; } static __always_inline u32 evmcs_read32(unsigned long field) { return 0; } static __always_inline u16 evmcs_read16(unsigned long field) { return 0; } static inline void evmcs_load(u64 phys_addr) {} -static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ #define EVMPTR_INVALID (-1ULL) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8a9911ae1240..33614ee2cd67 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3936,8 +3936,13 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR * bitmap has changed. */ - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); + if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) { + struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; + + if (evmcs->hv_enlightenments_control.msr_bitmap) + evmcs->hv_clean_fields &= + ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + } vmx->nested.force_msr_bitmap_recalc = true; } -- cgit v1.2.3 From f94f053aa3a5d6ff17951870483d9eb9e13de2e2 Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Tue, 7 Feb 2023 09:13:54 -0800 Subject: KVM: SVM: Fix potential overflow in SEV's send|receive_update_data() KVM_SEV_SEND_UPDATE_DATA and KVM_SEV_RECEIVE_UPDATE_DATA have an integer overflow issue. Params.guest_len and offset are both 32 bits wide, with a large params.guest_len the check to confirm a page boundary is not crossed can falsely pass: /* Check if we are crossing the page boundary * offset = params.guest_uaddr & (PAGE_SIZE - 1); if ((params.guest_len + offset > PAGE_SIZE)) Add an additional check to confirm that params.guest_len itself is not greater than PAGE_SIZE. Note, this isn't a security concern as overflow can happen if and only if params.guest_len is greater than 0xfffff000, and the FW spec says these commands fail with lengths greater than 16KB, i.e. the PSP will detect KVM's goof. Fixes: 15fb7de1a7f5 ("KVM: SVM: Add KVM_SEV_RECEIVE_UPDATE_DATA command") Fixes: d3d1af85e2c7 ("KVM: SVM: Add KVM_SEND_UPDATE_DATA command") Reported-by: Andy Nguyen Suggested-by: Thomas Lendacky Signed-off-by: Peter Gonda Cc: David Rientjes Cc: Paolo Bonzini Cc: Sean Christopherson Cc: kvm@vger.kernel.org Cc: stable@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20230207171354.4012821-1-pgonda@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a5e4c5ef7c9e..c25aeb550cd9 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1294,7 +1294,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Check if we are crossing the page boundary */ offset = params.guest_uaddr & (PAGE_SIZE - 1); - if ((params.guest_len + offset > PAGE_SIZE)) + if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) return -EINVAL; /* Pin guest memory */ @@ -1474,7 +1474,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Check if we are crossing the page boundary */ offset = params.guest_uaddr & (PAGE_SIZE - 1); - if ((params.guest_len + offset > PAGE_SIZE)) + if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) return -EINVAL; hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); -- cgit v1.2.3 From 7f604e92fb805d2569aa5fb177a1c7231ea2f0cc Mon Sep 17 00:00:00 2001 From: David Matlack Date: Mon, 13 Feb 2023 13:28:44 -0800 Subject: KVM: x86/mmu: Make tdp_mmu_allowed static Make tdp_mmu_allowed static since it is only ever used within arch/x86/kvm/mmu/mmu.c. Link: https://lore.kernel.org/kvm/202302072055.odjDVd5V-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: David Matlack Message-Id: <20230213212844.3062733-1-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c91ee2927dd7..c8ebe542c565 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -101,7 +101,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); */ bool tdp_enabled = false; -bool __ro_after_init tdp_mmu_allowed; +static bool __ro_after_init tdp_mmu_allowed; #ifdef CONFIG_X86_64 bool __read_mostly tdp_mmu_enabled = true; -- cgit v1.2.3 From 45dd9bc75d9adc9483f0c7d662ba6e73ed698a0b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 21 Feb 2023 23:33:15 -0800 Subject: KVM: SVM: hyper-v: placate modpost section mismatch error modpost reports section mismatch errors/warnings: WARNING: modpost: vmlinux.o: section mismatch in reference: svm_hv_hardware_setup (section: .text) -> (unknown) (section: .init.data) WARNING: modpost: vmlinux.o: section mismatch in reference: svm_hv_hardware_setup (section: .text) -> (unknown) (section: .init.data) WARNING: modpost: vmlinux.o: section mismatch in reference: svm_hv_hardware_setup (section: .text) -> (unknown) (section: .init.data) This "(unknown) (section: .init.data)" all refer to svm_x86_ops. Tag svm_hv_hardware_setup() with __init to fix a modpost warning as the non-stub implementation accesses __initdata (svm_x86_ops), i.e. would generate a use-after-free if svm_hv_hardware_setup() were actually invoked post-init. The helper is only called from svm_hardware_setup(), which is also __init, i.e. lack of __init is benign other than the modpost warning. Fixes: 1e0c7d40758b ("KVM: SVM: hyper-v: Remote TLB flush for SVM") Signed-off-by: Randy Dunlap Cc: Vineeth Pillai Cc: Paolo Bonzini Cc: kvm@vger.kernel.org Cc: stable@vger.kernel.org Reviewed-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Message-Id: <20230222073315.9081-1-rdunlap@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm_onhyperv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 6981c1e9a809..cff838f15db5 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -30,7 +30,7 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb) hve->hv_enlightenments_control.msr_bitmap = 1; } -static inline void svm_hv_hardware_setup(void) +static inline __init void svm_hv_hardware_setup(void) { if (npt_enabled && ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { @@ -84,7 +84,7 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb) { } -static inline void svm_hv_hardware_setup(void) +static inline __init void svm_hv_hardware_setup(void) { } -- cgit v1.2.3