Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/Kconfig              |  11
-rw-r--r--  arch/x86/kvm/Makefile             |   6
-rw-r--r--  arch/x86/kvm/cpuid.c              |  39
-rw-r--r--  arch/x86/kvm/emulate.c            | 355
-rw-r--r--  arch/x86/kvm/hyperv.c             | 353
-rw-r--r--  arch/x86/kvm/hyperv.h             |  64
-rw-r--r--  arch/x86/kvm/irq.c                |   7
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h     |   5
-rw-r--r--  arch/x86/kvm/kvm_emulate.h        |  48
-rw-r--r--  arch/x86/kvm/lapic.c              |  16
-rw-r--r--  arch/x86/kvm/lapic.h              |   2
-rw-r--r--  arch/x86/kvm/mmu/mmu.c            | 189
-rw-r--r--  arch/x86/kvm/mmu/mmu_internal.h   |  33
-rw-r--r--  arch/x86/kvm/mmu/paging_tmpl.h    |   6
-rw-r--r--  arch/x86/kvm/mmu/spte.c           |  12
-rw-r--r--  arch/x86/kvm/mmu/spte.h           |  19
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c        | 114
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.h        |   2
-rw-r--r--  arch/x86/kvm/pmu.c                |  90
-rw-r--r--  arch/x86/kvm/pmu.h                |   6
-rw-r--r--  arch/x86/kvm/reverse_cpuid.h      |  28
-rw-r--r--  arch/x86/kvm/smm.c                | 649
-rw-r--r--  arch/x86/kvm/smm.h                | 168
-rw-r--r--  arch/x86/kvm/svm/hyperv.c         |  18
-rw-r--r--  arch/x86/kvm/svm/hyperv.h         |  50
-rw-r--r--  arch/x86/kvm/svm/nested.c         |  64
-rw-r--r--  arch/x86/kvm/svm/pmu.c            |   4
-rw-r--r--  arch/x86/kvm/svm/sev.c            |   6
-rw-r--r--  arch/x86/kvm/svm/svm.c            |  79
-rw-r--r--  arch/x86/kvm/svm/svm.h            |   5
-rw-r--r--  arch/x86/kvm/svm/svm_onhyperv.c   |   8
-rw-r--r--  arch/x86/kvm/svm/svm_onhyperv.h   |  25
-rw-r--r--  arch/x86/kvm/svm/vmenter.S        |   1
-rw-r--r--  arch/x86/kvm/trace.h              |  36
-rw-r--r--  arch/x86/kvm/vmx/capabilities.h   |  24
-rw-r--r--  arch/x86/kvm/vmx/hyperv.c (renamed from arch/x86/kvm/vmx/evmcs.c) |  45
-rw-r--r--  arch/x86/kvm/vmx/hyperv.h (renamed from arch/x86/kvm/vmx/evmcs.h) |  12
-rw-r--r--  arch/x86/kvm/vmx/nested.c         | 112
-rw-r--r--  arch/x86/kvm/vmx/nested.h         |   7
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c      |  11
-rw-r--r--  arch/x86/kvm/vmx/sgx.c            |   4
-rw-r--r--  arch/x86/kvm/vmx/vmcs12.h         |   5
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S        |   2
-rw-r--r--  arch/x86/kvm/vmx/vmx.c            | 104
-rw-r--r--  arch/x86/kvm/vmx/vmx_ops.h        |  20
-rw-r--r--  arch/x86/kvm/x86.c                | 560
-rw-r--r--  arch/x86/kvm/x86.h                |   1
-rw-r--r--  arch/x86/kvm/xen.c                | 537
-rw-r--r--  arch/x86/kvm/xen.h                |  13
49 files changed, 2567 insertions(+), 1408 deletions(-)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 67be7f217e37..fbeaa9ddef59 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -118,6 +118,17 @@ config KVM_AMD_SEV
Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
with Encrypted State (SEV-ES) on AMD processors.
+config KVM_SMM
+ bool "System Management Mode emulation"
+ default y
+ depends on KVM
+ help
+ Provides support for KVM to emulate System Management Mode (SMM)
+ in virtual machines. This can be used by the virtual machine
+ firmware to implement UEFI secure boot.
+
+ If unsure, say Y.
+
config KVM_XEN
bool "Support for Xen hypercall interface"
depends on KVM
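
The help text above describes what CONFIG_KVM_SMM provides, and the Makefile hunk below only builds smm.o when it is set. One way the rest of KVM can stay buildable with KVM_SMM=n is a header stub that simply fails SMI injection; the sketch below is hypothetical (the actual contents of the new smm.h are not shown in this listing), but kvm_inject_smi() does appear later in the lapic.c hunk, which only counts an SMI as delivered when the call returns 0.

#ifdef CONFIG_KVM_SMM
int kvm_inject_smi(struct kvm_vcpu *vcpu);
#else
/*
 * Hypothetical stub for illustration: with SMM support compiled out,
 * report failure so callers such as __apic_accept_irq() drop the SMI
 * instead of kicking the vCPU.
 */
static inline int kvm_inject_smi(struct kvm_vcpu *vcpu)
{
	return -ENOTTY;
}
#endif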
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f453a0f96e24..80e3fe184d17 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -20,12 +20,14 @@ endif
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
kvm-$(CONFIG_KVM_XEN) += xen.o
+kvm-$(CONFIG_KVM_SMM) += smm.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
- vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
+ vmx/hyperv.o vmx/nested.o vmx/posted_intr.o
kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
-kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
+kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
+ svm/sev.o svm/hyperv.o
ifdef CONFIG_HYPERV
kvm-amd-y += svm/svm_onhyperv.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 62bc7a01cecc..b14653b61470 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -62,10 +62,16 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
* This one is tied to SSB in the user API, and not
* visible in /proc/cpuinfo.
*/
-#define KVM_X86_FEATURE_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
+#define KVM_X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
#define F feature_bit
-#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
+
+/* Scattered Flag - For features that are scattered by cpufeatures.h. */
+#define SF(name) \
+({ \
+ BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \
+ (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0); \
+})
/*
* Magic value used by KVM when querying userspace-provided CPUID entries and
@@ -543,9 +549,9 @@ static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf)
}
static __always_inline
-void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
+void kvm_cpu_cap_init_kvm_defined(enum kvm_only_cpuid_leafs leaf, u32 mask)
{
- /* Use kvm_cpu_cap_mask for non-scattered leafs. */
+ /* Use kvm_cpu_cap_mask for leafs that aren't KVM-only. */
BUILD_BUG_ON(leaf < NCAPINTS);
kvm_cpu_caps[leaf] = mask;
@@ -555,7 +561,7 @@ void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
{
- /* Use kvm_cpu_cap_init_scattered for scattered leafs. */
+ /* Use kvm_cpu_cap_init_kvm_defined for KVM-only leafs. */
BUILD_BUG_ON(leaf >= NCAPINTS);
kvm_cpu_caps[leaf] &= mask;
@@ -657,15 +663,20 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
kvm_cpu_cap_mask(CPUID_7_1_EAX,
- F(AVX_VNNI) | F(AVX512_BF16)
+ F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(AMX_FP16) |
+ F(AVX_IFMA)
+ );
+
+ kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
+ F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI)
);
kvm_cpu_cap_mask(CPUID_D_1_EAX,
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd
);
- kvm_cpu_cap_init_scattered(CPUID_12_EAX,
- SF(SGX1) | SF(SGX2)
+ kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX,
+ SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
);
kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
@@ -694,7 +705,7 @@ void kvm_set_cpu_caps(void)
F(CLZERO) | F(XSAVEERPTR) |
F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) |
- __feature_bit(KVM_X86_FEATURE_PSFD)
+ __feature_bit(KVM_X86_FEATURE_AMD_PSFD)
);
/*
@@ -913,9 +924,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
goto out;
cpuid_entry_override(entry, CPUID_7_1_EAX);
+ cpuid_entry_override(entry, CPUID_7_1_EDX);
entry->ebx = 0;
entry->ecx = 0;
- entry->edx = 0;
}
break;
case 0xa: { /* Architectural Performance Monitoring */
@@ -1047,9 +1058,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
* userspace. ATTRIBUTES.XFRM is not adjusted as userspace is
* expected to derive it from supported XCR0.
*/
- entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
- SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
- SGX_ATTR_KSS;
+ entry->eax &= SGX_ATTR_PRIV_MASK | SGX_ATTR_UNPRIV_MASK;
entry->ebx &= 0;
break;
/* Intel PT */
@@ -1222,8 +1231,12 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
* Other defined bits are for MSRs that KVM does not expose:
* EAX 3 SPCL, SMM page configuration lock
* EAX 13 PCMSR, Prefetch control MSR
+ *
+ * KVM doesn't support SMM_CTL.
+ * EAX 9 SMM_CTL MSR is not supported
*/
entry->eax &= BIT(0) | BIT(2) | BIT(6);
+ entry->eax |= BIT(9);
if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC))
entry->eax |= BIT(2);
if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4a43261d25a2..5cc3efa0e21c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -242,37 +242,6 @@ enum x86_transfer_type {
X86_TRANSFER_TASK_SWITCH,
};
-static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
-{
- if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
- nr &= NR_EMULATOR_GPRS - 1;
-
- if (!(ctxt->regs_valid & (1 << nr))) {
- ctxt->regs_valid |= 1 << nr;
- ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
- }
- return ctxt->_regs[nr];
-}
-
-static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
-{
- if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
- nr &= NR_EMULATOR_GPRS - 1;
-
- BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
- BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
-
- ctxt->regs_valid |= 1 << nr;
- ctxt->regs_dirty |= 1 << nr;
- return &ctxt->_regs[nr];
-}
-
-static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
-{
- reg_read(ctxt, nr);
- return reg_write(ctxt, nr);
-}
-
static void writeback_registers(struct x86_emulate_ctxt *ctxt)
{
unsigned long dirty = ctxt->regs_dirty;
@@ -2338,335 +2307,15 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
return rc;
}
-static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
-{
-#ifdef CONFIG_X86_64
- return ctxt->ops->guest_has_long_mode(ctxt);
-#else
- return false;
-#endif
-}
-
-static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
-{
- desc->g = (flags >> 23) & 1;
- desc->d = (flags >> 22) & 1;
- desc->l = (flags >> 21) & 1;
- desc->avl = (flags >> 20) & 1;
- desc->p = (flags >> 15) & 1;
- desc->dpl = (flags >> 13) & 3;
- desc->s = (flags >> 12) & 1;
- desc->type = (flags >> 8) & 15;
-}
-
-static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate,
- int n)
-{
- struct desc_struct desc;
- int offset;
- u16 selector;
-
- selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4);
-
- if (n < 3)
- offset = 0x7f84 + n * 12;
- else
- offset = 0x7f2c + (n - 3) * 12;
-
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8));
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4));
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, offset));
- ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
- return X86EMUL_CONTINUE;
-}
-
-#ifdef CONFIG_X86_64
-static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
- int n)
-{
- struct desc_struct desc;
- int offset;
- u16 selector;
- u32 base3;
-
- offset = 0x7e00 + n * 16;
-
- selector = GET_SMSTATE(u16, smstate, offset);
- rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smstate, offset + 2) << 8);
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4));
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8));
- base3 = GET_SMSTATE(u32, smstate, offset + 12);
-
- ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
- return X86EMUL_CONTINUE;
-}
-#endif
-
-static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
- u64 cr0, u64 cr3, u64 cr4)
-{
- int bad;
- u64 pcid;
-
- /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
- pcid = 0;
- if (cr4 & X86_CR4_PCIDE) {
- pcid = cr3 & 0xfff;
- cr3 &= ~0xfff;
- }
-
- bad = ctxt->ops->set_cr(ctxt, 3, cr3);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
-
- /*
- * First enable PAE, long mode needs it before CR0.PG = 1 is set.
- * Then enable protected mode. However, PCID cannot be enabled
- * if EFER.LMA=0, so set it separately.
- */
- bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
-
- bad = ctxt->ops->set_cr(ctxt, 0, cr0);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
-
- if (cr4 & X86_CR4_PCIDE) {
- bad = ctxt->ops->set_cr(ctxt, 4, cr4);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
- if (pcid) {
- bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
- }
-
- }
-
- return X86EMUL_CONTINUE;
-}
-
-static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
- const char *smstate)
-{
- struct desc_struct desc;
- struct desc_ptr dt;
- u16 selector;
- u32 val, cr0, cr3, cr4;
- int i;
-
- cr0 = GET_SMSTATE(u32, smstate, 0x7ffc);
- cr3 = GET_SMSTATE(u32, smstate, 0x7ff8);
- ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED;
- ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0);
-
- for (i = 0; i < 8; i++)
- *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4);
-
- val = GET_SMSTATE(u32, smstate, 0x7fcc);
-
- if (ctxt->ops->set_dr(ctxt, 6, val))
- return X86EMUL_UNHANDLEABLE;
-
- val = GET_SMSTATE(u32, smstate, 0x7fc8);
-
- if (ctxt->ops->set_dr(ctxt, 7, val))
- return X86EMUL_UNHANDLEABLE;
-
- selector = GET_SMSTATE(u32, smstate, 0x7fc4);
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64));
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f60));
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f5c));
- ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR);
-
- selector = GET_SMSTATE(u32, smstate, 0x7fc0);
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f80));
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f7c));
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f78));
- ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR);
-
- dt.address = GET_SMSTATE(u32, smstate, 0x7f74);
- dt.size = GET_SMSTATE(u32, smstate, 0x7f70);
- ctxt->ops->set_gdt(ctxt, &dt);
-
- dt.address = GET_SMSTATE(u32, smstate, 0x7f58);
- dt.size = GET_SMSTATE(u32, smstate, 0x7f54);
- ctxt->ops->set_idt(ctxt, &dt);
-
- for (i = 0; i < 6; i++) {
- int r = rsm_load_seg_32(ctxt, smstate, i);
- if (r != X86EMUL_CONTINUE)
- return r;
- }
-
- cr4 = GET_SMSTATE(u32, smstate, 0x7f14);
-
- ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7ef8));
-
- return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
-}
-
-#ifdef CONFIG_X86_64
-static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
- const char *smstate)
-{
- struct desc_struct desc;
- struct desc_ptr dt;
- u64 val, cr0, cr3, cr4;
- u32 base3;
- u16 selector;
- int i, r;
-
- for (i = 0; i < 16; i++)
- *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8);
-
- ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78);
- ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED;
-
- val = GET_SMSTATE(u64, smstate, 0x7f68);
-
- if (ctxt->ops->set_dr(ctxt, 6, val))
- return X86EMUL_UNHANDLEABLE;
-
- val = GET_SMSTATE(u64, smstate, 0x7f60);
-
- if (ctxt->ops->set_dr(ctxt, 7, val))
- return X86EMUL_UNHANDLEABLE;
-
- cr0 = GET_SMSTATE(u64, smstate, 0x7f58);
- cr3 = GET_SMSTATE(u64, smstate, 0x7f50);
- cr4 = GET_SMSTATE(u64, smstate, 0x7f48);
- ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00));
- val = GET_SMSTATE(u64, smstate, 0x7ed0);
-
- if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA))
- return X86EMUL_UNHANDLEABLE;
-
- selector = GET_SMSTATE(u32, smstate, 0x7e90);
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8);
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e94));
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e98));
- base3 = GET_SMSTATE(u32, smstate, 0x7e9c);
- ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR);
-
- dt.size = GET_SMSTATE(u32, smstate, 0x7e84);
- dt.address = GET_SMSTATE(u64, smstate, 0x7e88);
- ctxt->ops->set_idt(ctxt, &dt);
-
- selector = GET_SMSTATE(u32, smstate, 0x7e70);
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e72) << 8);
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e74));
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e78));
- base3 = GET_SMSTATE(u32, smstate, 0x7e7c);
- ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR);
-
- dt.size = GET_SMSTATE(u32, smstate, 0x7e64);
- dt.address = GET_SMSTATE(u64, smstate, 0x7e68);
- ctxt->ops->set_gdt(ctxt, &dt);
-
- r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
- if (r != X86EMUL_CONTINUE)
- return r;
-
- for (i = 0; i < 6; i++) {
- r = rsm_load_seg_64(ctxt, smstate, i);
- if (r != X86EMUL_CONTINUE)
- return r;
- }
-
- return X86EMUL_CONTINUE;
-}
-#endif
-
static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
- unsigned long cr0, cr4, efer;
- char buf[512];
- u64 smbase;
- int ret;
-
if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0)
return emulate_ud(ctxt);
- smbase = ctxt->ops->get_smbase(ctxt);
-
- ret = ctxt->ops->read_phys(ctxt, smbase + 0xfe00, buf, sizeof(buf));
- if (ret != X86EMUL_CONTINUE)
- return X86EMUL_UNHANDLEABLE;
-
- if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
- ctxt->ops->set_nmi_mask(ctxt, false);
-
- ctxt->ops->exiting_smm(ctxt);
-
- /*
- * Get back to real mode, to prepare a safe state in which to load
- * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
- * supports long mode.
- */
- if (emulator_has_longmode(ctxt)) {
- struct desc_struct cs_desc;
-
- /* Zero CR4.PCIDE before CR0.PG. */
- cr4 = ctxt->ops->get_cr(ctxt, 4);
- if (cr4 & X86_CR4_PCIDE)
- ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
-
- /* A 32-bit code segment is required to clear EFER.LMA. */
- memset(&cs_desc, 0, sizeof(cs_desc));
- cs_desc.type = 0xb;
- cs_desc.s = cs_desc.g = cs_desc.p = 1;
- ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS);
- }
-
- /* For the 64-bit case, this will clear EFER.LMA. */
- cr0 = ctxt->ops->get_cr(ctxt, 0);
- if (cr0 & X86_CR0_PE)
- ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
-
- if (emulator_has_longmode(ctxt)) {
- /* Clear CR4.PAE before clearing EFER.LME. */
- cr4 = ctxt->ops->get_cr(ctxt, 4);
- if (cr4 & X86_CR4_PAE)
- ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
-
- /* And finally go back to 32-bit mode. */
- efer = 0;
- ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
- }
-
- /*
- * Give leave_smm() a chance to make ISA-specific changes to the vCPU
- * state (e.g. enter guest mode) before loading state from the SMM
- * state-save area.
- */
- if (ctxt->ops->leave_smm(ctxt, buf))
- goto emulate_shutdown;
-
-#ifdef CONFIG_X86_64
- if (emulator_has_longmode(ctxt))
- ret = rsm_load_state_64(ctxt, buf);
- else
-#endif
- ret = rsm_load_state_32(ctxt, buf);
-
- if (ret != X86EMUL_CONTINUE)
- goto emulate_shutdown;
+ if (ctxt->ops->leave_smm(ctxt))
+ ctxt->ops->triple_fault(ctxt);
- /*
- * Note, the ctxt->ops callbacks are responsible for handling side
- * effects when writing MSRs and CRs, e.g. MMU context resets, CPUID
- * runtime updates, etc... If that changes, e.g. this flow is moved
- * out of the emulator to make it look more like enter_smm(), then
- * those side effects need to be explicitly handled for both success
- * and shutdown.
- */
return emulator_recalc_and_set_mode(ctxt);
-
-emulate_shutdown:
- ctxt->ops->triple_fault(ctxt);
- return X86EMUL_CONTINUE;
}
static void
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 0adf4a437e85..2c7f2a26421e 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -23,22 +23,25 @@
#include "ioapic.h"
#include "cpuid.h"
#include "hyperv.h"
+#include "mmu.h"
#include "xen.h"
#include <linux/cpu.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/sched/cputime.h>
+#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <asm/apicdef.h>
+#include <asm/mshyperv.h>
#include <trace/events/kvm.h>
#include "trace.h"
#include "irq.h"
#include "fpu.h"
-#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
+#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK)
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
bool vcpu_kick);
@@ -897,13 +900,15 @@ bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled);
-bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
- struct hv_vp_assist_page *assist_page)
+int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu)
{
- if (!kvm_hv_assist_page_enabled(vcpu))
- return false;
- return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
- assist_page, sizeof(*assist_page));
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu))
+ return -EFAULT;
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+ &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page));
}
EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page);
@@ -954,6 +959,11 @@ int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
hv_vcpu->vp_index = vcpu->vcpu_idx;
+ for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) {
+ INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries);
+ spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock);
+ }
+
return 0;
}
@@ -1736,6 +1746,28 @@ static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks,
}
}
+static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[])
+{
+ int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK;
+ unsigned long sbank;
+
+ if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask))
+ return false;
+
+ /*
+ * The index into the sparse bank is the number of preceding bits in
+ * the valid mask. Optimize for VMs with <64 vCPUs by skipping the
+ * fancy math if there can't possibly be preceding bits.
+ */
+ if (valid_bit_nr)
+ sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0));
+ else
+ sbank = 0;
+
+ return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK,
+ (unsigned long *)&sparse_banks[sbank]);
+}
+
struct kvm_hv_hcall {
u64 param;
u64 ingpa;
@@ -1749,57 +1781,173 @@ struct kvm_hv_hcall {
sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS];
};
-static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
- int consumed_xmm_halves,
- u64 *sparse_banks, gpa_t offset)
-{
- u16 var_cnt;
- int i;
-
- if (hc->var_cnt > 64)
- return -EINVAL;
- /* Ignore banks that cannot possibly contain a legal VP index. */
- var_cnt = min_t(u16, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS);
+static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ u16 orig_cnt, u16 cnt_cap, u64 *data,
+ int consumed_xmm_halves, gpa_t offset)
+{
+ /*
+ * Preserve the original count when ignoring entries via a "cap", KVM
+ * still needs to validate the guest input (though the non-XMM path
+ * punts on the checks).
+ */
+ u16 cnt = min(orig_cnt, cnt_cap);
+ int i, j;
if (hc->fast) {
/*
* Each XMM holds two sparse banks, but do not count halves that
* have already been consumed for hypercall parameters.
*/
- if (hc->var_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves)
+ if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves)
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- for (i = 0; i < var_cnt; i++) {
- int j = i + consumed_xmm_halves;
+
+ for (i = 0; i < cnt; i++) {
+ j = i + consumed_xmm_halves;
if (j % 2)
- sparse_banks[i] = sse128_hi(hc->xmm[j / 2]);
+ data[i] = sse128_hi(hc->xmm[j / 2]);
else
- sparse_banks[i] = sse128_lo(hc->xmm[j / 2]);
+ data[i] = sse128_lo(hc->xmm[j / 2]);
}
return 0;
}
- return kvm_read_guest(kvm, hc->ingpa + offset, sparse_banks,
- var_cnt * sizeof(*sparse_banks));
+ return kvm_read_guest(kvm, hc->ingpa + offset, data,
+ cnt * sizeof(*data));
+}
+
+static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ u64 *sparse_banks, int consumed_xmm_halves,
+ gpa_t offset)
+{
+ if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS)
+ return -EINVAL;
+
+ /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */
+ return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS,
+ sparse_banks, consumed_xmm_halves, offset);
+}
+
+static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[],
+ int consumed_xmm_halves, gpa_t offset)
+{
+ return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt,
+ entries, consumed_xmm_halves, offset);
+}
+
+static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo,
+ u64 *entries, int count)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY;
+
+ if (!hv_vcpu)
+ return;
+
+ spin_lock(&tlb_flush_fifo->write_lock);
+
+ /*
+ * All entries should fit on the fifo leaving one free for 'flush all'
+ * entry in case another request comes in. In case there's not enough
+ * space, just put 'flush all' entry there.
+ */
+ if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) {
+ WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count);
+ goto out_unlock;
+ }
+
+ /*
+ * Note: full fifo always contains 'flush all' entry, no need to check the
+ * return value.
+ */
+ kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1);
+
+out_unlock:
+ spin_unlock(&tlb_flush_fifo->write_lock);
+}
+
+int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE];
+ int i, j, count;
+ gva_t gva;
+
+ if (!tdp_enabled || !hv_vcpu)
+ return -EINVAL;
+
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
+
+ count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE);
+
+ for (i = 0; i < count; i++) {
+ if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY)
+ goto out_flush_all;
+
+ /*
+ * Lower 12 bits of 'address' encode the number of additional
+ * pages to flush.
+ */
+ gva = entries[i] & PAGE_MASK;
+ for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++)
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE);
+
+ ++vcpu->stat.tlb_flush;
+ }
+ return 0;
+
+out_flush_all:
+ kfifo_reset_out(&tlb_flush_fifo->entries);
+
+ /* Fall back to full flush. */
+ return -ENOSPC;
}
static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 *sparse_banks = hv_vcpu->sparse_banks;
struct kvm *kvm = vcpu->kvm;
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+ /*
+ * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE'
+ * entries on the TLB flush fifo. The last entry, however, needs to be
+ * always left free for 'flush all' entry which gets placed when
+ * there is not enough space to put all the requested entries.
+ */
+ u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1];
+ u64 *tlb_flush_entries;
u64 valid_bank_mask;
- u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
+ struct kvm_vcpu *v;
+ unsigned long i;
bool all_cpus;
+ int consumed_xmm_halves = 0;
+ gpa_t data_offset;
/*
- * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the
- * valid mask is a u64. Fail the build if KVM's max allowed number of
- * vCPUs (>4096) would exceed this limit, KVM will additional changes
- * for Hyper-V support to avoid setting the guest up to fail.
+ * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS
+ * sparse banks. Fail the build if KVM's max allowed number of
+ * vCPUs (>4096) exceeds this limit.
*/
- BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64);
+ BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS);
+
+ /*
+ * 'Slow' hypercall's first parameter is the address in guest's memory
+ * where hypercall parameters are placed. This is either a GPA or a
+ * nested GPA when KVM is handling the call from L2 ('direct' TLB
+ * flush). Translate the address here so the memory can be uniformly
+ * read with kvm_read_guest().
+ */
+ if (!hc->fast && is_guest_mode(vcpu)) {
+ hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL);
+ if (unlikely(hc->ingpa == INVALID_GPA))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) {
@@ -1807,14 +1955,17 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
flush.address_space = hc->ingpa;
flush.flags = hc->outgpa;
flush.processor_mask = sse128_lo(hc->xmm[0]);
+ consumed_xmm_halves = 1;
} else {
if (unlikely(kvm_read_guest(kvm, hc->ingpa,
&flush, sizeof(flush))))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ data_offset = sizeof(flush);
}
trace_kvm_hv_flush_tlb(flush.processor_mask,
- flush.address_space, flush.flags);
+ flush.address_space, flush.flags,
+ is_guest_mode(vcpu));
valid_bank_mask = BIT_ULL(0);
sparse_banks[0] = flush.processor_mask;
@@ -1834,16 +1985,18 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
flush_ex.flags = hc->outgpa;
memcpy(&flush_ex.hv_vp_set,
&hc->xmm[0], sizeof(hc->xmm[0]));
+ consumed_xmm_halves = 2;
} else {
if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex,
sizeof(flush_ex))))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ data_offset = sizeof(flush_ex);
}
trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
flush_ex.hv_vp_set.format,
flush_ex.address_space,
- flush_ex.flags);
+ flush_ex.flags, is_guest_mode(vcpu));
valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask;
all_cpus = flush_ex.hv_vp_set.format !=
@@ -1852,29 +2005,95 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
if (hc->var_cnt != hweight64(valid_bank_mask))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- if (all_cpus)
- goto do_flush;
+ if (!all_cpus) {
+ if (!hc->var_cnt)
+ goto ret_success;
- if (!hc->var_cnt)
- goto ret_success;
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks,
+ consumed_xmm_halves, data_offset))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
- if (kvm_get_sparse_vp_set(kvm, hc, 2, sparse_banks,
- offsetof(struct hv_tlb_flush_ex,
- hv_vp_set.bank_contents)))
+ /*
+ * Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU
+ * banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs'
+ * case (HV_GENERIC_SET_ALL). Always adjust data_offset and
+ * consumed_xmm_halves to make sure TLB flush entries are read
+ * from the correct offset.
+ */
+ data_offset += hc->var_cnt * sizeof(sparse_banks[0]);
+ consumed_xmm_halves += hc->var_cnt;
+ }
+
+ if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
+ hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
+ hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) {
+ tlb_flush_entries = NULL;
+ } else {
+ if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries,
+ consumed_xmm_halves, data_offset))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ tlb_flush_entries = __tlb_flush_entries;
}
-do_flush:
/*
* vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
* analyze it here, flush TLB regardless of the specified address space.
*/
- if (all_cpus) {
- kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST);
- } else {
+ if (all_cpus && !is_guest_mode(vcpu)) {
+ kvm_for_each_vcpu(i, v, kvm) {
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH);
+ } else if (!is_guest_mode(vcpu)) {
sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
- kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, vcpu_mask);
+ for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) {
+ v = kvm_get_vcpu(kvm, i);
+ if (!v)
+ continue;
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
+ } else {
+ struct kvm_vcpu_hv *hv_v;
+
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
+
+ kvm_for_each_vcpu(i, v, kvm) {
+ hv_v = to_hv_vcpu(v);
+
+ /*
+ * The following check races with nested vCPUs entering/exiting
+ * and/or migrating between L1's vCPUs, however the only case when
+ * KVM *must* flush the TLB is when the target L2 vCPU keeps
+ * running on the same L1 vCPU from the moment of the request until
+ * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other
+ * cases, e.g. when the target L2 vCPU migrates to a different L1
+ * vCPU or when the corresponding L1 vCPU temporary switches to a
+ * different L2 vCPU while the request is being processed.
+ */
+ if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id)
+ continue;
+
+ if (!all_cpus &&
+ !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask,
+ sparse_banks))
+ continue;
+
+ __set_bit(i, vcpu_mask);
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
}
ret_success:
@@ -1883,8 +2102,8 @@ ret_success:
((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
}
-static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
- unsigned long *vcpu_bitmap)
+static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector,
+ u64 *sparse_banks, u64 valid_bank_mask)
{
struct kvm_lapic_irq irq = {
.delivery_mode = APIC_DM_FIXED,
@@ -1894,7 +2113,9 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
unsigned long i;
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
+ if (sparse_banks &&
+ !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu),
+ valid_bank_mask, sparse_banks))
continue;
/* We fail only when APIC is disabled */
@@ -1904,12 +2125,12 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 *sparse_banks = hv_vcpu->sparse_banks;
struct kvm *kvm = vcpu->kvm;
struct hv_send_ipi_ex send_ipi_ex;
struct hv_send_ipi send_ipi;
- DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
u64 valid_bank_mask;
- u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
u32 vector;
bool all_cpus;
@@ -1959,7 +2180,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
if (!hc->var_cnt)
goto ret_success;
- if (kvm_get_sparse_vp_set(kvm, hc, 1, sparse_banks,
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, 1,
offsetof(struct hv_send_ipi_ex,
vp_set.bank_contents)))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
@@ -1969,13 +2190,10 @@ check_and_send_ipi:
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- if (all_cpus) {
- kvm_send_ipi_to_many(kvm, vector, NULL);
- } else {
- sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
-
- kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
- }
+ if (all_cpus)
+ kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0);
+ else
+ kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask);
ret_success:
return HV_STATUS_SUCCESS;
@@ -2062,10 +2280,25 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
{
+ u32 tlb_lock_count = 0;
+ int ret;
+
+ if (hv_result_success(result) && is_guest_mode(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu) &&
+ kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa,
+ &tlb_lock_count, sizeof(tlb_lock_count)))
+ result = HV_STATUS_INVALID_HYPERCALL_INPUT;
+
trace_kvm_hv_hypercall_done(result);
kvm_hv_hypercall_set_result(vcpu, result);
++vcpu->stat.hypercalls;
- return kvm_skip_emulated_instruction(vcpu);
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ if (tlb_lock_count)
+ kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu);
+
+ return ret;
}
static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
@@ -2502,6 +2735,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->ebx |= HV_DEBUGGING;
ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE;
ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
+ ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH;
/*
* Direct Synthetic timers only make sense with in-kernel
@@ -2545,6 +2779,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
case HYPERV_CPUID_NESTED_FEATURES:
ent->eax = evmcs_ver;
+ ent->eax |= HV_X64_NESTED_DIRECT_FLUSH;
ent->eax |= HV_X64_NESTED_MSR_BITMAP;
ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL;
break;
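
The hunks above add a per-vCPU TLB-flush kfifo and always keep one slot in reserve so that a "flush all" marker can still be queued when the guest's entries do not fit (see hv_tlb_flush_enqueue()). Below is a minimal sketch of that reserve-one-slot pattern using the same kfifo and spinlock primitives as the patch; the demo_* names, DEMO_FIFO_SIZE and DEMO_FLUSH_ALL are made up for illustration and are not part of this series.

#include <linux/kfifo.h>
#include <linux/spinlock.h>
#include <linux/types.h>

#define DEMO_FIFO_SIZE	16		/* kfifo sizes must be a power of two */
#define DEMO_FLUSH_ALL	(-1ULL)		/* sentinel, like KVM_HV_TLB_FLUSHALL_ENTRY */

struct demo_flush_fifo {
	DECLARE_KFIFO(entries, u64, DEMO_FIFO_SIZE);
	spinlock_t write_lock;
};

static void demo_fifo_init(struct demo_flush_fifo *fifo)
{
	/* Mirrors what kvm_hv_vcpu_init() does in the hunk above. */
	INIT_KFIFO(fifo->entries);
	spin_lock_init(&fifo->write_lock);
}

static void demo_enqueue(struct demo_flush_fifo *fifo, u64 *entries, int count)
{
	u64 flush_all = DEMO_FLUSH_ALL;

	spin_lock(&fifo->write_lock);

	/* Keep one slot free so a later "flush all" marker always fits. */
	if (count && entries && count < kfifo_avail(&fifo->entries)) {
		kfifo_in(&fifo->entries, entries, count);
		goto out_unlock;
	}

	/* Not enough room: degrade to a single "flush everything" entry. */
	kfifo_in(&fifo->entries, &flush_all, 1);

out_unlock:
	spin_unlock(&fifo->write_lock);
}

On the consuming side, kvm_hv_vcpu_flush_tlb() above drains the fifo with kfifo_out(), and falls back to a full flush (resetting the fifo) as soon as it sees the sentinel entry.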
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 1030b1b50552..9f96414a31c5 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -22,6 +22,7 @@
#define __ARCH_X86_KVM_HYPERV_H__
#include <linux/kvm_host.h>
+#include "x86.h"
/* "Hv#1" signature */
#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
@@ -107,8 +108,7 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
-bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
- struct hv_vp_assist_page *assist_page);
+int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu);
static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu,
int timer_index)
@@ -151,4 +151,64 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
+static inline struct kvm_vcpu_hv_tlb_flush_fifo *kvm_hv_get_tlb_flush_fifo(struct kvm_vcpu *vcpu,
+ bool is_guest_mode)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ int i = is_guest_mode ? HV_L2_TLB_FLUSH_FIFO :
+ HV_L1_TLB_FLUSH_FIFO;
+
+ return &hv_vcpu->tlb_flush_fifo[i];
+}
+
+static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+
+ if (!to_hv_vcpu(vcpu) || !kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+ return;
+
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
+
+ kfifo_reset_out(&tlb_flush_fifo->entries);
+}
+
+static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu &&
+ (hv_vcpu->cpuid_cache.nested_eax & HV_X64_NESTED_DIRECT_FLUSH);
+}
+
+static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u16 code;
+
+ if (!hv_vcpu)
+ return false;
+
+ code = is_64_bit_hypercall(vcpu) ? kvm_rcx_read(vcpu) :
+ kvm_rax_read(vcpu);
+
+ return (code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX);
+}
+
+static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu)
+{
+ if (!to_hv_vcpu(vcpu))
+ return 0;
+
+ if (!kvm_hv_assist_page_enabled(vcpu))
+ return 0;
+
+ return kvm_hv_get_assist_page(vcpu);
+}
+
+int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu);
+
#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index f371f1292ca3..a70952eca905 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -31,7 +31,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
return r;
}
-EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
/*
* check if there is a pending userspace external interrupt
@@ -150,7 +149,6 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
if (kvm_xen_timer_enabled(vcpu))
kvm_xen_inject_timer_irqs(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
@@ -165,3 +163,8 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
}
+
+bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 3febc342360c..c09174f73a34 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -200,9 +200,4 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
return vcpu->arch.hflags & HF_GUEST_MASK;
}
-static inline bool is_smm(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.hflags & HF_SMM_MASK;
-}
-
#endif
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 89246446d6aa..2d9662be8333 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -117,16 +117,6 @@ struct x86_emulate_ops {
struct x86_exception *fault, bool system);
/*
- * read_phys: Read bytes of standard (non-emulated/special) memory.
- * Used for descriptor reading.
- * @addr: [IN ] Physical address from which to read.
- * @val: [OUT] Value read from memory.
- * @bytes: [IN ] Number of bytes to read from memory.
- */
- int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr,
- void *val, unsigned int bytes);
-
- /*
* write_std: Write bytes of standard (non-emulated/special) memory.
* Used for descriptor writing.
* @addr: [IN ] Linear address to which to write.
@@ -209,11 +199,8 @@ struct x86_emulate_ops {
int (*cpl)(struct x86_emulate_ctxt *ctxt);
void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
- u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
- void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
- int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
@@ -234,8 +221,7 @@ struct x86_emulate_ops {
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
- void (*exiting_smm)(struct x86_emulate_ctxt *ctxt);
- int (*leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate);
+ int (*leave_smm)(struct x86_emulate_ctxt *ctxt);
void (*triple_fault)(struct x86_emulate_ctxt *ctxt);
int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr);
};
@@ -292,7 +278,6 @@ enum x86emul_mode {
/* These match some of the HF_* flags defined in kvm_host.h */
#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
#define X86EMUL_SMM_MASK (1 << 6)
-#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7)
/*
* fastop functions are declared as taking a never-defined fastop parameter,
@@ -526,4 +511,35 @@ void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt);
+static inline ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
+ nr &= NR_EMULATOR_GPRS - 1;
+
+ if (!(ctxt->regs_valid & (1 << nr))) {
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
+ }
+ return ctxt->_regs[nr];
+}
+
+static inline ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
+ nr &= NR_EMULATOR_GPRS - 1;
+
+ BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
+ BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
+
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->regs_dirty |= 1 << nr;
+ return &ctxt->_regs[nr];
+}
+
+static inline ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ reg_read(ctxt, nr);
+ return reg_write(ctxt, nr);
+}
+
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d7639d126e6c..4efdb4a4d72c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -42,6 +42,7 @@
#include "x86.h"
#include "cpuid.h"
#include "hyperv.h"
+#include "smm.h"
#ifndef CONFIG_X86_64
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -159,7 +160,6 @@ bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
&& !(kvm_mwait_in_guest(vcpu->kvm) ||
kvm_can_post_timer_interrupt(vcpu));
}
-EXPORT_SYMBOL_GPL(kvm_can_use_hv_timer);
static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
{
@@ -1170,9 +1170,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
case APIC_DM_SMI:
- result = 1;
- kvm_make_request(KVM_REQ_SMI, vcpu);
- kvm_vcpu_kick(vcpu);
+ if (!kvm_inject_smi(vcpu)) {
+ kvm_vcpu_kick(vcpu);
+ result = 1;
+ }
break;
case APIC_DM_NMI:
@@ -1912,7 +1913,6 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
static void cancel_hv_timer(struct kvm_lapic *apic)
{
@@ -2430,7 +2430,6 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
apic->isr_count = count_vectors(apic->regs + APIC_ISR);
}
}
-EXPORT_SYMBOL_GPL(kvm_apic_update_apicv);
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
{
@@ -2722,8 +2721,6 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
}
- } else {
- kvm_lapic_xapic_id_updated(vcpu->arch.apic);
}
return 0;
@@ -2759,6 +2756,9 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
}
memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
+ if (!apic_x2apic_mode(apic))
+ kvm_lapic_xapic_id_updated(apic);
+
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
kvm_recalculate_apic_map(vcpu->kvm);
kvm_apic_set_version(vcpu);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a5ac4a5a5179..28e3769066e2 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -7,7 +7,7 @@
#include <linux/kvm_host.h>
#include "hyperv.h"
-#include "kvm_cache_regs.h"
+#include "smm.h"
#define KVM_APIC_INIT 0
#define KVM_APIC_SIPI 1
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 1ccb769f62af..835426254e76 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -22,6 +22,7 @@
#include "tdp_mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
+#include "smm.h"
#include "kvm_emulate.h"
#include "cpuid.h"
#include "spte.h"
@@ -802,15 +803,31 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
}
-void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- if (sp->lpage_disallowed)
+ /*
+ * If it's possible to replace the shadow page with an NX huge page,
+ * i.e. if the shadow page is the only thing currently preventing KVM
+ * from using a huge page, add the shadow page to the list of "to be
+ * zapped for NX recovery" pages. Note, the shadow page can already be
+ * on the list if KVM is reusing an existing shadow page, i.e. if KVM
+ * links a shadow page at multiple points.
+ */
+ if (!list_empty(&sp->possible_nx_huge_page_link))
return;
++kvm->stat.nx_lpage_splits;
- list_add_tail(&sp->lpage_disallowed_link,
- &kvm->arch.lpage_disallowed_mmu_pages);
- sp->lpage_disallowed = true;
+ list_add_tail(&sp->possible_nx_huge_page_link,
+ &kvm->arch.possible_nx_huge_pages);
+}
+
+static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool nx_huge_page_possible)
+{
+ sp->nx_huge_page_disallowed = true;
+
+ if (nx_huge_page_possible)
+ track_possible_nx_huge_page(kvm, sp);
}
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -830,11 +847,20 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
-void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
+ if (list_empty(&sp->possible_nx_huge_page_link))
+ return;
+
--kvm->stat.nx_lpage_splits;
- sp->lpage_disallowed = false;
- list_del(&sp->lpage_disallowed_link);
+ list_del_init(&sp->possible_nx_huge_page_link);
+}
+
+static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ sp->nx_huge_page_disallowed = false;
+
+ untrack_possible_nx_huge_page(kvm, sp);
}
static struct kvm_memory_slot *
@@ -1645,7 +1671,7 @@ static int is_empty_shadow_page(u64 *spt)
u64 *pos;
u64 *end;
- for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
+ for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++)
if (is_shadow_present_pte(*pos)) {
printk(KERN_ERR "%s: %p %llx\n", __func__,
pos, *pos);
@@ -1793,7 +1819,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
continue;
}
- child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(ent);
if (child->unsync_children) {
if (mmu_pages_add(pvec, child, i))
@@ -1894,7 +1920,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
if (sp->role.invalid)
return true;
- /* TDP MMU pages due not use the MMU generation. */
+ /* TDP MMU pages do not use the MMU generation. */
return !sp->tdp_mmu_page &&
unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
@@ -2129,6 +2155,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+ INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
+
/*
* active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
* depends on valid pages being added to the head of the list. See
@@ -2350,7 +2378,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* so we should update the spte at this point to get
* a new sp with the correct access.
*/
- child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(*sptep);
if (child->role.access == direct_access)
return;
@@ -2371,7 +2399,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
if (is_last_spte(pte, sp->role.level)) {
drop_spte(kvm, spte);
} else {
- child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(pte);
drop_parent_pte(child, spte);
/*
@@ -2443,6 +2471,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
{
bool list_unstable, zapped_root = false;
+ lockdep_assert_held_write(&kvm->mmu_lock);
trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
@@ -2486,8 +2515,8 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
zapped_root = !is_obsolete_sp(kvm, sp);
}
- if (sp->lpage_disallowed)
- unaccount_huge_nx_page(kvm, sp);
+ if (sp->nx_huge_page_disallowed)
+ unaccount_nx_huge_page(kvm, sp);
sp->role.invalid = 1;
@@ -2810,7 +2839,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
struct kvm_mmu_page *child;
u64 pte = *sptep;
- child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(pte);
drop_parent_pte(child, sptep);
flush = true;
} else if (pfn != spte_to_pfn(*sptep)) {
@@ -3084,7 +3113,8 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
if (cur_level > PG_LEVEL_4K &&
cur_level == fault->goal_level &&
is_shadow_present_pte(spte) &&
- !is_large_pte(spte)) {
+ !is_large_pte(spte) &&
+ spte_to_child_sp(spte)->nx_huge_page_disallowed) {
/*
* A small SPTE exists for this pfn, but FNAME(fetch)
* and __direct_map would like to create a large PTE
@@ -3126,9 +3156,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
continue;
link_shadow_page(vcpu, it.sptep, sp);
- if (fault->is_tdp && fault->huge_page_disallowed &&
- fault->req_level >= it.level)
- account_huge_nx_page(vcpu->kvm, sp);
+ if (fault->huge_page_disallowed)
+ account_nx_huge_page(vcpu->kvm, sp,
+ fault->req_level >= it.level);
}
if (WARN_ON_ONCE(it.level != fault->goal_level))
@@ -3148,8 +3178,13 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
}
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
{
+ if (is_sigpending_pfn(pfn)) {
+ kvm_handle_signal_exit(vcpu);
+ return -EINTR;
+ }
+
/*
* Do not cache the mmio info caused by writing the readonly gfn
* into the spte otherwise read access on readonly gfn also can
@@ -3171,7 +3206,7 @@ static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fau
{
/* The pfn is invalid, report the error! */
if (unlikely(is_error_pfn(fault->pfn)))
- return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
+ return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn);
if (unlikely(!fault->slot)) {
gva_t gva = fault->is_tdp ? 0 : fault->addr;
@@ -3422,7 +3457,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
if (!VALID_PAGE(*root_hpa))
return;
- sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK);
+ /*
+ * The "root" may be a special root, e.g. a PAE entry, treat it as a
+ * SPTE to ensure any non-PA bits are dropped.
+ */
+ sp = spte_to_child_sp(*root_hpa);
if (WARN_ON(!sp))
return;
@@ -3907,8 +3946,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu->pae_root[i];
if (IS_VALID_PAE_ROOT(root)) {
- root &= SPTE_BASE_ADDR_MASK;
- sp = to_shadow_page(root);
+ sp = spte_to_child_sp(root);
mmu_sync_children(vcpu, sp, true);
}
}
@@ -4169,7 +4207,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
}
async = false;
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
fault->write, &fault->map_writable,
&fault->hva);
if (!async)
@@ -4186,7 +4224,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
}
}
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
+ /*
+ * Allow gup to bail on pending non-fatal signals when it's also allowed
+ * to wait for IO. Note, gup always bails if it is unable to quickly
+ * get a page and a fatal signal, i.e. SIGKILL, is pending.
+ */
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
fault->write, &fault->map_writable,
&fault->hva);
return RET_PF_CONTINUE;
@@ -4262,14 +4305,14 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (is_page_fault_stale(vcpu, fault, mmu_seq))
goto out_unlock;
- r = make_mmu_pages_available(vcpu);
- if (r)
- goto out_unlock;
-
- if (is_tdp_mmu_fault)
+ if (is_tdp_mmu_fault) {
r = kvm_tdp_mmu_map(vcpu, fault);
- else
+ } else {
+ r = make_mmu_pages_available(vcpu);
+ if (r)
+ goto out_unlock;
r = __direct_map(vcpu, fault);
+ }
out_unlock:
if (is_tdp_mmu_fault)
@@ -5971,7 +6014,7 @@ int kvm_mmu_init_vm(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
- INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
r = kvm_mmu_init_tdp_mmu(kvm);
@@ -6656,7 +6699,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
kvm_mmu_zap_all_fast(kvm);
mutex_unlock(&kvm->slots_lock);
- wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
}
mutex_unlock(&kvm_lock);
}
@@ -6788,7 +6831,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
- wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
mutex_unlock(&kvm_lock);
}
@@ -6796,9 +6839,10 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
return err;
}
-static void kvm_recover_nx_lpages(struct kvm *kvm)
+static void kvm_recover_nx_huge_pages(struct kvm *kvm)
{
unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
+ struct kvm_memory_slot *slot;
int rcu_idx;
struct kvm_mmu_page *sp;
unsigned int ratio;
@@ -6819,24 +6863,55 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
for ( ; to_zap; --to_zap) {
- if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
+ if (list_empty(&kvm->arch.possible_nx_huge_pages))
break;
/*
* We use a separate list instead of just using active_mmu_pages
- * because the number of lpage_disallowed pages is expected to
- * be relatively small compared to the total.
+ * because the number of shadow pages that be replaced with an
+ * NX huge page is expected to be relatively small compared to
+ * the total number of shadow pages. And because the TDP MMU
+ * doesn't use active_mmu_pages.
*/
- sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+ sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
struct kvm_mmu_page,
- lpage_disallowed_link);
- WARN_ON_ONCE(!sp->lpage_disallowed);
- if (is_tdp_mmu_page(sp)) {
+ possible_nx_huge_page_link);
+ WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
+ WARN_ON_ONCE(!sp->role.direct);
+
+ /*
+ * Unaccount and do not attempt to recover any NX Huge Pages
+ * that are being dirty tracked, as they would just be faulted
+ * back in as 4KiB pages. The NX Huge Pages in this slot will be
+ * recovered, along with all the other huge pages in the slot,
+ * when dirty logging is disabled.
+ *
+ * Since gfn_to_memslot() is relatively expensive, it helps to
+ * skip it if it the test cannot possibly return true. On the
+ * other hand, if any memslot has logging enabled, chances are
+ * good that all of them do, in which case unaccount_nx_huge_page()
+ * is much cheaper than zapping the page.
+ *
+ * If a memslot update is in progress, reading an incorrect value
+ * of kvm->nr_memslots_dirty_logging is not a problem: if it is
+ * becoming zero, gfn_to_memslot() will be done unnecessarily; if
+ * it is becoming nonzero, the page will be zapped unnecessarily.
+ * Either way, this only affects efficiency in racy situations,
+ * and not correctness.
+ */
+ slot = NULL;
+ if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
+ slot = gfn_to_memslot(kvm, sp->gfn);
+ WARN_ON_ONCE(!slot);
+ }
+
+ if (slot && kvm_slot_dirty_track_enabled(slot))
+ unaccount_nx_huge_page(kvm, sp);
+ else if (is_tdp_mmu_page(sp))
flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
- } else {
+ else
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- WARN_ON_ONCE(sp->lpage_disallowed);
- }
+ WARN_ON_ONCE(sp->nx_huge_page_disallowed);
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
@@ -6856,7 +6931,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, rcu_idx);
}
-static long get_nx_lpage_recovery_timeout(u64 start_time)
+static long get_nx_huge_page_recovery_timeout(u64 start_time)
{
bool enabled;
uint period;
@@ -6867,19 +6942,19 @@ static long get_nx_lpage_recovery_timeout(u64 start_time)
: MAX_SCHEDULE_TIMEOUT;
}
-static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
{
u64 start_time;
long remaining_time;
while (true) {
start_time = get_jiffies_64();
- remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ remaining_time = get_nx_huge_page_recovery_timeout(start_time);
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop() && remaining_time > 0) {
schedule_timeout(remaining_time);
- remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ remaining_time = get_nx_huge_page_recovery_timeout(start_time);
set_current_state(TASK_INTERRUPTIBLE);
}
@@ -6888,7 +6963,7 @@ static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
if (kthread_should_stop())
return 0;
- kvm_recover_nx_lpages(kvm);
+ kvm_recover_nx_huge_pages(kvm);
}
}
@@ -6896,17 +6971,17 @@ int kvm_mmu_post_init_vm(struct kvm *kvm)
{
int err;
- err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+ err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
"kvm-nx-lpage-recovery",
- &kvm->arch.nx_lpage_recovery_thread);
+ &kvm->arch.nx_huge_page_recovery_thread);
if (!err)
- kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+ kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
return err;
}
void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
{
- if (kvm->arch.nx_lpage_recovery_thread)
- kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+ if (kvm->arch.nx_huge_page_recovery_thread)
+ kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
}
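As an aside on the recovery loop above: each pass of kvm_recover_nx_huge_pages() zaps roughly 1/nx_huge_pages_recovery_ratio of the shadow pages currently standing in for NX huge pages, and a ratio of zero disables recovery entirely. Below is a minimal userspace sketch of just that batch-size arithmetic; the helper name nx_recovery_batch() is made up for illustration, and only DIV_ROUND_UP mirrors the kernel macro.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Illustrative stand-in for the to_zap computation in the hunk above. */
static unsigned long nx_recovery_batch(unsigned long nx_lpage_splits,
				       unsigned int ratio)
{
	return ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
}

int main(void)
{
	/* e.g. 1000 split NX huge pages with a ratio of 60 */
	printf("%lu\n", nx_recovery_batch(1000, 60));	/* 17 per pass */
	printf("%lu\n", nx_recovery_batch(1000, 0));	/* 0: recovery off */
	return 0;
}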
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 582def531d4d..dbaf6755c5a7 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -57,7 +57,13 @@ struct kvm_mmu_page {
bool tdp_mmu_page;
bool unsync;
u8 mmu_valid_gen;
- bool lpage_disallowed; /* Can't be replaced by an equiv large page */
+
+ /*
+ * The shadow page can't be replaced by an equivalent huge page
+ * because it is being used to map an executable page in the guest
+ * and the NX huge page mitigation is enabled.
+ */
+ bool nx_huge_page_disallowed;
/*
* The following two entries are used to key the shadow page in the
@@ -100,7 +106,14 @@ struct kvm_mmu_page {
};
};
- struct list_head lpage_disallowed_link;
+ /*
+ * Tracks shadow pages that, if zapped, would allow KVM to create an NX
+ * huge page. A shadow page will have nx_huge_page_disallowed set but
+ * not be on the list if a huge page is disallowed for other reasons,
+ * e.g. because KVM is shadowing a PTE at the same gfn, the memslot
+ * isn't properly aligned, etc...
+ */
+ struct list_head possible_nx_huge_page_link;
#ifdef CONFIG_X86_32
/*
* Used out of the mmu-lock to avoid reading spte values while an
@@ -120,18 +133,6 @@ struct kvm_mmu_page {
extern struct kmem_cache *mmu_page_header_cache;
-static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
-{
- struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
-
- return (struct kvm_mmu_page *)page_private(page);
-}
-
-static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
-{
- return to_shadow_page(__pa(sptep));
-}
-
static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
{
return role.smm ? 1 : 0;
@@ -315,7 +316,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
-void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
-void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
#endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 5ab5f94dcb6f..0f6455072055 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -713,9 +713,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
continue;
link_shadow_page(vcpu, it.sptep, sp);
- if (fault->huge_page_disallowed &&
- fault->req_level >= it.level)
- account_huge_nx_page(vcpu->kvm, sp);
+ if (fault->huge_page_disallowed)
+ account_nx_huge_page(vcpu->kvm, sp,
+ fault->req_level >= it.level);
}
if (WARN_ON_ONCE(it.level != fault->goal_level))
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 2e08b2a45361..c0fd7e049b4e 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -161,6 +161,18 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (!prefetch)
spte |= spte_shadow_accessed_mask(spte);
+ /*
+ * For simplicity, enforce the NX huge page mitigation even if not
+ * strictly necessary. KVM could ignore the mitigation if paging is
+ * disabled in the guest, as the guest doesn't have any page tables to
+ * abuse. But to safely ignore the mitigation, KVM would have to
+ * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG
+ * is toggled on, and that's a net negative for performance when TDP is
+ * enabled. When TDP is disabled, KVM will always switch to a new MMU
+ * when CR0.PG is toggled, but leveraging that to ignore the mitigation
+ * would tie make_spte() further to vCPU/MMU state, and add complexity
+ * just to optimize a mode that is anything but performance critical.
+ */
if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
is_nx_huge_page_enabled(vcpu->kvm)) {
pte_access &= ~ACC_EXEC_MASK;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 7670c13ce251..1f03701b943a 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -188,7 +188,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
* should not modify the SPTE.
*
* Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
- * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
+ * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
* vulnerability. Use only low bits to avoid 64-bit immediates.
*
* Only used by the TDP MMU.
@@ -219,6 +219,23 @@ static inline int spte_index(u64 *sptep)
*/
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
+{
+ struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);
+
+ return (struct kvm_mmu_page *)page_private(page);
+}
+
+static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte)
+{
+ return to_shadow_page(spte & SPTE_BASE_ADDR_MASK);
+}
+
+static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
+{
+ return to_shadow_page(__pa(sptep));
+}
+
static inline bool is_mmio_spte(u64 spte)
{
return (spte & shadow_mmio_mask) == shadow_mmio_value &&
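The lookups moved into spte.h above work because the struct page behind each page-table page carries a back-pointer to its kvm_mmu_page in page->private (stored with set_page_private() when the shadow page is initialized, as in tdp_mmu_init_sp() further down). The userspace toy below sketches the same idea with an explicit table standing in for page->private; all of the names are illustrative and none of this is KVM code.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define TOY_PAGE_SIZE	4096

struct toy_sp {
	uint64_t *spt;		/* the backing "page table" page */
};

/* Stand-in for page->private: page-aligned address -> metadata. */
#define MAX_PAGES 16
static struct { uintptr_t base; struct toy_sp *sp; } private_tbl[MAX_PAGES];

static void toy_set_page_private(void *page, struct toy_sp *sp)
{
	for (int i = 0; i < MAX_PAGES; i++) {
		if (!private_tbl[i].base) {
			private_tbl[i].base = (uintptr_t)page;
			private_tbl[i].sp = sp;
			return;
		}
	}
	abort();
}

/* Round the PTE pointer down to its page, then recover the metadata. */
static struct toy_sp *toy_sptep_to_sp(uint64_t *sptep)
{
	uintptr_t base = (uintptr_t)sptep & ~((uintptr_t)TOY_PAGE_SIZE - 1);

	for (int i = 0; i < MAX_PAGES; i++)
		if (private_tbl[i].base == base)
			return private_tbl[i].sp;
	return NULL;
}

int main(void)
{
	struct toy_sp sp;

	sp.spt = aligned_alloc(TOY_PAGE_SIZE, TOY_PAGE_SIZE);
	if (!sp.spt)
		return 1;
	toy_set_page_private(sp.spt, &sp);

	/* Any SPTE slot inside the table maps back to the same metadata. */
	assert(toy_sptep_to_sp(&sp.spt[0]) == &sp);
	assert(toy_sptep_to_sp(&sp.spt[511]) == &sp);

	free(sp.spt);
	return 0;
}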
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 672f0432d777..771210ce5181 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -29,7 +29,6 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
kvm->arch.tdp_mmu_enabled = true;
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
- INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
kvm->arch.tdp_mmu_zap_wq = wq;
return 1;
}
@@ -54,7 +53,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
/* Also waits for any queued work items. */
destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
- WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
+ WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
/*
@@ -284,6 +283,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
gfn_t gfn, union kvm_mmu_page_role role)
{
+ INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
+
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
sp->role = role;
@@ -375,11 +376,13 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, +1);
+ atomic64_inc(&kvm->arch.tdp_mmu_pages);
}
static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, -1);
+ atomic64_dec(&kvm->arch.tdp_mmu_pages);
}
/**
@@ -395,14 +398,17 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
bool shared)
{
tdp_unaccount_mmu_page(kvm, sp);
+
+ if (!sp->nx_huge_page_disallowed)
+ return;
+
if (shared)
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
else
lockdep_assert_held_write(&kvm->mmu_lock);
- list_del(&sp->link);
- if (sp->lpage_disallowed)
- unaccount_huge_nx_page(kvm, sp);
+ sp->nx_huge_page_disallowed = false;
+ untrack_possible_nx_huge_page(kvm, sp);
if (shared)
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
@@ -1116,16 +1122,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
* @kvm: kvm instance
* @iter: a tdp_iter instance currently on the SPTE that should be set
* @sp: The new TDP page table to install.
- * @account_nx: True if this page table is being installed to split a
- * non-executable huge page.
* @shared: This operation is running under the MMU lock in read mode.
*
* Returns: 0 if the new page table was installed. Non-0 if the page table
* could not be installed (e.g. the atomic compare-exchange failed).
*/
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
- struct kvm_mmu_page *sp, bool account_nx,
- bool shared)
+ struct kvm_mmu_page *sp, bool shared)
{
u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
int ret = 0;
@@ -1138,16 +1141,14 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
tdp_mmu_set_spte(kvm, iter, spte);
}
- spin_lock(&kvm->arch.tdp_mmu_pages_lock);
- list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
- if (account_nx)
- account_huge_nx_page(kvm, sp);
- spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
tdp_account_mmu_page(kvm, sp);
return 0;
}
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared);
+
/*
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
* page tables and SPTEs to translate the faulting guest physical address.
@@ -1155,9 +1156,10 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
+ struct kvm *kvm = vcpu->kvm;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
- int ret;
+ int ret = RET_PF_RETRY;
kvm_mmu_hugepage_adjust(vcpu, fault);
@@ -1166,6 +1168,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
rcu_read_lock();
tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
+ int r;
+
if (fault->nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
@@ -1173,57 +1177,52 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
break;
/*
- * If there is an SPTE mapping a large page at a higher level
- * than the target, that SPTE must be cleared and replaced
- * with a non-leaf SPTE.
+ * If SPTE has been frozen by another thread, just give up and
+ * retry, avoiding unnecessary page table allocation and free.
*/
- if (is_shadow_present_pte(iter.old_spte) &&
- is_large_pte(iter.old_spte)) {
- if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
- break;
+ if (is_removed_spte(iter.old_spte))
+ goto retry;
- /*
- * The iter must explicitly re-read the spte here
- * because the new value informs the !present
- * path below.
- */
- iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
- }
+ /* Step down into the lower level page table if it exists. */
+ if (is_shadow_present_pte(iter.old_spte) &&
+ !is_large_pte(iter.old_spte))
+ continue;
- if (!is_shadow_present_pte(iter.old_spte)) {
- bool account_nx = fault->huge_page_disallowed &&
- fault->req_level >= iter.level;
+ /*
+ * The SPTE is either non-present or points to a huge page that
+ * needs to be split.
+ */
+ sp = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_child_sp(sp, &iter);
- /*
- * If SPTE has been frozen by another thread, just
- * give up and retry, avoiding unnecessary page table
- * allocation and free.
- */
- if (is_removed_spte(iter.old_spte))
- break;
+ sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
- sp = tdp_mmu_alloc_sp(vcpu);
- tdp_mmu_init_child_sp(sp, &iter);
+ if (is_shadow_present_pte(iter.old_spte))
+ r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
+ else
+ r = tdp_mmu_link_sp(kvm, &iter, sp, true);
- if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
- tdp_mmu_free_sp(sp);
- break;
- }
+ /*
+ * Also force the guest to retry the access if the upper level SPTEs
+ * aren't in place.
+ */
+ if (r) {
+ tdp_mmu_free_sp(sp);
+ goto retry;
}
- }
- /*
- * Force the guest to retry the access if the upper level SPTEs aren't
- * in place, or if the target leaf SPTE is frozen by another CPU.
- */
- if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
- rcu_read_unlock();
- return RET_PF_RETRY;
+ if (fault->huge_page_disallowed &&
+ fault->req_level >= iter.level) {
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ track_possible_nx_huge_page(kvm, sp);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+ }
}
ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
- rcu_read_unlock();
+retry:
+ rcu_read_unlock();
return ret;
}
@@ -1472,6 +1471,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
return sp;
}
+/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared)
{
@@ -1479,8 +1479,6 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
const int level = iter->level;
int ret, i;
- tdp_mmu_init_child_sp(sp, iter);
-
/*
* No need for atomics when writing to sp->spt since the page table has
* not been linked in yet and thus is not reachable from any other CPU.
@@ -1496,7 +1494,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
* correctness standpoint since the translation will be the same either
* way.
*/
- ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
+ ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
if (ret)
goto out;
@@ -1556,6 +1554,8 @@ retry:
continue;
}
+ tdp_mmu_init_child_sp(sp, &iter);
+
if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
goto retry;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index c163f7cc23ca..d3714200b932 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -5,6 +5,8 @@
#include <linux/kvm_host.h>
+#include "spte.h"
+
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index de1fd7369736..684393c22105 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -101,10 +101,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
bool skip_pmi = false;
- /* Ignore counters that have been reprogrammed already. */
- if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
- return;
-
if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
if (!in_pmi) {
/*
@@ -122,7 +118,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
} else {
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
}
- kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
if (!pmc->intr || skip_pmi)
return;
@@ -147,12 +142,22 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
{
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+ /*
+ * Ignore overflow events for counters that are scheduled to be
+ * reprogrammed, e.g. if a PMI for the previous event races with KVM's
+ * handling of a related guest WRMSR.
+ */
+ if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
+ return;
+
__kvm_perf_overflow(pmc, true);
+
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
-static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
- u64 config, bool exclude_user,
- bool exclude_kernel, bool intr)
+static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
+ bool exclude_user, bool exclude_kernel,
+ bool intr)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
struct perf_event *event;
@@ -204,14 +209,14 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
if (IS_ERR(event)) {
pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
PTR_ERR(event), pmc->idx);
- return;
+ return PTR_ERR(event);
}
pmc->perf_event = event;
pmc_to_pmu(pmc)->event_count++;
- clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
pmc->is_paused = false;
pmc->intr = intr || pebs;
+ return 0;
}
static void pmc_pause_counter(struct kvm_pmc *pmc)
@@ -245,7 +250,6 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
perf_event_enable(pmc->perf_event);
pmc->is_paused = false;
- clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
return true;
}
@@ -293,7 +297,7 @@ out:
return allow_event;
}
-void reprogram_counter(struct kvm_pmc *pmc)
+static void reprogram_counter(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
u64 eventsel = pmc->eventsel;
@@ -303,10 +307,13 @@ void reprogram_counter(struct kvm_pmc *pmc)
pmc_pause_counter(pmc);
if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc))
- return;
+ goto reprogram_complete;
if (!check_pmu_event_filter(pmc))
- return;
+ goto reprogram_complete;
+
+ if (pmc->counter < pmc->prev_counter)
+ __kvm_perf_overflow(pmc, false);
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
printk_once("kvm pmu: pin control bit is ignored\n");
@@ -324,18 +331,29 @@ void reprogram_counter(struct kvm_pmc *pmc)
}
if (pmc->current_config == new_config && pmc_resume_counter(pmc))
- return;
+ goto reprogram_complete;
pmc_release_perf_event(pmc);
pmc->current_config = new_config;
- pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
- (eventsel & pmu->raw_event_mask),
- !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
- !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
- eventsel & ARCH_PERFMON_EVENTSEL_INT);
+
+ /*
+ * If reprogramming fails, e.g. due to contention, leave the counter's
+ * reprogram bit set, i.e. opportunistically try again on the next PMU
+ * refresh. Don't make a new request as doing so can stall the guest
+ * if reprogramming repeatedly fails.
+ */
+ if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
+ (eventsel & pmu->raw_event_mask),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+ eventsel & ARCH_PERFMON_EVENTSEL_INT))
+ return;
+
+reprogram_complete:
+ clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
+ pmc->prev_counter = 0;
}
-EXPORT_SYMBOL_GPL(reprogram_counter);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
@@ -345,10 +363,11 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
- if (unlikely(!pmc || !pmc->perf_event)) {
+ if (unlikely(!pmc)) {
clear_bit(bit, pmu->reprogram_pmi);
continue;
}
+
reprogram_counter(pmc);
}
@@ -522,14 +541,9 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
- u64 prev_count;
-
- prev_count = pmc->counter;
+ pmc->prev_counter = pmc->counter;
pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
-
- reprogram_counter(pmc);
- if (pmc->counter < prev_count)
- __kvm_perf_overflow(pmc, false);
+ kvm_pmu_request_counter_reprogam(pmc);
}
static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
@@ -542,12 +556,15 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
bool select_os, select_user;
- u64 config = pmc->current_config;
+ u64 config;
if (pmc_is_gp(pmc)) {
+ config = pmc->eventsel;
select_os = config & ARCH_PERFMON_EVENTSEL_OS;
select_user = config & ARCH_PERFMON_EVENTSEL_USR;
} else {
+ config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
+ pmc->idx - INTEL_PMC_IDX_FIXED);
select_os = config & 0x1;
select_user = config & 0x2;
}
@@ -577,6 +594,8 @@ EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
struct kvm_pmu_event_filter tmp, *filter;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
size_t size;
int r;
@@ -613,9 +632,18 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
mutex_lock(&kvm->lock);
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
mutex_is_locked(&kvm->lock));
+ synchronize_srcu_expedited(&kvm->srcu);
+
+ BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
+ sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);
+
mutex_unlock(&kvm->lock);
- synchronize_srcu_expedited(&kvm->srcu);
r = 0;
cleanup:
kfree(filter);
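The net effect of the pmu.c changes above is a deferred-reprogram scheme: callers only set the counter's bit in reprogram_pmi and raise KVM_REQ_PMU, and kvm_pmu_handle_event() later does the expensive perf_event work, clearing a bit only once the counter has actually been handled (and leaving it set if perf event creation fails, so a later refresh retries without stalling the guest). A minimal userspace sketch of that pattern, with made-up names and a stubbed "reprogram" step, follows.

#include <stdbool.h>
#include <stdio.h>

#define NR_COUNTERS 8

static unsigned long reprogram_pending;	/* stand-in for pmu->reprogram_pmi */
static bool pmu_request_raised;		/* stand-in for KVM_REQ_PMU */

static void request_counter_reprogram(int idx)
{
	reprogram_pending |= 1ul << idx;
	pmu_request_raised = true;
}

/* Pretend reprogramming can fail, e.g. perf_event creation contention. */
static bool reprogram_one(int idx)
{
	printf("reprogramming counter %d\n", idx);
	return true;
}

static void handle_pmu_event(void)
{
	if (!pmu_request_raised)
		return;
	pmu_request_raised = false;

	for (int idx = 0; idx < NR_COUNTERS; idx++) {
		if (!(reprogram_pending & (1ul << idx)))
			continue;
		/*
		 * On failure the bit stays set so the next request retries,
		 * loosely mirroring the reprogram_complete flow above.
		 */
		if (reprogram_one(idx))
			reprogram_pending &= ~(1ul << idx);
	}
}

int main(void)
{
	request_counter_reprogram(1);
	request_counter_reprogram(3);
	handle_pmu_event();
	return 0;
}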
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 5cc5721f260b..85ff3c0588ba 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -183,7 +183,11 @@ static inline void kvm_init_pmu_capability(void)
KVM_PMC_MAX_FIXED);
}
-void reprogram_counter(struct kvm_pmc *pmc);
+static inline void kvm_pmu_request_counter_reprogam(struct kvm_pmc *pmc)
+{
+ set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+}
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index a19d473d0184..042d0aca3c92 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -7,22 +7,41 @@
#include <asm/cpufeatures.h>
/*
- * Hardware-defined CPUID leafs that are scattered in the kernel, but need to
- * be directly used by KVM. Note, these word values conflict with the kernel's
- * "bug" caps, but KVM doesn't use those.
+ * Hardware-defined CPUID leafs that are either scattered by the kernel or are
+ * unknown to the kernel, but need to be directly used by KVM. Note, these
+ * word values conflict with the kernel's "bug" caps, but KVM doesn't use those.
*/
enum kvm_only_cpuid_leafs {
CPUID_12_EAX = NCAPINTS,
+ CPUID_7_1_EDX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
};
+/*
+ * Define a KVM-only feature flag.
+ *
+ * For features that are scattered by cpufeatures.h, __feature_translate() also
+ * needs to be updated to translate the kernel-defined feature into the
+ * KVM-defined feature.
+ *
+ * For features that are 100% KVM-only, i.e. not defined by cpufeatures.h,
+ * forego the intermediate KVM_X86_FEATURE and directly define X86_FEATURE_* so
+ * that X86_FEATURE_* can be used in KVM. No __feature_translate() handling is
+ * needed in this case.
+ */
#define KVM_X86_FEATURE(w, f) ((w)*32 + (f))
/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0)
#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1)
+#define KVM_X86_FEATURE_SGX_EDECCSSA KVM_X86_FEATURE(CPUID_12_EAX, 11)
+
+/* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */
+#define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4)
+#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
+#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
struct cpuid_reg {
u32 function;
@@ -48,6 +67,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_7_1_EAX] = { 7, 1, CPUID_EAX},
[CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX},
[CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
+ [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX},
};
/*
@@ -78,6 +98,8 @@ static __always_inline u32 __feature_translate(int x86_feature)
return KVM_X86_FEATURE_SGX1;
else if (x86_feature == X86_FEATURE_SGX2)
return KVM_X86_FEATURE_SGX2;
+ else if (x86_feature == X86_FEATURE_SGX_EDECCSSA)
+ return KVM_X86_FEATURE_SGX_EDECCSSA;
return x86_feature;
}
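For reference, the KVM_X86_FEATURE(w, f) encoding above is simply word * 32 + bit, and reverse_cpuid[] maps the word back to a CPUID function/index/register. The small userspace sketch below shows the round trip; the word values and table contents are a trimmed illustrative subset, not the kernel's actual indices (which start at NCAPINTS).

#include <stdio.h>

#define KVM_X86_FEATURE(w, f)	((w) * 32 + (f))

enum { CPUID_12_EAX, CPUID_7_1_EDX, NR_WORDS };

struct cpuid_reg {
	unsigned int function;
	unsigned int index;
	char reg;		/* 'a', 'b', 'c' or 'd' */
};

static const struct cpuid_reg reverse_cpuid[NR_WORDS] = {
	[CPUID_12_EAX]  = { 0x00000012, 0, 'a' },
	[CPUID_7_1_EDX] = { 0x00000007, 1, 'd' },
};

#define KVM_X86_FEATURE_SGX_EDECCSSA	KVM_X86_FEATURE(CPUID_12_EAX, 11)
#define X86_FEATURE_PREFETCHITI		KVM_X86_FEATURE(CPUID_7_1_EDX, 14)

int main(void)
{
	int features[] = { KVM_X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_PREFETCHITI };

	for (unsigned int i = 0; i < sizeof(features) / sizeof(features[0]); i++) {
		int f = features[i];
		const struct cpuid_reg *e = &reverse_cpuid[f / 32];

		printf("feature %d -> CPUID 0x%x.%u, E%cX bit %d\n",
		       f, e->function, e->index, e->reg - 'a' + 'A', f % 32);
	}
	return 0;
}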
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
new file mode 100644
index 000000000000..a9c1c2af8d94
--- /dev/null
+++ b/arch/x86/kvm/smm.c
@@ -0,0 +1,649 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/kvm_host.h>
+#include "x86.h"
+#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
+#include "smm.h"
+#include "cpuid.h"
+#include "trace.h"
+
+#define CHECK_SMRAM32_OFFSET(field, offset) \
+ ASSERT_STRUCT_OFFSET(struct kvm_smram_state_32, field, offset - 0xFE00)
+
+#define CHECK_SMRAM64_OFFSET(field, offset) \
+ ASSERT_STRUCT_OFFSET(struct kvm_smram_state_64, field, offset - 0xFE00)
+
+static void check_smram_offsets(void)
+{
+ /* 32 bit SMRAM image */
+ CHECK_SMRAM32_OFFSET(reserved1, 0xFE00);
+ CHECK_SMRAM32_OFFSET(smbase, 0xFEF8);
+ CHECK_SMRAM32_OFFSET(smm_revision, 0xFEFC);
+ CHECK_SMRAM32_OFFSET(io_inst_restart, 0xFF00);
+ CHECK_SMRAM32_OFFSET(auto_hlt_restart, 0xFF02);
+ CHECK_SMRAM32_OFFSET(io_restart_rdi, 0xFF04);
+ CHECK_SMRAM32_OFFSET(io_restart_rcx, 0xFF08);
+ CHECK_SMRAM32_OFFSET(io_restart_rsi, 0xFF0C);
+ CHECK_SMRAM32_OFFSET(io_restart_rip, 0xFF10);
+ CHECK_SMRAM32_OFFSET(cr4, 0xFF14);
+ CHECK_SMRAM32_OFFSET(reserved2, 0xFF18);
+ CHECK_SMRAM32_OFFSET(int_shadow, 0xFF1A);
+ CHECK_SMRAM32_OFFSET(reserved3, 0xFF1B);
+ CHECK_SMRAM32_OFFSET(ds, 0xFF2C);
+ CHECK_SMRAM32_OFFSET(fs, 0xFF38);
+ CHECK_SMRAM32_OFFSET(gs, 0xFF44);
+ CHECK_SMRAM32_OFFSET(idtr, 0xFF50);
+ CHECK_SMRAM32_OFFSET(tr, 0xFF5C);
+ CHECK_SMRAM32_OFFSET(gdtr, 0xFF6C);
+ CHECK_SMRAM32_OFFSET(ldtr, 0xFF78);
+ CHECK_SMRAM32_OFFSET(es, 0xFF84);
+ CHECK_SMRAM32_OFFSET(cs, 0xFF90);
+ CHECK_SMRAM32_OFFSET(ss, 0xFF9C);
+ CHECK_SMRAM32_OFFSET(es_sel, 0xFFA8);
+ CHECK_SMRAM32_OFFSET(cs_sel, 0xFFAC);
+ CHECK_SMRAM32_OFFSET(ss_sel, 0xFFB0);
+ CHECK_SMRAM32_OFFSET(ds_sel, 0xFFB4);
+ CHECK_SMRAM32_OFFSET(fs_sel, 0xFFB8);
+ CHECK_SMRAM32_OFFSET(gs_sel, 0xFFBC);
+ CHECK_SMRAM32_OFFSET(ldtr_sel, 0xFFC0);
+ CHECK_SMRAM32_OFFSET(tr_sel, 0xFFC4);
+ CHECK_SMRAM32_OFFSET(dr7, 0xFFC8);
+ CHECK_SMRAM32_OFFSET(dr6, 0xFFCC);
+ CHECK_SMRAM32_OFFSET(gprs, 0xFFD0);
+ CHECK_SMRAM32_OFFSET(eip, 0xFFF0);
+ CHECK_SMRAM32_OFFSET(eflags, 0xFFF4);
+ CHECK_SMRAM32_OFFSET(cr3, 0xFFF8);
+ CHECK_SMRAM32_OFFSET(cr0, 0xFFFC);
+
+ /* 64 bit SMRAM image */
+ CHECK_SMRAM64_OFFSET(es, 0xFE00);
+ CHECK_SMRAM64_OFFSET(cs, 0xFE10);
+ CHECK_SMRAM64_OFFSET(ss, 0xFE20);
+ CHECK_SMRAM64_OFFSET(ds, 0xFE30);
+ CHECK_SMRAM64_OFFSET(fs, 0xFE40);
+ CHECK_SMRAM64_OFFSET(gs, 0xFE50);
+ CHECK_SMRAM64_OFFSET(gdtr, 0xFE60);
+ CHECK_SMRAM64_OFFSET(ldtr, 0xFE70);
+ CHECK_SMRAM64_OFFSET(idtr, 0xFE80);
+ CHECK_SMRAM64_OFFSET(tr, 0xFE90);
+ CHECK_SMRAM64_OFFSET(io_restart_rip, 0xFEA0);
+ CHECK_SMRAM64_OFFSET(io_restart_rcx, 0xFEA8);
+ CHECK_SMRAM64_OFFSET(io_restart_rsi, 0xFEB0);
+ CHECK_SMRAM64_OFFSET(io_restart_rdi, 0xFEB8);
+ CHECK_SMRAM64_OFFSET(io_restart_dword, 0xFEC0);
+ CHECK_SMRAM64_OFFSET(reserved1, 0xFEC4);
+ CHECK_SMRAM64_OFFSET(io_inst_restart, 0xFEC8);
+ CHECK_SMRAM64_OFFSET(auto_hlt_restart, 0xFEC9);
+ CHECK_SMRAM64_OFFSET(amd_nmi_mask, 0xFECA);
+ CHECK_SMRAM64_OFFSET(int_shadow, 0xFECB);
+ CHECK_SMRAM64_OFFSET(reserved2, 0xFECC);
+ CHECK_SMRAM64_OFFSET(efer, 0xFED0);
+ CHECK_SMRAM64_OFFSET(svm_guest_flag, 0xFED8);
+ CHECK_SMRAM64_OFFSET(svm_guest_vmcb_gpa, 0xFEE0);
+ CHECK_SMRAM64_OFFSET(svm_guest_virtual_int, 0xFEE8);
+ CHECK_SMRAM64_OFFSET(reserved3, 0xFEF0);
+ CHECK_SMRAM64_OFFSET(smm_revison, 0xFEFC);
+ CHECK_SMRAM64_OFFSET(smbase, 0xFF00);
+ CHECK_SMRAM64_OFFSET(reserved4, 0xFF04);
+ CHECK_SMRAM64_OFFSET(ssp, 0xFF18);
+ CHECK_SMRAM64_OFFSET(svm_guest_pat, 0xFF20);
+ CHECK_SMRAM64_OFFSET(svm_host_efer, 0xFF28);
+ CHECK_SMRAM64_OFFSET(svm_host_cr4, 0xFF30);
+ CHECK_SMRAM64_OFFSET(svm_host_cr3, 0xFF38);
+ CHECK_SMRAM64_OFFSET(svm_host_cr0, 0xFF40);
+ CHECK_SMRAM64_OFFSET(cr4, 0xFF48);
+ CHECK_SMRAM64_OFFSET(cr3, 0xFF50);
+ CHECK_SMRAM64_OFFSET(cr0, 0xFF58);
+ CHECK_SMRAM64_OFFSET(dr7, 0xFF60);
+ CHECK_SMRAM64_OFFSET(dr6, 0xFF68);
+ CHECK_SMRAM64_OFFSET(rflags, 0xFF70);
+ CHECK_SMRAM64_OFFSET(rip, 0xFF78);
+ CHECK_SMRAM64_OFFSET(gprs, 0xFF80);
+
+ BUILD_BUG_ON(sizeof(union kvm_smram) != 512);
+}
+
+#undef CHECK_SMRAM64_OFFSET
+#undef CHECK_SMRAM32_OFFSET
+
+
+void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
+{
+ BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
+
+ trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
+
+ if (entering_smm) {
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ } else {
+ vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
+
+ /* Process a latched INIT or SMI, if any. */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * Even if KVM_SET_SREGS2 loaded PDPTRs out of band,
+ * on SMM exit we still need to reload them from
+ * guest memory
+ */
+ vcpu->arch.pdptrs_from_userspace = false;
+ }
+
+ kvm_mmu_reset_context(vcpu);
+}
+
+void process_smi(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.smi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
+{
+ u32 flags = 0;
+ flags |= seg->g << 23;
+ flags |= seg->db << 22;
+ flags |= seg->l << 21;
+ flags |= seg->avl << 20;
+ flags |= seg->present << 15;
+ flags |= seg->dpl << 13;
+ flags |= seg->s << 12;
+ flags |= seg->type << 8;
+ return flags;
+}
+
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu,
+ struct kvm_smm_seg_state_32 *state,
+ u32 *selector, int n)
+{
+ struct kvm_segment seg;
+
+ kvm_get_segment(vcpu, &seg, n);
+ *selector = seg.selector;
+ state->base = seg.base;
+ state->limit = seg.limit;
+ state->flags = enter_smm_get_segment_flags(&seg);
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu,
+ struct kvm_smm_seg_state_64 *state,
+ int n)
+{
+ struct kvm_segment seg;
+
+ kvm_get_segment(vcpu, &seg, n);
+ state->selector = seg.selector;
+ state->attributes = enter_smm_get_segment_flags(&seg) >> 8;
+ state->limit = seg.limit;
+ state->base = seg.base;
+}
+#endif
+
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
+ struct kvm_smram_state_32 *smram)
+{
+ struct desc_ptr dt;
+ unsigned long val;
+ int i;
+
+ smram->cr0 = kvm_read_cr0(vcpu);
+ smram->cr3 = kvm_read_cr3(vcpu);
+ smram->eflags = kvm_get_rflags(vcpu);
+ smram->eip = kvm_rip_read(vcpu);
+
+ for (i = 0; i < 8; i++)
+ smram->gprs[i] = kvm_register_read_raw(vcpu, i);
+
+ kvm_get_dr(vcpu, 6, &val);
+ smram->dr6 = (u32)val;
+ kvm_get_dr(vcpu, 7, &val);
+ smram->dr7 = (u32)val;
+
+ enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR);
+ enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR);
+
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ smram->gdtr.base = dt.address;
+ smram->gdtr.limit = dt.size;
+
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
+ smram->idtr.base = dt.address;
+ smram->idtr.limit = dt.size;
+
+ enter_smm_save_seg_32(vcpu, &smram->es, &smram->es_sel, VCPU_SREG_ES);
+ enter_smm_save_seg_32(vcpu, &smram->cs, &smram->cs_sel, VCPU_SREG_CS);
+ enter_smm_save_seg_32(vcpu, &smram->ss, &smram->ss_sel, VCPU_SREG_SS);
+
+ enter_smm_save_seg_32(vcpu, &smram->ds, &smram->ds_sel, VCPU_SREG_DS);
+ enter_smm_save_seg_32(vcpu, &smram->fs, &smram->fs_sel, VCPU_SREG_FS);
+ enter_smm_save_seg_32(vcpu, &smram->gs, &smram->gs_sel, VCPU_SREG_GS);
+
+ smram->cr4 = kvm_read_cr4(vcpu);
+ smram->smm_revision = 0x00020000;
+ smram->smbase = vcpu->arch.smbase;
+
+ smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
+ struct kvm_smram_state_64 *smram)
+{
+ struct desc_ptr dt;
+ unsigned long val;
+ int i;
+
+ for (i = 0; i < 16; i++)
+ smram->gprs[15 - i] = kvm_register_read_raw(vcpu, i);
+
+ smram->rip = kvm_rip_read(vcpu);
+ smram->rflags = kvm_get_rflags(vcpu);
+
+
+ kvm_get_dr(vcpu, 6, &val);
+ smram->dr6 = val;
+ kvm_get_dr(vcpu, 7, &val);
+ smram->dr7 = val;
+
+ smram->cr0 = kvm_read_cr0(vcpu);
+ smram->cr3 = kvm_read_cr3(vcpu);
+ smram->cr4 = kvm_read_cr4(vcpu);
+
+ smram->smbase = vcpu->arch.smbase;
+ smram->smm_revison = 0x00020064;
+
+ smram->efer = vcpu->arch.efer;
+
+ enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR);
+
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
+ smram->idtr.limit = dt.size;
+ smram->idtr.base = dt.address;
+
+ enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR);
+
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ smram->gdtr.limit = dt.size;
+ smram->gdtr.base = dt.address;
+
+ enter_smm_save_seg_64(vcpu, &smram->es, VCPU_SREG_ES);
+ enter_smm_save_seg_64(vcpu, &smram->cs, VCPU_SREG_CS);
+ enter_smm_save_seg_64(vcpu, &smram->ss, VCPU_SREG_SS);
+ enter_smm_save_seg_64(vcpu, &smram->ds, VCPU_SREG_DS);
+ enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS);
+ enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS);
+
+ smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+}
+#endif
+
+void enter_smm(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ds;
+ struct desc_ptr dt;
+ unsigned long cr0;
+ union kvm_smram smram;
+
+ check_smram_offsets();
+
+ memset(smram.bytes, 0, sizeof(smram.bytes));
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ enter_smm_save_state_64(vcpu, &smram.smram64);
+ else
+#endif
+ enter_smm_save_state_32(vcpu, &smram.smram32);
+
+ /*
+ * Give enter_smm() a chance to make ISA-specific changes to the vCPU
+ * state (e.g. leave guest mode) after we've saved the state into the
+ * SMM state-save area.
+ *
+ * Kill the VM in the unlikely case of failure, because the VM
+ * can be left in an undefined state in that case.
+ */
+ if (static_call(kvm_x86_enter_smm)(vcpu, &smram))
+ goto error;
+
+ kvm_smm_changed(vcpu, true);
+
+ if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram)))
+ goto error;
+
+ if (static_call(kvm_x86_get_nmi_mask)(vcpu))
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ static_call(kvm_x86_set_nmi_mask)(vcpu, true);
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0x8000);
+
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+
+ cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
+ static_call(kvm_x86_set_cr0)(vcpu, cr0);
+ vcpu->arch.cr0 = cr0;
+
+ static_call(kvm_x86_set_cr4)(vcpu, 0);
+
+ /* Undocumented: IDT limit is set to zero on entry to SMM. */
+ dt.address = dt.size = 0;
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
+
+ if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1)))
+ goto error;
+
+ cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
+ cs.base = vcpu->arch.smbase;
+
+ ds.selector = 0;
+ ds.base = 0;
+
+ cs.limit = ds.limit = 0xffffffff;
+ cs.type = ds.type = 0x3;
+ cs.dpl = ds.dpl = 0;
+ cs.db = ds.db = 0;
+ cs.s = ds.s = 1;
+ cs.l = ds.l = 0;
+ cs.g = ds.g = 1;
+ cs.avl = ds.avl = 0;
+ cs.present = ds.present = 1;
+ cs.unusable = ds.unusable = 0;
+ cs.padding = ds.padding = 0;
+
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (static_call(kvm_x86_set_efer)(vcpu, 0))
+ goto error;
+#endif
+
+ kvm_update_cpuid_runtime(vcpu);
+ kvm_mmu_reset_context(vcpu);
+ return;
+error:
+ kvm_vm_dead(vcpu->kvm);
+}
+
+static void rsm_set_desc_flags(struct kvm_segment *desc, u32 flags)
+{
+ desc->g = (flags >> 23) & 1;
+ desc->db = (flags >> 22) & 1;
+ desc->l = (flags >> 21) & 1;
+ desc->avl = (flags >> 20) & 1;
+ desc->present = (flags >> 15) & 1;
+ desc->dpl = (flags >> 13) & 3;
+ desc->s = (flags >> 12) & 1;
+ desc->type = (flags >> 8) & 15;
+
+ desc->unusable = !desc->present;
+ desc->padding = 0;
+}
+
+static int rsm_load_seg_32(struct kvm_vcpu *vcpu,
+ const struct kvm_smm_seg_state_32 *state,
+ u16 selector, int n)
+{
+ struct kvm_segment desc;
+
+ desc.selector = selector;
+ desc.base = state->base;
+ desc.limit = state->limit;
+ rsm_set_desc_flags(&desc, state->flags);
+ kvm_set_segment(vcpu, &desc, n);
+ return X86EMUL_CONTINUE;
+}
+
+#ifdef CONFIG_X86_64
+
+static int rsm_load_seg_64(struct kvm_vcpu *vcpu,
+ const struct kvm_smm_seg_state_64 *state,
+ int n)
+{
+ struct kvm_segment desc;
+
+ desc.selector = state->selector;
+ rsm_set_desc_flags(&desc, state->attributes << 8);
+ desc.limit = state->limit;
+ desc.base = state->base;
+ kvm_set_segment(vcpu, &desc, n);
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+static int rsm_enter_protected_mode(struct kvm_vcpu *vcpu,
+ u64 cr0, u64 cr3, u64 cr4)
+{
+ int bad;
+ u64 pcid;
+
+ /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
+ pcid = 0;
+ if (cr4 & X86_CR4_PCIDE) {
+ pcid = cr3 & 0xfff;
+ cr3 &= ~0xfff;
+ }
+
+ bad = kvm_set_cr3(vcpu, cr3);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ /*
+ * First enable PAE, long mode needs it before CR0.PG = 1 is set.
+ * Then enable protected mode. However, PCID cannot be enabled
+ * if EFER.LMA=0, so set it separately.
+ */
+ bad = kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ bad = kvm_set_cr0(vcpu, cr0);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (cr4 & X86_CR4_PCIDE) {
+ bad = kvm_set_cr4(vcpu, cr4);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ if (pcid) {
+ bad = kvm_set_cr3(vcpu, cr3 | pcid);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
+ const struct kvm_smram_state_32 *smstate)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct desc_ptr dt;
+ int i, r;
+
+ ctxt->eflags = smstate->eflags | X86_EFLAGS_FIXED;
+ ctxt->_eip = smstate->eip;
+
+ for (i = 0; i < 8; i++)
+ *reg_write(ctxt, i) = smstate->gprs[i];
+
+ if (kvm_set_dr(vcpu, 6, smstate->dr6))
+ return X86EMUL_UNHANDLEABLE;
+ if (kvm_set_dr(vcpu, 7, smstate->dr7))
+ return X86EMUL_UNHANDLEABLE;
+
+ rsm_load_seg_32(vcpu, &smstate->tr, smstate->tr_sel, VCPU_SREG_TR);
+ rsm_load_seg_32(vcpu, &smstate->ldtr, smstate->ldtr_sel, VCPU_SREG_LDTR);
+
+ dt.address = smstate->gdtr.base;
+ dt.size = smstate->gdtr.limit;
+ static_call(kvm_x86_set_gdt)(vcpu, &dt);
+
+ dt.address = smstate->idtr.base;
+ dt.size = smstate->idtr.limit;
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
+
+ rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES);
+ rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS);
+ rsm_load_seg_32(vcpu, &smstate->ss, smstate->ss_sel, VCPU_SREG_SS);
+
+ rsm_load_seg_32(vcpu, &smstate->ds, smstate->ds_sel, VCPU_SREG_DS);
+ rsm_load_seg_32(vcpu, &smstate->fs, smstate->fs_sel, VCPU_SREG_FS);
+ rsm_load_seg_32(vcpu, &smstate->gs, smstate->gs_sel, VCPU_SREG_GS);
+
+ vcpu->arch.smbase = smstate->smbase;
+
+ r = rsm_enter_protected_mode(vcpu, smstate->cr0,
+ smstate->cr3, smstate->cr4);
+
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ ctxt->interruptibility = (u8)smstate->int_shadow;
+
+ return r;
+}
+
+#ifdef CONFIG_X86_64
+static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
+ const struct kvm_smram_state_64 *smstate)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct desc_ptr dt;
+ int i, r;
+
+ for (i = 0; i < 16; i++)
+ *reg_write(ctxt, i) = smstate->gprs[15 - i];
+
+ ctxt->_eip = smstate->rip;
+ ctxt->eflags = smstate->rflags | X86_EFLAGS_FIXED;
+
+ if (kvm_set_dr(vcpu, 6, smstate->dr6))
+ return X86EMUL_UNHANDLEABLE;
+ if (kvm_set_dr(vcpu, 7, smstate->dr7))
+ return X86EMUL_UNHANDLEABLE;
+
+ vcpu->arch.smbase = smstate->smbase;
+
+ if (kvm_set_msr(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA))
+ return X86EMUL_UNHANDLEABLE;
+
+ rsm_load_seg_64(vcpu, &smstate->tr, VCPU_SREG_TR);
+
+ dt.size = smstate->idtr.limit;
+ dt.address = smstate->idtr.base;
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
+
+ rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR);
+
+ dt.size = smstate->gdtr.limit;
+ dt.address = smstate->gdtr.base;
+ static_call(kvm_x86_set_gdt)(vcpu, &dt);
+
+ r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+ rsm_load_seg_64(vcpu, &smstate->es, VCPU_SREG_ES);
+ rsm_load_seg_64(vcpu, &smstate->cs, VCPU_SREG_CS);
+ rsm_load_seg_64(vcpu, &smstate->ss, VCPU_SREG_SS);
+ rsm_load_seg_64(vcpu, &smstate->ds, VCPU_SREG_DS);
+ rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS);
+ rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS);
+
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ ctxt->interruptibility = (u8)smstate->int_shadow;
+
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ unsigned long cr0;
+ union kvm_smram smram;
+ u64 smbase;
+ int ret;
+
+ smbase = vcpu->arch.smbase;
+
+ ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfe00, smram.bytes, sizeof(smram));
+ if (ret < 0)
+ return X86EMUL_UNHANDLEABLE;
+
+ if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0)
+ static_call(kvm_x86_set_nmi_mask)(vcpu, false);
+
+ kvm_smm_changed(vcpu, false);
+
+ /*
+ * Get back to real mode, to prepare a safe state in which to load
+ * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
+ * supports long mode.
+ */
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
+ struct kvm_segment cs_desc;
+ unsigned long cr4;
+
+ /* Zero CR4.PCIDE before CR0.PG. */
+ cr4 = kvm_read_cr4(vcpu);
+ if (cr4 & X86_CR4_PCIDE)
+ kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE);
+
+ /* A 32-bit code segment is required to clear EFER.LMA. */
+ memset(&cs_desc, 0, sizeof(cs_desc));
+ cs_desc.type = 0xb;
+ cs_desc.s = cs_desc.g = cs_desc.present = 1;
+ kvm_set_segment(vcpu, &cs_desc, VCPU_SREG_CS);
+ }
+#endif
+
+ /* For the 64-bit case, this will clear EFER.LMA. */
+ cr0 = kvm_read_cr0(vcpu);
+ if (cr0 & X86_CR0_PE)
+ kvm_set_cr0(vcpu, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
+ unsigned long cr4, efer;
+
+ /* Clear CR4.PAE before clearing EFER.LME. */
+ cr4 = kvm_read_cr4(vcpu);
+ if (cr4 & X86_CR4_PAE)
+ kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PAE);
+
+ /* And finally go back to 32-bit mode. */
+ efer = 0;
+ kvm_set_msr(vcpu, MSR_EFER, efer);
+ }
+#endif
+
+ /*
+ * Give leave_smm() a chance to make ISA-specific changes to the vCPU
+ * state (e.g. enter guest mode) before loading state from the SMM
+ * state-save area.
+ */
+ if (static_call(kvm_x86_leave_smm)(vcpu, &smram))
+ return X86EMUL_UNHANDLEABLE;
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return rsm_load_state_64(ctxt, &smram.smram64);
+ else
+#endif
+ return rsm_load_state_32(ctxt, &smram.smram32);
+}
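The SMM entry/exit paths above round-trip segment attributes through a single 32-bit word via enter_smm_get_segment_flags() and rsm_set_desc_flags(). The standalone sketch below reproduces just that packing with a trimmed stand-in for struct kvm_segment, purely for illustration.

#include <assert.h>
#include <stdint.h>

struct toy_seg {
	unsigned g, db, l, avl, present, dpl, s, type;
};

static uint32_t pack_seg_flags(const struct toy_seg *seg)
{
	return (seg->g << 23) | (seg->db << 22) | (seg->l << 21) |
	       (seg->avl << 20) | (seg->present << 15) | (seg->dpl << 13) |
	       (seg->s << 12) | (seg->type << 8);
}

static void unpack_seg_flags(struct toy_seg *seg, uint32_t flags)
{
	seg->g = (flags >> 23) & 1;
	seg->db = (flags >> 22) & 1;
	seg->l = (flags >> 21) & 1;
	seg->avl = (flags >> 20) & 1;
	seg->present = (flags >> 15) & 1;
	seg->dpl = (flags >> 13) & 3;
	seg->s = (flags >> 12) & 1;
	seg->type = (flags >> 8) & 15;
}

int main(void)
{
	/* A flat code segment similar to the one set up on SMM entry. */
	struct toy_seg cs = { .g = 1, .present = 1, .s = 1, .type = 0xb };
	struct toy_seg out;

	unpack_seg_flags(&out, pack_seg_flags(&cs));
	assert(out.g == 1 && out.present == 1 && out.s == 1 && out.type == 0xb);
	assert(out.dpl == 0 && out.l == 0 && out.db == 0 && out.avl == 0);
	return 0;
}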
diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h
new file mode 100644
index 000000000000..a1cf2ac5bd78
--- /dev/null
+++ b/arch/x86/kvm/smm.h
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_KVM_SMM_H
+#define ASM_KVM_SMM_H
+
+#include <linux/build_bug.h>
+
+#ifdef CONFIG_KVM_SMM
+
+
+/*
+ * KVM's emulated 32-bit SMRAM layout. Based on the Intel P6 layout
+ * (https://www.sandpile.org/x86/smm.htm).
+ */
+
+struct kvm_smm_seg_state_32 {
+ u32 flags;
+ u32 limit;
+ u32 base;
+} __packed;
+
+struct kvm_smram_state_32 {
+ u32 reserved1[62];
+ u32 smbase;
+ u32 smm_revision;
+ u16 io_inst_restart;
+ u16 auto_hlt_restart;
+ u32 io_restart_rdi;
+ u32 io_restart_rcx;
+ u32 io_restart_rsi;
+ u32 io_restart_rip;
+ u32 cr4;
+
+ /* A20M#, CPL, shutdown and other reserved/undocumented fields */
+ u16 reserved2;
+ u8 int_shadow; /* KVM extension */
+ u8 reserved3[17];
+
+ struct kvm_smm_seg_state_32 ds;
+ struct kvm_smm_seg_state_32 fs;
+ struct kvm_smm_seg_state_32 gs;
+ struct kvm_smm_seg_state_32 idtr; /* IDTR has only base and limit */
+ struct kvm_smm_seg_state_32 tr;
+ u32 reserved;
+ struct kvm_smm_seg_state_32 gdtr; /* GDTR has only base and limit */
+ struct kvm_smm_seg_state_32 ldtr;
+ struct kvm_smm_seg_state_32 es;
+ struct kvm_smm_seg_state_32 cs;
+ struct kvm_smm_seg_state_32 ss;
+
+ u32 es_sel;
+ u32 cs_sel;
+ u32 ss_sel;
+ u32 ds_sel;
+ u32 fs_sel;
+ u32 gs_sel;
+ u32 ldtr_sel;
+ u32 tr_sel;
+
+ u32 dr7;
+ u32 dr6;
+ u32 gprs[8]; /* GPRS in the "natural" X86 order (EAX/ECX/EDX.../EDI) */
+ u32 eip;
+ u32 eflags;
+ u32 cr3;
+ u32 cr0;
+} __packed;
+
+
+/* KVM's emulated 64-bit SMRAM layout. Based on the AMD64 layout. */
+
+struct kvm_smm_seg_state_64 {
+ u16 selector;
+ u16 attributes;
+ u32 limit;
+ u64 base;
+};
+
+struct kvm_smram_state_64 {
+
+ struct kvm_smm_seg_state_64 es;
+ struct kvm_smm_seg_state_64 cs;
+ struct kvm_smm_seg_state_64 ss;
+ struct kvm_smm_seg_state_64 ds;
+ struct kvm_smm_seg_state_64 fs;
+ struct kvm_smm_seg_state_64 gs;
+ struct kvm_smm_seg_state_64 gdtr; /* GDTR has only base and limit */
+ struct kvm_smm_seg_state_64 ldtr;
+ struct kvm_smm_seg_state_64 idtr; /* IDTR has only base and limit */
+ struct kvm_smm_seg_state_64 tr;
+
+ /* I/O restart and auto halt restart are not implemented by KVM */
+ u64 io_restart_rip;
+ u64 io_restart_rcx;
+ u64 io_restart_rsi;
+ u64 io_restart_rdi;
+ u32 io_restart_dword;
+ u32 reserved1;
+ u8 io_inst_restart;
+ u8 auto_hlt_restart;
+ u8 amd_nmi_mask; /* Documented in AMD BKDG as NMI mask, not used by KVM */
+ u8 int_shadow;
+ u32 reserved2;
+
+ u64 efer;
+
+ /*
+ * The two fields below are implemented on AMD only, to store the
+ * SVM guest's VMCB address if the #SMI was received while in guest mode.
+ */
+ u64 svm_guest_flag;
+ u64 svm_guest_vmcb_gpa;
+ u64 svm_guest_virtual_int; /* unknown purpose, not implemented */
+
+ u32 reserved3[3];
+ u32 smm_revison;
+ u32 smbase;
+ u32 reserved4[5];
+
+ /* ssp and svm_* fields below are not implemented by KVM */
+ u64 ssp;
+ u64 svm_guest_pat;
+ u64 svm_host_efer;
+ u64 svm_host_cr4;
+ u64 svm_host_cr3;
+ u64 svm_host_cr0;
+
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+ u64 rflags;
+ u64 rip;
+ u64 gprs[16]; /* GPRS in a reversed "natural" X86 order (R15/R14/.../RCX/RAX) */
+};
+
+union kvm_smram {
+ struct kvm_smram_state_64 smram64;
+ struct kvm_smram_state_32 smram32;
+ u8 bytes[512];
+};
+
+static inline int kvm_inject_smi(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+ return 0;
+}
+
+static inline bool is_smm(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_SMM_MASK;
+}
+
+void kvm_smm_changed(struct kvm_vcpu *vcpu, bool in_smm);
+void enter_smm(struct kvm_vcpu *vcpu);
+int emulator_leave_smm(struct x86_emulate_ctxt *ctxt);
+void process_smi(struct kvm_vcpu *vcpu);
+#else
+static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { return -ENOTTY; }
+static inline bool is_smm(struct kvm_vcpu *vcpu) { return false; }
+
+/*
+ * emulator_leave_smm is used as a function pointer, so the
+ * stub is defined in x86.c.
+ */
+#endif
+
+#endif
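The SMRAM structs above only work because every field sits at an architecturally fixed offset, which smm.c pins at build time with the CHECK_SMRAM32_OFFSET()/CHECK_SMRAM64_OFFSET() macros. A tiny standalone sketch of that technique, using C11 static_assert and offsetof() on a made-up subset of the 32-bit layout, is below.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct toy_smram32_tail {
	uint32_t gprs[8];	/* 0xFFD0 - 0xFFEC relative to SMBASE + 0x8000 */
	uint32_t eip;		/* 0xFFF0 */
	uint32_t eflags;	/* 0xFFF4 */
	uint32_t cr3;		/* 0xFFF8 */
	uint32_t cr0;		/* 0xFFFC */
};

/* The tail above starts at 0xFFD0; check each field lands where it must. */
#define CHECK_TAIL_OFFSET(field, offset) \
	static_assert(offsetof(struct toy_smram32_tail, field) == (offset) - 0xFFD0, \
		      "bad layout for " #field)

CHECK_TAIL_OFFSET(gprs, 0xFFD0);
CHECK_TAIL_OFFSET(eip, 0xFFF0);
CHECK_TAIL_OFFSET(eflags, 0xFFF4);
CHECK_TAIL_OFFSET(cr3, 0xFFF8);
CHECK_TAIL_OFFSET(cr0, 0xFFFC);

int main(void)
{
	/* Compiles only if the layout matches; nothing to do at run time. */
	return 0;
}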
diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c
new file mode 100644
index 000000000000..088f6429b24c
--- /dev/null
+++ b/arch/x86/kvm/svm/hyperv.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD SVM specific code for Hyper-V on KVM.
+ *
+ * Copyright 2022 Red Hat, Inc. and/or its affiliates.
+ */
+#include "hyperv.h"
+
+void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH;
+ svm->vmcb->control.exit_info_2 = 0;
+ nested_svm_vmexit(svm);
+}
diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h
index 7d6d97968fb9..02f4784b5d44 100644
--- a/arch/x86/kvm/svm/hyperv.h
+++ b/arch/x86/kvm/svm/hyperv.h
@@ -9,27 +9,37 @@
#include <asm/mshyperv.h>
#include "../hyperv.h"
+#include "svm.h"
-/*
- * Hyper-V uses the software reserved 32 bytes in VMCB
- * control area to expose SVM enlightenments to guests.
- */
-struct hv_enlightenments {
- struct __packed hv_enlightenments_control {
- u32 nested_flush_hypercall:1;
- u32 msr_bitmap:1;
- u32 enlightened_npt_tlb: 1;
- u32 reserved:29;
- } __packed hv_enlightenments_control;
- u32 hv_vp_id;
- u64 hv_vm_id;
- u64 partition_assist_page;
- u64 reserved;
-} __packed;
+static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
-/*
- * Hyper-V uses the software reserved clean bit in VMCB
- */
-#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW
+ if (!hv_vcpu)
+ return;
+
+ hv_vcpu->nested.pa_page_gpa = hve->partition_assist_page;
+ hv_vcpu->nested.vm_id = hve->hv_vm_id;
+ hv_vcpu->nested.vp_id = hve->hv_vp_id;
+}
+
+static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ if (!hve->hv_enlightenments_control.nested_flush_hypercall)
+ return false;
+
+ return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
+}
+
+void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 4c620999d230..bc9cd7086fa9 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -25,6 +25,7 @@
#include "trace.h"
#include "mmu.h"
#include "x86.h"
+#include "smm.h"
#include "cpuid.h"
#include "lapic.h"
#include "svm.h"
@@ -149,8 +150,12 @@ void recalc_intercepts(struct vcpu_svm *svm)
vmcb_clr_intercept(c, INTERCEPT_VINTR);
}
- /* We don't want to see VMMCALLs from a nested guest */
- vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
+ /*
+ * We want to see VMMCALLs from a nested guest only when Hyper-V L2 TLB
+ * flush feature is enabled.
+ */
+ if (!nested_svm_l2_tlb_flush_enabled(&svm->vcpu))
+ vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
for (i = 0; i < MAX_INTERCEPT; i++)
c->intercepts[i] |= g->intercepts[i];
@@ -179,8 +184,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
*/
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
- struct hv_enlightenments *hve =
- (struct hv_enlightenments *)svm->nested.ctl.reserved_sw;
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
int i;
/*
@@ -194,7 +198,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
if (!svm->nested.force_msr_bitmap_recalc &&
kvm_hv_hypercall_enabled(&svm->vcpu) &&
hve->hv_enlightenments_control.msr_bitmap &&
- (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS)))
+ (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
goto set_msrpm_base_pa;
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
@@ -369,8 +373,8 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
/* Hyper-V extensions (Enlightened VMCB) */
if (kvm_hv_hypercall_enabled(vcpu)) {
to->clean = from->clean;
- memcpy(to->reserved_sw, from->reserved_sw,
- sizeof(struct hv_enlightenments));
+ memcpy(&to->hv_enlightenments, &from->hv_enlightenments,
+ sizeof(to->hv_enlightenments));
}
}
@@ -474,6 +478,15 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
{
/*
+ * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+ * L2's VP_ID upon request from the guest. Make sure we check for
+ * pending entries in the right FIFO upon L1/L2 transition as these
+ * requests are put by other vCPUs asynchronously.
+ */
+ if (to_hv_vcpu(vcpu) && npt_enabled)
+ kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+
+ /*
* TODO: optimize unconditional TLB flush/MMU sync. A partial list of
* things to fix before this can be conditional:
*
@@ -800,6 +813,8 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
if (kvm_vcpu_apicv_active(vcpu))
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ nested_svm_hv_update_vm_vp_ids(vcpu);
+
return 0;
}
@@ -822,6 +837,13 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
return 1;
}
+ /* This fails when VP assist page is enabled but the supplied GPA is bogus */
+ ret = kvm_hv_verify_vp_assist(vcpu);
+ if (ret) {
+ kvm_inject_gp(vcpu, 0);
+ return ret;
+ }
+
vmcb12_gpa = svm->vmcb->save.rax;
ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
if (ret == -EINVAL) {
@@ -1091,6 +1113,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SHUTDOWN))
+ return;
+
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
}
@@ -1125,6 +1153,9 @@ void svm_free_nested(struct vcpu_svm *svm)
if (!svm->nested.initialized)
return;
+ if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr))
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
svm_vcpu_free_msrpm(svm->nested.msrpm);
svm->nested.msrpm = NULL;
@@ -1143,9 +1174,6 @@ void svm_free_nested(struct vcpu_svm *svm)
svm->nested.initialized = false;
}
-/*
- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
- */
void svm_leave_nested(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1377,6 +1405,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return 0;
}
+#ifdef CONFIG_KVM_SMM
if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
if (block_nested_events)
return -EBUSY;
@@ -1385,6 +1414,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
return 0;
}
+#endif
if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
if (block_nested_events)
@@ -1411,6 +1441,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
int nested_svm_exit_special(struct vcpu_svm *svm)
{
u32 exit_code = svm->vmcb->control.exit_code;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
switch (exit_code) {
case SVM_EXIT_INTR:
@@ -1429,6 +1460,13 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
return NESTED_EXIT_HOST;
break;
}
+ case SVM_EXIT_VMMCALL:
+ /* Hyper-V L2 TLB flush hypercall is handled by L0 */
+ if (guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+ nested_svm_l2_tlb_flush_enabled(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu))
+ return NESTED_EXIT_HOST;
+ break;
default:
break;
}
@@ -1479,7 +1517,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
dst->virt_ext = from->virt_ext;
dst->pause_filter_count = from->pause_filter_count;
dst->pause_filter_thresh = from->pause_filter_thresh;
- /* 'clean' and 'reserved_sw' are not changed by KVM */
+ /* 'clean' and 'hv_enlightenments' are not changed by KVM */
}
static int svm_get_nested_state(struct kvm_vcpu *vcpu,
@@ -1709,6 +1747,9 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
return false;
}
+ if (kvm_hv_verify_vp_assist(vcpu))
+ return false;
+
return true;
}
@@ -1720,4 +1761,5 @@ struct kvm_x86_nested_ops svm_nested_ops = {
.get_nested_state_pages = svm_get_nested_state_pages,
.get_state = svm_get_nested_state,
.set_state = svm_set_nested_state,
+ .hv_inject_synthetic_vmexit_post_tlb_flush = svm_hv_inject_synthetic_vmexit_post_tlb_flush,
};
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 9d65cd095691..0e313fbae055 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -159,7 +159,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
data &= ~pmu->reserved_bits;
if (data != pmc->eventsel) {
pmc->eventsel = data;
- reprogram_counter(pmc);
+ kvm_pmu_request_counter_reprogam(pmc);
}
return 0;
}
@@ -212,7 +212,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
struct kvm_pmc *pmc = &pmu->gp_counters[i];
pmc_stop_counter(pmc);
- pmc->counter = pmc->eventsel = 0;
+ pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
}
}
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index efaaef2b7ae1..86d6897f4806 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -465,9 +465,9 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
return;
for (i = 0; i < npages; i++) {
- page_virtual = kmap_atomic(pages[i]);
+ page_virtual = kmap_local_page(pages[i]);
clflush_cache_range(page_virtual, PAGE_SIZE);
- kunmap_atomic(page_virtual);
+ kunmap_local(page_virtual);
cond_resched();
}
}
@@ -2648,7 +2648,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
ghcb_scratch_beg = control->ghcb_gpa +
offsetof(struct ghcb, shared_buffer);
ghcb_scratch_end = control->ghcb_gpa +
- offsetof(struct ghcb, reserved_1);
+ offsetof(struct ghcb, reserved_0xff0);
/*
* If the scratch area begins within the GHCB, it must be
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 4b6d2b050e57..9a194aa1a75a 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -6,6 +6,7 @@
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
+#include "smm.h"
#include "cpuid.h"
#include "pmu.h"
@@ -346,12 +347,6 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
return 0;
}
-static int is_external_interrupt(u32 info)
-{
- info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
- return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
-}
-
static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1438,6 +1433,7 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
*/
svm_clear_current_vmcb(svm->vmcb);
+ svm_leave_nested(vcpu);
svm_free_nested(svm);
sev_free_vcpu(vcpu);
@@ -2713,8 +2709,6 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr)
if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
break;
- case MSR_IA32_PERF_CAPABILITIES:
- return 0;
default:
return KVM_MSR_RET_INVALID;
}
@@ -3425,15 +3419,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return 0;
}
- if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
- exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
- exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
- exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
- printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
- "exit_code 0x%x\n",
- __func__, svm->vmcb->control.exit_int_info,
- exit_code);
-
if (exit_fastpath != EXIT_FASTPATH_NONE)
return 1;
@@ -3738,6 +3723,13 @@ static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
/*
+ * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
+ * A TLB flush for the current ASID flushes both "host" and "guest" TLB
+ * entries, and thus is a superset of Hyper-V's fine grained flushing.
+ */
+ kvm_hv_vcpu_purge_flush_tlb(vcpu);
+
+ /*
* Flush only the current ASID even if the TLB flush was invoked via
* kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
* ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
@@ -3903,8 +3895,14 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
- if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
- to_svm(vcpu)->vmcb->control.exit_info_1)
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+ /*
+ * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
+ * can't read guest memory (dereference memslots) to decode the WRMSR.
+ */
+ if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 &&
+ nrips && control->next_rip)
return handle_fastpath_set_msr_irqoff(vcpu);
return EXIT_FASTPATH_NONE;
@@ -4116,6 +4114,8 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
return false;
case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ return false;
/* SEV-ES guests do not support SMM, so report false */
if (kvm && sev_es_guest(kvm))
return false;
@@ -4372,6 +4372,7 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu)
vcpu->arch.mcg_cap &= 0x1ff;
}
+#ifdef CONFIG_KVM_SMM
bool svm_smi_blocked(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4399,7 +4400,7 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return 1;
}
-static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_host_map map_save;
@@ -4408,10 +4409,16 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
if (!is_guest_mode(vcpu))
return 0;
- /* FED8h - SVM Guest */
- put_smstate(u64, smstate, 0x7ed8, 1);
- /* FEE0h - SVM Guest VMCB Physical Address */
- put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
+ /*
+ * 32-bit SMRAM format doesn't preserve EFER and SVM state. Userspace is
+ * responsible for ensuring nested SVM and SMIs are mutually exclusive.
+ */
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return 1;
+
+ smram->smram64.svm_guest_flag = 1;
+ smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -4433,8 +4440,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
* that, see svm_prepare_switch_to_guest()) which must be
* preserved.
*/
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
- &map_save) == -EINVAL)
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
return 1;
BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
@@ -4446,34 +4452,33 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
return 0;
}
-static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_host_map map, map_save;
- u64 saved_efer, vmcb12_gpa;
struct vmcb *vmcb12;
int ret;
+ const struct kvm_smram_state_64 *smram64 = &smram->smram64;
+
if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
return 0;
/* Non-zero if SMI arrived while vCPU was in guest mode. */
- if (!GET_SMSTATE(u64, smstate, 0x7ed8))
+ if (!smram64->svm_guest_flag)
return 0;
if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
return 1;
- saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
- if (!(saved_efer & EFER_SVME))
+ if (!(smram64->efer & EFER_SVME))
return 1;
- vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
return 1;
ret = 1;
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
goto unmap_map;
if (svm_allocate_nested(svm))
@@ -4495,7 +4500,7 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
vmcb12 = map.hva;
nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
- ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
+ ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
if (ret)
goto unmap_save;
@@ -4521,6 +4526,7 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
/* We must be in SMM; RSM will cause a vmexit anyway. */
}
}
+#endif
static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len)
@@ -4796,10 +4802,12 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.pi_update_irte = avic_pi_update_irte,
.setup_mce = svm_setup_mce,
+#ifdef CONFIG_KVM_SMM
.smi_allowed = svm_smi_allowed,
.enter_smm = svm_enter_smm,
.leave_smm = svm_leave_smm,
.enable_smi_window = svm_enable_smi_window,
+#endif
.mem_enc_ioctl = sev_mem_enc_ioctl,
.mem_enc_register_region = sev_mem_enc_register_region,
@@ -4865,6 +4873,7 @@ static __init void svm_set_cpu_caps(void)
{
kvm_set_cpu_caps();
+ kvm_caps.supported_perf_cap = 0;
kvm_caps.supported_xss = 0;
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
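[Annotation, not part of the diff] The svm_enter_smm()/svm_leave_smm() hunks above replace raw put_smstate()/GET_SMSTATE() accesses at magic offsets (0x7ed0, 0x7ed8, 0x7ee0) with named fields of union kvm_smram. Below is a minimal userspace sketch of that technique, assuming an illustrative layout: the demo_* names, the 512-byte buffer size and the 0xd0/0xd8/0xe0 offsets are assumptions for the demo, not the real kvm_smram definition. Pinning the offsets with static asserts is what lets the named-field accesses safely replace the open-coded offset arithmetic.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical 64-bit state-save layout; only the fields of interest are typed. */
struct demo_smram_state_64 {
	uint8_t  reserved1[0xd0];
	uint64_t efer;                /* stands in for the 0x7ed0 slot */
	uint64_t svm_guest_flag;      /* stands in for the 0x7ed8 slot */
	uint64_t svm_guest_vmcb_gpa;  /* stands in for the 0x7ee0 slot */
	uint8_t  reserved2[512 - 0xe8];
};

union demo_smram {
	struct demo_smram_state_64 smram64;
	uint8_t bytes[512];
};

/* Compile-time layout checks take the place of trusting magic offsets. */
static_assert(offsetof(struct demo_smram_state_64, efer) == 0xd0, "efer slot");
static_assert(offsetof(struct demo_smram_state_64, svm_guest_flag) == 0xd8, "flag slot");
static_assert(offsetof(struct demo_smram_state_64, svm_guest_vmcb_gpa) == 0xe0, "vmcb gpa slot");
static_assert(sizeof(union demo_smram) == 512, "save-area size");

int main(void)
{
	union demo_smram smram;

	memset(&smram, 0, sizeof(smram));

	/* Named fields instead of put_smstate(u64, buf, 0x7ed8, 1) and friends. */
	smram.smram64.svm_guest_flag = 1;
	smram.smram64.svm_guest_vmcb_gpa = 0x123456000ULL;

	/* On a little-endian host the flag's low byte lands at offset 0xd8. */
	printf("byte at 0xd8: %u\n", (unsigned)smram.bytes[0xd8]);
	return 0;
}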
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 199a2ecef1ce..4826e6cc611b 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -151,7 +151,10 @@ struct vmcb_ctrl_area_cached {
u64 nested_cr3;
u64 virt_ext;
u32 clean;
- u8 reserved_sw[32];
+ union {
+ struct hv_vmcb_enlightenments hv_enlightenments;
+ u8 reserved_sw[32];
+ };
};
struct svm_nested_state {
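[Annotation, not part of the diff] The svm.h hunk above overlays struct hv_vmcb_enlightenments on the 32 software-reserved bytes through an anonymous union, and the svm_onhyperv.h hunk further down adds a BUILD_BUG_ON() so the typed view cannot outgrow the raw one. A self-contained sketch of that overlay pattern follows; the demo_* types and their field layout are assumptions for illustration only.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical enlightenments struct; the real one is hv_vmcb_enlightenments. */
struct demo_enlightenments {
	uint64_t partition_assist_page;
	uint64_t vm_id;
	uint32_t vp_id;
	uint32_t flags;
	uint64_t padding;
};

struct demo_ctrl_area {
	uint32_t clean;
	/* Typed view and raw view alias the same 32 software-reserved bytes. */
	union {
		struct demo_enlightenments hv;
		uint8_t reserved_sw[32];
	};
};

/* Same guarantee as the BUILD_BUG_ON() added in svm_hv_init_vmcb(): the
 * typed view must not outgrow the reserved bytes it aliases. */
static_assert(sizeof(struct demo_enlightenments) ==
	      sizeof(((struct demo_ctrl_area *)0)->reserved_sw),
	      "enlightenments must fit the software-reserved area");

int main(void)
{
	struct demo_ctrl_area ctrl = { 0 };

	ctrl.hv.vp_id = 3;	/* typed access... */

	/* ...while the raw view still works; little-endian host assumed. */
	printf("raw byte 16: %u\n", (unsigned)ctrl.reserved_sw[16]);
	return 0;
}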
diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c
index 8cdc62c74a96..26a89d0da93e 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.c
+++ b/arch/x86/kvm/svm/svm_onhyperv.c
@@ -14,9 +14,9 @@
#include "kvm_onhyperv.h"
#include "svm_onhyperv.h"
-int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
- struct hv_enlightenments *hve;
+ struct hv_vmcb_enlightenments *hve;
struct hv_partition_assist_pg **p_hv_pa_pg =
&to_kvm_hv(vcpu->kvm)->hv_pa_pg;
@@ -26,13 +26,13 @@ int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
if (!*p_hv_pa_pg)
return -ENOMEM;
- hve = (struct hv_enlightenments *)to_svm(vcpu)->vmcb->control.reserved_sw;
+ hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments;
hve->partition_assist_page = __pa(*p_hv_pa_pg);
hve->hv_vm_id = (unsigned long)vcpu->kvm;
if (!hve->hv_enlightenments_control.nested_flush_hypercall) {
hve->hv_enlightenments_control.nested_flush_hypercall = 1;
- vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+ vmcb_mark_dirty(to_svm(vcpu)->vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
}
return 0;
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index e2fc59380465..45faf84476ce 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -13,12 +13,14 @@
static struct kvm_x86_ops svm_x86_ops;
-int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu);
+int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu);
static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
{
- struct hv_enlightenments *hve =
- (struct hv_enlightenments *)vmcb->control.reserved_sw;
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
+
+ BUILD_BUG_ON(sizeof(vmcb->control.hv_enlightenments) !=
+ sizeof(vmcb->control.reserved_sw));
if (npt_enabled &&
ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
@@ -51,8 +53,8 @@ static inline void svm_hv_hardware_setup(void)
vp_ap->nested_control.features.directhypercall = 1;
}
- svm_x86_ops.enable_direct_tlbflush =
- svm_hv_enable_direct_tlbflush;
+ svm_x86_ops.enable_l2_tlb_flush =
+ svm_hv_enable_l2_tlb_flush;
}
}
@@ -60,23 +62,20 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments(
struct kvm_vcpu *vcpu)
{
struct vmcb *vmcb = to_svm(vcpu)->vmcb;
- struct hv_enlightenments *hve =
- (struct hv_enlightenments *)vmcb->control.reserved_sw;
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
if (hve->hv_enlightenments_control.msr_bitmap)
- vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+ vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
}
-static inline void svm_hv_update_vp_id(struct vmcb *vmcb,
- struct kvm_vcpu *vcpu)
+static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu)
{
- struct hv_enlightenments *hve =
- (struct hv_enlightenments *)vmcb->control.reserved_sw;
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
u32 vp_index = kvm_hv_get_vpindex(vcpu);
if (hve->hv_vp_id != vp_index) {
hve->hv_vp_id = vp_index;
- vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+ vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
}
}
#else
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 34367dc203f2..8e8295e774f0 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/asm.h>
+#include <asm/asm-offsets.h>
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/nospec-branch.h>
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index bc25589ad588..83843379813e 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -113,12 +113,13 @@ TRACE_EVENT(kvm_hv_hypercall_done,
* Tracepoint for Xen hypercall.
*/
TRACE_EVENT(kvm_xen_hypercall,
- TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
- unsigned long a2, unsigned long a3, unsigned long a4,
- unsigned long a5),
- TP_ARGS(nr, a0, a1, a2, a3, a4, a5),
+ TP_PROTO(u8 cpl, unsigned long nr,
+ unsigned long a0, unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4, unsigned long a5),
+ TP_ARGS(cpl, nr, a0, a1, a2, a3, a4, a5),
TP_STRUCT__entry(
+ __field(u8, cpl)
__field(unsigned long, nr)
__field(unsigned long, a0)
__field(unsigned long, a1)
@@ -129,6 +130,7 @@ TRACE_EVENT(kvm_xen_hypercall,
),
TP_fast_assign(
+ __entry->cpl = cpl;
__entry->nr = nr;
__entry->a0 = a0;
__entry->a1 = a1;
@@ -138,8 +140,9 @@ TRACE_EVENT(kvm_xen_hypercall,
__entry->a4 = a5;
),
- TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx",
- __entry->nr, __entry->a0, __entry->a1, __entry->a2,
+ TP_printk("cpl %d nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx",
+ __entry->cpl, __entry->nr,
+ __entry->a0, __entry->a1, __entry->a2,
__entry->a3, __entry->a4, __entry->a5)
);
@@ -1547,38 +1550,41 @@ TRACE_EVENT(kvm_hv_timer_state,
* Tracepoint for kvm_hv_flush_tlb.
*/
TRACE_EVENT(kvm_hv_flush_tlb,
- TP_PROTO(u64 processor_mask, u64 address_space, u64 flags),
- TP_ARGS(processor_mask, address_space, flags),
+ TP_PROTO(u64 processor_mask, u64 address_space, u64 flags, bool guest_mode),
+ TP_ARGS(processor_mask, address_space, flags, guest_mode),
TP_STRUCT__entry(
__field(u64, processor_mask)
__field(u64, address_space)
__field(u64, flags)
+ __field(bool, guest_mode)
),
TP_fast_assign(
__entry->processor_mask = processor_mask;
__entry->address_space = address_space;
__entry->flags = flags;
+ __entry->guest_mode = guest_mode;
),
- TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx",
+ TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx %s",
__entry->processor_mask, __entry->address_space,
- __entry->flags)
+ __entry->flags, __entry->guest_mode ? "(L2)" : "")
);
/*
* Tracepoint for kvm_hv_flush_tlb_ex.
*/
TRACE_EVENT(kvm_hv_flush_tlb_ex,
- TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags),
- TP_ARGS(valid_bank_mask, format, address_space, flags),
+ TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags, bool guest_mode),
+ TP_ARGS(valid_bank_mask, format, address_space, flags, guest_mode),
TP_STRUCT__entry(
__field(u64, valid_bank_mask)
__field(u64, format)
__field(u64, address_space)
__field(u64, flags)
+ __field(bool, guest_mode)
),
TP_fast_assign(
@@ -1586,12 +1592,14 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex,
__entry->format = format;
__entry->address_space = address_space;
__entry->flags = flags;
+ __entry->guest_mode = guest_mode;
),
TP_printk("valid_bank_mask 0x%llx format 0x%llx "
- "address_space 0x%llx flags 0x%llx",
+ "address_space 0x%llx flags 0x%llx %s",
__entry->valid_bank_mask, __entry->format,
- __entry->address_space, __entry->flags)
+ __entry->address_space, __entry->flags,
+ __entry->guest_mode ? "(L2)" : "")
);
/*
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 07254314f3dd..cd2ac9536c99 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -395,30 +395,6 @@ static inline bool vmx_pebs_supported(void)
return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
}
-static inline u64 vmx_get_perf_capabilities(void)
-{
- u64 perf_cap = PMU_CAP_FW_WRITES;
- struct x86_pmu_lbr lbr;
- u64 host_perf_cap = 0;
-
- if (!enable_pmu)
- return 0;
-
- if (boot_cpu_has(X86_FEATURE_PDCM))
- rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
-
- if (x86_perf_get_lbr(&lbr) >= 0 && lbr.nr)
- perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
-
- if (vmx_pebs_supported()) {
- perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
- if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
- perf_cap &= ~PERF_CAP_PEBS_BASELINE;
- }
-
- return perf_cap;
-}
-
static inline bool cpu_has_notify_vmexit(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/hyperv.c
index d8b23c96d627..ae03d1fe0355 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -3,9 +3,9 @@
#include <linux/errno.h>
#include <linux/smp.h>
-#include "../hyperv.h"
#include "../cpuid.h"
-#include "evmcs.h"
+#include "hyperv.h"
+#include "nested.h"
#include "vmcs.h"
#include "vmx.h"
#include "trace.h"
@@ -322,24 +322,17 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
};
const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
-bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa)
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu)
{
- struct hv_vp_assist_page assist_page;
-
- *evmcs_gpa = -1ull;
-
- if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
- return false;
-
- if (unlikely(!assist_page.enlighten_vmentry))
- return false;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
- if (unlikely(!evmptr_is_valid(assist_page.current_nested_vmcs)))
- return false;
+ if (unlikely(kvm_hv_get_assist_page(vcpu)))
+ return EVMPTR_INVALID;
- *evmcs_gpa = assist_page.current_nested_vmcs;
+ if (unlikely(!hv_vcpu->vp_assist_page.enlighten_vmentry))
+ return EVMPTR_INVALID;
- return true;
+ return hv_vcpu->vp_assist_page.current_nested_vmcs;
}
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
@@ -507,3 +500,23 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu,
return 0;
}
+
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+ if (!hv_vcpu || !evmcs)
+ return false;
+
+ if (!evmcs->hv_enlightenments_control.nested_flush_hypercall)
+ return false;
+
+ return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
+}
+
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ nested_vmx_vmexit(vcpu, HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH, 0, 0);
+}
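[Annotation, not part of the diff] The nested_get_evmptr() hunk above collapses a bool return plus out-parameter into a single u64 return that uses EVMPTR_INVALID (the removed code stored -1ull) as the sentinel. A small sketch of that API-shape change, with hypothetical demo_* names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Sentinel in the spirit of EVMPTR_INVALID; the removed code used -1ull. */
#define DEMO_INVALID_PTR UINT64_MAX

/* Old shape: bool result plus an out parameter the caller must also handle. */
static bool demo_lookup_old(bool present, uint64_t stored, uint64_t *out)
{
	*out = DEMO_INVALID_PTR;
	if (!present)
		return false;
	*out = stored;
	return true;
}

/* New shape: a single return value, validity encoded by the sentinel. */
static uint64_t demo_lookup(bool present, uint64_t stored)
{
	return present ? stored : DEMO_INVALID_PTR;
}

int main(void)
{
	uint64_t gpa;

	if (demo_lookup_old(true, 0x1000, &gpa))
		printf("old API: 0x%llx\n", (unsigned long long)gpa);

	gpa = demo_lookup(true, 0x1000);
	if (gpa != DEMO_INVALID_PTR)
		printf("new API: 0x%llx\n", (unsigned long long)gpa);
	return 0;
}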
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/hyperv.h
index 6f746ef3c038..571e7929d14e 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __KVM_X86_VMX_EVMCS_H
-#define __KVM_X86_VMX_EVMCS_H
+#ifndef __KVM_X86_VMX_HYPERV_H
+#define __KVM_X86_VMX_HYPERV_H
#include <linux/jump_label.h>
@@ -8,6 +8,8 @@
#include <asm/mshyperv.h>
#include <asm/vmx.h>
+#include "../hyperv.h"
+
#include "capabilities.h"
#include "vmcs.h"
#include "vmcs12.h"
@@ -235,11 +237,13 @@ enum nested_evmptrld_status {
EVMPTRLD_ERROR,
};
-bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa);
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu);
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu);
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
-#endif /* __KVM_X86_VMX_EVMCS_H */
+#endif /* __KVM_X86_VMX_HYPERV_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0c62352dda6a..b6f4411b613e 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -7,7 +7,6 @@
#include <asm/mmu_context.h>
#include "cpuid.h"
-#include "evmcs.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
@@ -16,6 +15,7 @@
#include "trace.h"
#include "vmx.h"
#include "x86.h"
+#include "smm.h"
static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
@@ -225,6 +225,7 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
@@ -233,6 +234,12 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
}
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+
+ if (hv_vcpu) {
+ hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
+ hv_vcpu->nested.vm_id = 0;
+ hv_vcpu->nested.vp_id = 0;
+ }
}
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
@@ -1126,6 +1133,15 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
+ * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+ * L2's VP_ID upon request from the guest. Make sure we check for
+ * pending entries in the right FIFO upon L1/L2 transition as these
+ * requests are put by other vCPUs asynchronously.
+ */
+ if (to_hv_vcpu(vcpu) && enable_ept)
+ kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+
+ /*
* If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
* for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
* full TLB flush from the guest's perspective. This is required even
@@ -1557,12 +1573,20 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields
{
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
vmcs12->tpr_threshold = evmcs->tpr_threshold;
vmcs12->guest_rip = evmcs->guest_rip;
if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
+ hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
+ hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
+ hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
+ }
+
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
vmcs12->guest_rsp = evmcs->guest_rsp;
vmcs12->guest_rflags = evmcs->guest_rflags;
@@ -1977,7 +2001,8 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
if (likely(!guest_cpuid_has_evmcs(vcpu)))
return EVMPTRLD_DISABLED;
- if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
+ evmcs_gpa = nested_get_evmptr(vcpu);
+ if (!evmptr_is_valid(evmcs_gpa)) {
nested_release_evmcs(vcpu);
return EVMPTRLD_DISABLED;
}
@@ -2563,12 +2588,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
nested_ept_init_mmu_context(vcpu);
/*
- * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
- * bits which we consider mandatory enabled.
- * The CR0_READ_SHADOW is what L2 should have expected to read given
- * the specifications by L1; It's not enough to take
- * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
- * have more bits than L1 expected.
+ * Override the CR0/CR4 read shadows after setting the effective guest
+ * CR0/CR4. The common helpers also set the shadows, but they don't
+ * account for vmcs12's cr0/4_guest_host_mask.
*/
vmx_set_cr0(vcpu, vmcs12->guest_cr0);
vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
@@ -3251,6 +3273,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
+ /*
+ * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
+ * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory
+ * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post
+ * migration.
+ */
if (!nested_get_evmcs_page(vcpu)) {
pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
__func__);
@@ -4767,6 +4795,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ /*
+ * If IBRS is advertised to the vCPU, KVM must flush the indirect
+ * branch predictors when transitioning from L2 to L1, as L1 expects
+ * hardware (KVM in this case) to provide separate predictor modes.
+ * Bare metal isolates VMX root (host) from VMX non-root (guest), but
+ * doesn't isolate different VMCSs, i.e. in this case, doesn't provide
+ * separate modes for L2 vs L1.
+ */
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ indirect_branch_prediction_barrier();
+
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
@@ -4854,6 +4893,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
{
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
}
@@ -5099,24 +5139,35 @@ static int handle_vmxon(struct kvm_vcpu *vcpu)
| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
/*
- * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks
- * that have higher priority than VM-Exit (see Intel SDM's pseudocode
- * for VMXON), as KVM must load valid CR0/CR4 values into hardware while
- * running the guest, i.e. KVM needs to check the _guest_ values.
+ * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter
+ * the guest and so cannot rely on hardware to perform the check,
+ * which has higher priority than VM-Exit (see Intel SDM's pseudocode
+ * for VMXON).
*
- * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and
- * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real
- * Mode, but KVM will never take the guest out of those modes.
+ * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
+ * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't
+ * force any of the relevant guest state. For a restricted guest, KVM
+ * does force CR0.PE=1, but only to also force VM86 in order to emulate
+ * Real Mode, and so there's no need to check CR0.PE manually.
*/
- if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
- !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
/*
- * CPL=0 and all other checks that are lower priority than VM-Exit must
- * be checked manually.
+ * The CPL is checked for "not in VMX operation" and for "in VMX root",
+ * and has higher priority than the VM-Fail due to being post-VMXON,
+ * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root,
+ * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
+ * from L2 to L1, i.e. there's no need to check for the vCPU being in
+ * VMX non-root.
+ *
+ * Forwarding the VM-Exit unconditionally, i.e. without performing the
+ * #UD checks (see above), is functionally ok because KVM doesn't allow
+ * L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's
+ * CR0 or CR4, i.e. it's L1's responsibility to emulate #UDs that are
+ * missed by hardware due to shadowing CR0 and/or CR4.
*/
if (vmx_get_cpl(vcpu)) {
kvm_inject_gp(vcpu, 0);
@@ -5126,6 +5177,17 @@ static int handle_vmxon(struct kvm_vcpu *vcpu)
if (vmx->nested.vmxon)
return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+ /*
+ * Invalid CR0/CR4 generates #GP. These checks are performed if and
+ * only if the vCPU isn't already in VMX operation, i.e. effectively
+ * have lower priority than the VM-Fail above.
+ */
+ if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
+ !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
!= VMXON_NEEDED_FEATURES) {
kvm_inject_gp(vcpu, 0);
@@ -5205,7 +5267,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 zero = 0;
gpa_t vmptr;
- u64 evmcs_gpa;
int r;
if (!nested_vmx_check_permission(vcpu))
@@ -5231,7 +5292,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
* vmx->nested.hv_evmcs but this shouldn't be a problem.
*/
if (likely(!guest_cpuid_has_evmcs(vcpu) ||
- !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
+ !evmptr_is_valid(nested_get_evmptr(vcpu)))) {
if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vcpu);
@@ -6128,6 +6189,11 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
* Handle L2's bus locks in L0 directly.
*/
return true;
+ case EXIT_REASON_VMCALL:
+ /* Hyper-V L2 TLB flush hypercall is handled by L0 */
+ return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+ nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu);
default:
break;
}
@@ -6440,9 +6506,6 @@ out:
return kvm_state.size;
}
-/*
- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
- */
void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu)) {
@@ -6982,4 +7045,5 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
.write_log_dirty = nested_vmx_write_pml_buffer,
.enable_evmcs = nested_enable_evmcs,
.get_evmcs_version = nested_get_evmcs_version,
+ .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
};
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 6312c9541c3c..96952263b029 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -79,9 +79,10 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
}
/*
- * Return the cr0 value that a nested guest would read. This is a combination
- * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
- * its hypervisor (cr0_read_shadow).
+ * Return the cr0/4 value that a nested guest would read. This is a combination
+ * of L1's "real" cr0 used to run the guest (guest_cr0), and the bits shadowed
+ * by the L1 hypervisor (cr0_read_shadow). KVM must emulate CPU behavior as
+ * the value+mask loaded into vmcs02 may not match the vmcs12 fields.
*/
static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
{
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 10b33da9bd05..e5cec07ca8d9 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -52,7 +52,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
__set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
- reprogram_counter(pmc);
+ kvm_pmu_request_counter_reprogam(pmc);
}
}
@@ -76,7 +76,7 @@ static void reprogram_counters(struct kvm_pmu *pmu, u64 diff)
for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) {
pmc = intel_pmc_idx_to_pmc(pmu, bit);
if (pmc)
- reprogram_counter(pmc);
+ kvm_pmu_request_counter_reprogam(pmc);
}
}
@@ -477,7 +477,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
if (!(data & reserved_bits)) {
pmc->eventsel = data;
- reprogram_counter(pmc);
+ kvm_pmu_request_counter_reprogam(pmc);
return 0;
}
} else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
@@ -631,7 +631,6 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
pmu->fixed_counters[i].current_config = 0;
}
- vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
lbr_desc->records.nr = 0;
lbr_desc->event = NULL;
lbr_desc->msr_passthrough = false;
@@ -647,14 +646,14 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
pmc = &pmu->gp_counters[i];
pmc_stop_counter(pmc);
- pmc->counter = pmc->eventsel = 0;
+ pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
}
for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
pmc = &pmu->fixed_counters[i];
pmc_stop_counter(pmc);
- pmc->counter = 0;
+ pmc->counter = pmc->prev_counter = 0;
}
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 8f95c7c01433..b12da2a6dec9 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -182,8 +182,10 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
/* Enforce CPUID restriction on max enclave size. */
max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
sgx_12_0->edx;
- if (size >= BIT_ULL(max_size_log2))
+ if (size >= BIT_ULL(max_size_log2)) {
kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
/*
* sgx_virt_ecreate() returns:
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 746129ddd5ae..01936013428b 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -208,9 +208,8 @@ struct __packed vmcs12 {
/*
* For save/restore compatibility, the vmcs12 field offsets must not change.
*/
-#define CHECK_OFFSET(field, loc) \
- BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \
- "Offset of " #field " in struct vmcs12 has changed.")
+#define CHECK_OFFSET(field, loc) \
+ ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc)
static inline void vmx_check_vmcs12_offsets(void)
{
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 0b5db4de4d09..766c6b3ef5ed 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -269,6 +269,7 @@ SYM_FUNC_END(__vmx_vcpu_run)
.section .text, "ax"
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/**
* vmread_error_trampoline - Trampoline from inline asm to vmread_error()
* @field: VMCS field encoding that failed
@@ -317,6 +318,7 @@ SYM_FUNC_START(vmread_error_trampoline)
RET
SYM_FUNC_END(vmread_error_trampoline)
+#endif
SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
/*
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 63247c57c72c..fe5615fd8295 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -51,7 +51,6 @@
#include "capabilities.h"
#include "cpuid.h"
-#include "evmcs.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
#include "irq.h"
@@ -66,6 +65,7 @@
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
+#include "smm.h"
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
@@ -526,7 +526,7 @@ static unsigned long host_idt_base;
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
-static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
struct hv_enlightened_vmcs *evmcs;
struct hv_partition_assist_pg **p_hv_pa_pg =
@@ -858,7 +858,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
* to change it directly without causing a vmexit. In that case read
* it after vmexit and store it in vmx->spec_ctrl.
*/
- if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
+ if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
flags |= VMX_RUN_SAVE_SPEC_CTRL;
return flags;
@@ -1348,8 +1348,10 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
/*
* No indirect branch prediction barrier needed when switching
- * the active VMCS within a guest, e.g. on nested VM-Enter.
- * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
+ * the active VMCS within a vCPU, unless IBRS is advertised to
+ * the vCPU. To minimize the number of IBPBs executed, KVM
+ * performs IBPB on nested VM-Exit (a single nested transition
+ * may switch the active VMCS multiple times).
*/
if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
indirect_branch_prediction_barrier();
@@ -1834,12 +1836,42 @@ bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
}
-static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
- uint64_t val)
+/*
+ * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
+ * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain
+ * backwards compatibility even though KVM doesn't support emulating SMX. And
+ * because userspace set "VMX in SMX", the guest must also be allowed to set it,
+ * e.g. if the MSR is left unlocked and the guest does a RMW operation.
+ */
+#define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
+ FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
+ FEAT_CTL_SGX_LC_ENABLED | \
+ FEAT_CTL_SGX_ENABLED | \
+ FEAT_CTL_LMCE_ENABLED)
+
+static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
+ struct msr_data *msr)
{
- uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+ uint64_t valid_bits;
+
+ /*
+ * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
+ * exposed to the guest.
+ */
+ WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
+ ~KVM_SUPPORTED_FEATURE_CONTROL);
+
+ if (!msr->host_initiated &&
+ (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
+ return false;
+
+ if (msr->host_initiated)
+ valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
+ else
+ valid_bits = vmx->msr_ia32_feature_control_valid_bits;
- return !(val & ~valid_bits);
+ return !(msr->data & ~valid_bits);
}
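[Annotation, not part of the diff] is_vmx_feature_control_msr_valid() above validates writes against two masks: host-initiated writes get the full KVM_SUPPORTED_FEATURE_CONTROL set, while guest writes get the per-vCPU valid bits and are refused outright once the lock bit is set. A compilable sketch of that check, with hypothetical DEMO_* bits standing in for the FEAT_CTL_* flags:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical control bits, loosely mirroring the FEAT_CTL_* set above. */
#define DEMO_CTL_LOCKED     (1ULL << 0)
#define DEMO_CTL_FEATURE_A  (1ULL << 1)
#define DEMO_CTL_FEATURE_B  (1ULL << 2)

#define DEMO_SUPPORTED_CTL  (DEMO_CTL_LOCKED | DEMO_CTL_FEATURE_A | DEMO_CTL_FEATURE_B)

struct demo_vcpu {
	uint64_t ctl;             /* current MSR value */
	uint64_t ctl_valid_bits;  /* bits the guest may set, per guest CPUID */
};

static bool demo_ctl_write_valid(const struct demo_vcpu *v, uint64_t data,
				 bool host_initiated)
{
	uint64_t valid_bits;

	/* Guest writes are rejected outright once the lock bit is set. */
	if (!host_initiated && (v->ctl & DEMO_CTL_LOCKED))
		return false;

	/*
	 * Userspace may set anything the hypervisor supports; the guest only
	 * what its CPUID-derived valid bits allow.
	 */
	valid_bits = host_initiated ? DEMO_SUPPORTED_CTL : v->ctl_valid_bits;

	return !(data & ~valid_bits);
}

int main(void)
{
	struct demo_vcpu v = {
		.ctl = DEMO_CTL_LOCKED | DEMO_CTL_FEATURE_A,
		.ctl_valid_bits = DEMO_CTL_LOCKED | DEMO_CTL_FEATURE_A,
	};

	printf("guest write after lock: %d\n",
	       demo_ctl_write_valid(&v, DEMO_CTL_FEATURE_A, false));  /* 0 */
	printf("host write, supported bits: %d\n",
	       demo_ctl_write_valid(&v, DEMO_SUPPORTED_CTL, true));   /* 1 */
	printf("host write, unsupported bit: %d\n",
	       demo_ctl_write_valid(&v, 1ULL << 40, true));           /* 0 */
	return 0;
}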
static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
@@ -1849,9 +1881,6 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
if (!nested)
return 1;
return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
- case MSR_IA32_PERF_CAPABILITIES:
- msr->data = vmx_get_perf_capabilities();
- return 0;
default:
return KVM_MSR_RET_INVALID;
}
@@ -2029,7 +2058,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
(host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
- if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) &&
+ if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
(host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
@@ -2241,10 +2270,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.mcg_ext_ctl = data;
break;
case MSR_IA32_FEAT_CTL:
- if (!vmx_feature_control_msr_valid(vcpu, data) ||
- (to_vmx(vcpu)->msr_ia32_feature_control &
- FEAT_CTL_LOCKED && !msr_info->host_initiated))
+ if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
return 1;
+
vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
@@ -2342,14 +2370,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
if (data & PMU_CAP_LBR_FMT) {
if ((data & PMU_CAP_LBR_FMT) !=
- (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
+ (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
return 1;
if (!cpuid_model_is_consistent(vcpu))
return 1;
}
if (data & PERF_CAP_PEBS_FORMAT) {
if ((data & PERF_CAP_PEBS_MASK) !=
- (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK))
+ (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
return 1;
if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
return 1;
@@ -6844,6 +6872,8 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ return false;
/*
* We cannot do SMM unless we can run the guest in big
* real mode.
@@ -7669,6 +7699,31 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx_update_exception_bitmap(vcpu);
}
+static u64 vmx_get_perf_capabilities(void)
+{
+ u64 perf_cap = PMU_CAP_FW_WRITES;
+ struct x86_pmu_lbr lbr;
+ u64 host_perf_cap = 0;
+
+ if (!enable_pmu)
+ return 0;
+
+ if (boot_cpu_has(X86_FEATURE_PDCM))
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
+
+ x86_perf_get_lbr(&lbr);
+ if (lbr.nr)
+ perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+
+ if (vmx_pebs_supported()) {
+ perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
+ if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
+ perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+ }
+
+ return perf_cap;
+}
+
static __init void vmx_set_cpu_caps(void)
{
kvm_set_cpu_caps();
@@ -7691,6 +7746,7 @@ static __init void vmx_set_cpu_caps(void)
if (!enable_pmu)
kvm_cpu_cap_clear(X86_FEATURE_PDCM);
+ kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
if (!enable_sgx) {
kvm_cpu_cap_clear(X86_FEATURE_SGX);
@@ -7906,6 +7962,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
~FEAT_CTL_LMCE_ENABLED;
}
+#ifdef CONFIG_KVM_SMM
static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
/* we need a nested vmexit to enter SMM, postpone if run is pending */
@@ -7914,7 +7971,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !is_smm(vcpu);
}
-static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7935,7 +7992,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
return 0;
}
-static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int ret;
@@ -7960,6 +8017,7 @@ static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
/* RSM will cause a vmexit anyway. */
}
+#endif
static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
@@ -8127,10 +8185,12 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.setup_mce = vmx_setup_mce,
+#ifdef CONFIG_KVM_SMM
.smi_allowed = vmx_smi_allowed,
.enter_smm = vmx_enter_smm,
.leave_smm = vmx_leave_smm,
.enable_smi_window = vmx_enable_smi_window,
+#endif
.can_emulate_instruction = vmx_can_emulate_instruction,
.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
@@ -8490,8 +8550,8 @@ static int __init vmx_init(void)
}
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_direct_tlbflush
- = hv_enable_direct_tlbflush;
+ vmx_x86_ops.enable_l2_tlb_flush
+ = hv_enable_l2_tlb_flush;
} else {
enlightened_vmcs = false;
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index ec268df83ed6..842dc898c972 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -6,19 +6,33 @@
#include <asm/vmx.h>
-#include "evmcs.h"
+#include "hyperv.h"
#include "vmcs.h"
#include "../x86.h"
void vmread_error(unsigned long field, bool fault);
-__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
- bool fault);
void vmwrite_error(unsigned long field, unsigned long value);
void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+/*
+ * The VMREAD error trampoline _always_ uses the stack to pass parameters, even
+ * for 64-bit targets. Preserving all registers allows the VMREAD inline asm
+ * blob to avoid clobbering GPRs, which in turn allows the compiler to better
+ * optimize sequences of VMREADs.
+ *
+ * Declare the trampoline as an opaque label as it's not safe to call from C
+ * code; there is no way to tell the compiler to pass params on the stack for
+ * 64-bit targets.
+ *
+ * void vmread_error_trampoline(unsigned long field, bool fault);
+ */
+extern unsigned long vmread_error_trampoline;
+#endif
+
static __always_inline void vmcs_check16(unsigned long field)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 490ec23c8450..312aea1854ae 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -30,6 +30,7 @@
#include "hyperv.h"
#include "lapic.h"
#include "xen.h"
+#include "smm.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -119,8 +120,6 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
-static void process_smi(struct kvm_vcpu *vcpu);
-static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
@@ -464,7 +463,6 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
return vcpu->arch.apic_base;
}
-EXPORT_SYMBOL_GPL(kvm_get_apic_base);
enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
{
@@ -492,7 +490,6 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
kvm_recalculate_apic_map(vcpu->kvm);
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_apic_base);
/*
* Handle a fault on a hardware virtualization (VMX or SVM) instruction.
@@ -628,6 +625,12 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto
ex->payload = payload;
}
+/* Forcibly leave the nested mode in cases like a vCPU reset */
+static void kvm_leave_nested(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
+}
+
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
unsigned nr, bool has_error, u32 error_code,
bool has_payload, unsigned long payload, bool reinject)
@@ -777,7 +780,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
fault->address);
}
-EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
@@ -806,7 +808,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu)
atomic_inc(&vcpu->arch.nmi_queued);
kvm_make_request(KVM_REQ_NMI, vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_inject_nmi);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
@@ -831,7 +832,6 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
return false;
}
-EXPORT_SYMBOL_GPL(kvm_require_cpl);
bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
{
@@ -1648,6 +1648,9 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
case MSR_IA32_ARCH_CAPABILITIES:
msr->data = kvm_get_arch_capabilities();
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ msr->data = kvm_caps.supported_perf_cap;
+ break;
case MSR_IA32_UCODE_REV:
rdmsrl_safe(msr->index, &msr->data);
break;
@@ -2061,7 +2064,6 @@ int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
{
return kvm_skip_emulated_instruction(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_emulate_as_nop);
int kvm_emulate_invd(struct kvm_vcpu *vcpu)
{
@@ -2309,13 +2311,11 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
/* we verify if the enable bit is set... */
- if (system_time & 1) {
- kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
- KVM_HOST_USES_PFN, system_time & ~1ULL,
+ if (system_time & 1)
+ kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
sizeof(struct pvclock_vcpu_time_info));
- } else {
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
- }
+ else
+ kvm_gpc_deactivate(&vcpu->arch.pv_time);
return;
}
@@ -2507,7 +2507,6 @@ u64 kvm_scale_tsc(u64 tsc, u64 ratio)
return _tsc;
}
-EXPORT_SYMBOL_GPL(kvm_scale_tsc);
static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
@@ -2966,6 +2965,22 @@ static void kvm_update_masterclock(struct kvm *kvm)
kvm_end_pvclock_update(kvm);
}
+/*
+ * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's
+ * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz
+ * can change during boot even if the TSC is constant, as it's possible for KVM
+ * to be loaded before TSC calibration completes. Ideally, KVM would get a
+ * notification when calibration completes, but practically speaking calibration
+ * will complete before userspace is alive enough to create VMs.
+ */
+static unsigned long get_cpu_tsc_khz(void)
+{
+ if (static_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ return tsc_khz;
+ else
+ return __this_cpu_read(cpu_tsc_khz);
+}
+
/* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */
static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
{
@@ -2976,7 +2991,8 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
get_cpu();
data->flags = 0;
- if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) {
+ if (ka->use_master_clock &&
+ (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) {
#ifdef CONFIG_X86_64
struct timespec64 ts;
@@ -2990,7 +3006,7 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
data->flags |= KVM_CLOCK_TSC_STABLE;
hv_clock.tsc_timestamp = ka->master_cycle_now;
hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
- kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
+ kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL,
&hv_clock.tsc_shift,
&hv_clock.tsc_to_system_mul);
data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
@@ -3029,12 +3045,10 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
unsigned long flags;
read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
- offset + sizeof(*guest_hv_clock))) {
+ while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
read_unlock_irqrestore(&gpc->lock, flags);
- if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
- offset + sizeof(*guest_hv_clock)))
+ if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
return;
read_lock_irqsave(&gpc->lock, flags);
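[Annotation, not part of the diff] The kvm_setup_guest_pvclock() hunk above keeps the existing check/refresh retry loop, now spelled kvm_gpc_check()/kvm_gpc_refresh(): validate the cached mapping under the read lock, drop the lock to refresh (which may sleep and, in the real helper, serializes internally), then re-take the lock and re-check. Below is a hypothetical userspace sketch of that loop using a pthread rwlock; all demo_* names are invented for the example.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for a gfn_to_pfn cache. */
struct demo_cache {
	pthread_rwlock_t lock;
	bool valid;
	int generation;
};

static bool demo_check(struct demo_cache *c)
{
	return c->valid;	/* stand-in for kvm_gpc_check() */
}

static int demo_refresh(struct demo_cache *c)
{
	/* Stand-in for kvm_gpc_refresh(); the real helper serializes internally. */
	c->valid = true;
	c->generation++;
	return 0;
}

static void demo_use(struct demo_cache *c)
{
	pthread_rwlock_rdlock(&c->lock);
	while (!demo_check(c)) {
		/* Refreshing may sleep, so drop the read lock first. */
		pthread_rwlock_unlock(&c->lock);

		if (demo_refresh(c))
			return;

		/*
		 * Re-take the lock and re-check: the mapping may have been
		 * invalidated again before the lock was reacquired.
		 */
		pthread_rwlock_rdlock(&c->lock);
	}

	printf("using mapping, generation %d\n", c->generation);
	pthread_rwlock_unlock(&c->lock);
}

int main(void)
{
	struct demo_cache c = { .lock = PTHREAD_RWLOCK_INITIALIZER };

	demo_use(&c);
	return 0;
}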
@@ -3100,7 +3114,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
- tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+ tgt_tsc_khz = get_cpu_tsc_khz();
if (unlikely(tgt_tsc_khz == 0)) {
local_irq_restore(flags);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -3383,7 +3397,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
+ kvm_gpc_deactivate(&vcpu->arch.pv_time);
vcpu->arch.time = 0;
}
@@ -3391,6 +3405,9 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
static_call(kvm_x86_flush_tlb_all)(vcpu);
+
+ /* Flushing all ASIDs flushes the current ASID... */
+ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
@@ -3409,6 +3426,12 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
}
static_call(kvm_x86_flush_tlb_guest)(vcpu);
+
+ /*
+ * Flushing all "guest" TLB is always a superset of Hyper-V's fine
+ * grained flushing.
+ */
+ kvm_hv_vcpu_purge_flush_tlb(vcpu);
}
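[Annotation, not part of the diff] The two hunks above encode a subsumption rule: a full TLB flush clears any still-pending KVM_REQ_TLB_FLUSH_CURRENT, and a full guest-TLB flush purges Hyper-V's fine-grained flush FIFO, because the larger operation is a superset of the smaller one. A toy sketch of dropping a subsumed request, with hypothetical REQ_* bits:

#include <stdint.h>
#include <stdio.h>

#define REQ_FLUSH_ALL      (1u << 0)
#define REQ_FLUSH_CURRENT  (1u << 1)

static void demo_service_flush_requests(uint32_t *pending)
{
	if (*pending & REQ_FLUSH_ALL) {
		*pending &= ~REQ_FLUSH_ALL;
		/* ...flush every context here... */

		/*
		 * Flushing everything subsumes a "current context only"
		 * flush, so drop that request instead of doing it twice.
		 */
		*pending &= ~REQ_FLUSH_CURRENT;
	}

	if (*pending & REQ_FLUSH_CURRENT) {
		*pending &= ~REQ_FLUSH_CURRENT;
		/* ...flush only the current context here... */
	}
}

int main(void)
{
	uint32_t pending = REQ_FLUSH_ALL | REQ_FLUSH_CURRENT;

	demo_service_flush_requests(&pending);
	printf("requests left pending: 0x%x\n", pending);	/* 0x0 */
	return 0;
}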
@@ -3560,20 +3583,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.arch_capabilities = data;
break;
- case MSR_IA32_PERF_CAPABILITIES: {
- struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
-
+ case MSR_IA32_PERF_CAPABILITIES:
if (!msr_info->host_initiated)
return 1;
- if (kvm_get_msr_feature(&msr_ent))
- return 1;
- if (data & ~msr_ent.data)
+ if (data & ~kvm_caps.supported_perf_cap)
return 1;
vcpu->arch.perf_capabilities = data;
kvm_pmu_refresh(vcpu);
return 0;
- }
case MSR_EFER:
return set_efer(vcpu, msr_info);
case MSR_K7_HWCR:
@@ -3645,7 +3663,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
}
case MSR_IA32_SMBASE:
- if (!msr_info->host_initiated)
+ if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
return 1;
vcpu->arch.smbase = data;
break;
@@ -4061,7 +4079,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.ia32_misc_enable_msr;
break;
case MSR_IA32_SMBASE:
- if (!msr_info->host_initiated)
+ if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
return 1;
msr_info->data = vcpu->arch.smbase;
break;
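[Annotation, not part of the diff] The MSR_IA32_SMBASE hunks above (and the KVM_CAP_X86_SMM one below) gate the code on IS_ENABLED(CONFIG_KVM_SMM) rather than #ifdef, so the disabled branch is still parsed and type-checked and is then dropped as dead code. A tiny sketch of the idiom; DEMO_IS_ENABLED() and the SMBASE value are assumptions, and the stand-in ignores the kernel's =y/=m config handling.

#include <stdbool.h>
#include <stdio.h>

/* Minimal stand-in for the kernel's IS_ENABLED(): a macro that is 0 or 1. */
#define DEMO_CONFIG_SMM 1
#define DEMO_IS_ENABLED(x) (x)

static int demo_get_smbase_msr(bool host_initiated, unsigned long *data)
{
	/*
	 * Unlike #ifdef, the disabled branch stays visible to the compiler,
	 * which type-checks it and then eliminates it when the config is off.
	 */
	if (!DEMO_IS_ENABLED(DEMO_CONFIG_SMM) || !host_initiated)
		return 1;	/* reject: feature compiled out or not a host access */

	*data = 0x30000;	/* hypothetical SMBASE value */
	return 0;
}

int main(void)
{
	unsigned long smbase = 0;

	if (!demo_get_smbase_msr(true, &smbase))
		printf("SMBASE = 0x%lx\n", smbase);
	else
		printf("SMBASE access rejected\n");
	return 0;
}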
@@ -4419,7 +4437,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
if (sched_info_on())
- r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
+ r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
+ KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
break;
#endif
case KVM_CAP_SYNC_REGS:
@@ -4435,6 +4454,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r |= KVM_X86_DISABLE_EXITS_MWAIT;
break;
case KVM_CAP_X86_SMM:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ break;
+
/* SMBASE is usually relocated above 1M on modern chipsets,
* and SMM handlers might indeed rely on 4G segment limits,
* so do not report SMM to be available if real mode is
@@ -4475,7 +4497,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
break;
case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
- r = kvm_x86_ops.enable_direct_tlbflush != NULL;
+ r = kvm_x86_ops.enable_l2_tlb_flush != NULL;
break;
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
@@ -4891,13 +4913,6 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
return 0;
}
-static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
-{
- kvm_make_request(KVM_REQ_SMI, vcpu);
-
- return 0;
-}
-
static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
struct kvm_tpr_access_ctl *tac)
{
@@ -5033,8 +5048,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
process_nmi(vcpu);
+#ifdef CONFIG_KVM_SMM
if (kvm_check_request(KVM_REQ_SMI, vcpu))
process_smi(vcpu);
+#endif
/*
* KVM's ABI only allows for one exception to be migrated. Luckily,
@@ -5062,16 +5079,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
ex->pending && ex->has_payload)
kvm_deliver_exception_payload(vcpu, ex);
+ memset(events, 0, sizeof(*events));
+
/*
* The API doesn't provide the instruction length for software
* exceptions, so don't report them. As long as the guest RIP
* isn't advanced, we should expect to encounter the exception
* again.
*/
- if (kvm_exception_is_soft(ex->vector)) {
- events->exception.injected = 0;
- events->exception.pending = 0;
- } else {
+ if (!kvm_exception_is_soft(ex->vector)) {
events->exception.injected = ex->injected;
events->exception.pending = ex->pending;
/*
@@ -5091,20 +5107,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->interrupt.injected =
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
- events->interrupt.soft = 0;
events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending != 0;
events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
- events->nmi.pad = 0;
- events->sipi_vector = 0; /* never valid when reporting to user space */
+ /* events->sipi_vector is never valid when reporting to user space */
+#ifdef CONFIG_KVM_SMM
events->smi.smm = is_smm(vcpu);
events->smi.pending = vcpu->arch.smi_pending;
events->smi.smm_inside_nmi =
!!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
+#endif
events->smi.latched_init = kvm_lapic_latched_init(vcpu);
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
@@ -5116,12 +5132,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
}
-
- memset(&events->reserved, 0, sizeof(events->reserved));
}
-static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm);
-
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
@@ -5194,8 +5206,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+#ifdef CONFIG_KVM_SMM
if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
- kvm_x86_ops.nested_ops->leave_nested(vcpu);
+ kvm_leave_nested(vcpu);
kvm_smm_changed(vcpu, events->smi.smm);
}
@@ -5208,6 +5221,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
}
+#else
+ if (events->smi.smm || events->smi.pending ||
+ events->smi.smm_inside_nmi)
+ return -EINVAL;
+#endif
+
if (lapic_in_kernel(vcpu)) {
if (events->smi.latched_init)
set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
@@ -5491,10 +5510,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
}
return r;
case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
- if (!kvm_x86_ops.enable_direct_tlbflush)
+ if (!kvm_x86_ops.enable_l2_tlb_flush)
return -ENOTTY;
- return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
+ return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu);
case KVM_CAP_HYPERV_ENFORCE_CPUID:
return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
@@ -5574,7 +5593,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SMI: {
- r = kvm_vcpu_ioctl_smi(vcpu);
+ r = kvm_inject_smi(vcpu);
break;
}
case KVM_SET_CPUID: {
@@ -6233,9 +6252,7 @@ split_irqchip_unlock:
break;
case KVM_CAP_X86_USER_SPACE_MSR:
r = -EINVAL;
- if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL |
- KVM_MSR_EXIT_REASON_UNKNOWN |
- KVM_MSR_EXIT_REASON_FILTER))
+ if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK)
break;
kvm->arch.user_space_msr_mask = cap->args[0];
r = 0;
@@ -6412,7 +6429,7 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
if (!user_range->nmsrs)
return 0;
- if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE))
+ if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
return -EINVAL;
if (!user_range->flags)
@@ -6446,7 +6463,7 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
int r = 0;
u32 i;
- if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY)
+ if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
@@ -7119,8 +7136,8 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
return handled;
}
-static void kvm_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
+void kvm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
{
static_call(kvm_x86_set_segment)(vcpu, var, seg);
}
@@ -7156,16 +7173,6 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
}
EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
- gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
-{
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
-
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
- access |= PFERR_FETCH_MASK;
- return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
-}
-
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
@@ -7278,15 +7285,6 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
}
-static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
- unsigned long addr, void *val, unsigned int bytes)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
-
- return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
-}
-
static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u64 access,
struct x86_exception *exception)
@@ -8078,26 +8076,6 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
}
-static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
- u32 msr_index, u64 data)
-{
- return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
-}
-
-static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
- return vcpu->arch.smbase;
-}
-
-static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
- vcpu->arch.smbase = smbase;
-}
-
static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc)
{
@@ -8172,18 +8150,13 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
return emul_to_vcpu(ctxt)->arch.hflags;
}
-static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt)
+#ifndef CONFIG_KVM_SMM
+static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
- kvm_smm_changed(vcpu, false);
-}
-
-static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt,
- const char *smstate)
-{
- return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate);
+ WARN_ON_ONCE(1);
+ return X86EMUL_UNHANDLEABLE;
}
+#endif
static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
{
@@ -8209,7 +8182,6 @@ static const struct x86_emulate_ops emulate_ops = {
.write_gpr = emulator_write_gpr,
.read_std = emulator_read_std,
.write_std = emulator_write_std,
- .read_phys = kvm_read_guest_phys_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
@@ -8229,11 +8201,8 @@ static const struct x86_emulate_ops emulate_ops = {
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
- .get_smbase = emulator_get_smbase,
- .set_smbase = emulator_set_smbase,
.set_msr_with_filter = emulator_set_msr_with_filter,
.get_msr_with_filter = emulator_get_msr_with_filter,
- .set_msr = emulator_set_msr,
.get_msr = emulator_get_msr,
.check_pmc = emulator_check_pmc,
.read_pmc = emulator_read_pmc,
@@ -8248,7 +8217,6 @@ static const struct x86_emulate_ops emulate_ops = {
.guest_has_rdpid = emulator_guest_has_rdpid,
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
- .exiting_smm = emulator_exiting_smm,
.leave_smm = emulator_leave_smm,
.triple_fault = emulator_triple_fault,
.set_xcr = emulator_set_xcr,
@@ -8321,8 +8289,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
- BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
- BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
ctxt->interruptibility = 0;
ctxt->have_exception = false;
@@ -8581,29 +8547,6 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
static int complete_emulated_pio(struct kvm_vcpu *vcpu);
-static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
-{
- trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
-
- if (entering_smm) {
- vcpu->arch.hflags |= HF_SMM_MASK;
- } else {
- vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
-
- /* Process a latched INIT or SMI, if any. */
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
- /*
- * Even if KVM_SET_SREGS2 loaded PDPTRs out of band,
- * on SMM exit we still need to reload them from
- * guest memory
- */
- vcpu->arch.pdptrs_from_userspace = false;
- }
-
- kvm_mmu_reset_context(vcpu);
-}
-
static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
unsigned long *db)
{
@@ -8835,7 +8778,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
write_fault_to_spt,
emulation_type))
return 1;
- if (ctxt->have_exception) {
+
+ if (ctxt->have_exception &&
+ !(emulation_type & EMULTYPE_SKIP)) {
/*
* #UD should result in just EMULATION_FAILED, and trap-like
* exception should not be encountered during decode.
@@ -9099,9 +9044,11 @@ static void tsc_khz_changed(void *data)
struct cpufreq_freqs *freq = data;
unsigned long khz = 0;
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC));
+
if (data)
khz = freq->new;
- else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ else
khz = cpufreq_quick_get(raw_smp_processor_id());
if (!khz)
khz = tsc_khz;
@@ -9122,8 +9069,10 @@ static void kvm_hyperv_tsc_notifier(void)
hyperv_stop_tsc_emulation();
/* TSC frequency always matches when on Hyper-V */
- for_each_present_cpu(cpu)
- per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ for_each_present_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ }
kvm_caps.max_guest_tsc_khz = tsc_khz;
list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -9260,10 +9209,10 @@ static void kvm_timer_init(void)
}
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
- }
- cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
- kvmclock_cpu_online, kvmclock_cpu_down_prep);
+ cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
+ kvmclock_cpu_online, kvmclock_cpu_down_prep);
+ }
}
#ifdef CONFIG_X86_64
@@ -9423,10 +9372,11 @@ void kvm_arch_exit(void)
#endif
kvm_lapic_exit();
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
- cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
+ cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
+ }
#ifdef CONFIG_X86_64
pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
irq_work_sync(&pvclock_irq_work);
@@ -9805,7 +9755,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
int kvm_check_nested_events(struct kvm_vcpu *vcpu)
{
- if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
kvm_x86_ops.nested_ops->triple_fault(vcpu);
return 1;
}
@@ -9993,6 +9943,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
* in order to make progress and get back here for another iteration.
* The kvm_x86_ops hooks communicate this by returning -EBUSY.
*/
+#ifdef CONFIG_KVM_SMM
if (vcpu->arch.smi_pending) {
r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
@@ -10005,6 +9956,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
} else
static_call(kvm_x86_enable_smi_window)(vcpu);
}
+#endif
if (vcpu->arch.nmi_pending) {
r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
@@ -10080,246 +10032,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
-static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
-{
- u32 flags = 0;
- flags |= seg->g << 23;
- flags |= seg->db << 22;
- flags |= seg->l << 21;
- flags |= seg->avl << 20;
- flags |= seg->present << 15;
- flags |= seg->dpl << 13;
- flags |= seg->s << 12;
- flags |= seg->type << 8;
- return flags;
-}
-
-static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
-{
- struct kvm_segment seg;
- int offset;
-
- kvm_get_segment(vcpu, &seg, n);
- put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
-
- if (n < 3)
- offset = 0x7f84 + n * 12;
- else
- offset = 0x7f2c + (n - 3) * 12;
-
- put_smstate(u32, buf, offset + 8, seg.base);
- put_smstate(u32, buf, offset + 4, seg.limit);
- put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
-}
-
-#ifdef CONFIG_X86_64
-static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
-{
- struct kvm_segment seg;
- int offset;
- u16 flags;
-
- kvm_get_segment(vcpu, &seg, n);
- offset = 0x7e00 + n * 16;
-
- flags = enter_smm_get_segment_flags(&seg) >> 8;
- put_smstate(u16, buf, offset, seg.selector);
- put_smstate(u16, buf, offset + 2, flags);
- put_smstate(u32, buf, offset + 4, seg.limit);
- put_smstate(u64, buf, offset + 8, seg.base);
-}
-#endif
-
-static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
-{
- struct desc_ptr dt;
- struct kvm_segment seg;
- unsigned long val;
- int i;
-
- put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
- put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
- put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
- put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
-
- for (i = 0; i < 8; i++)
- put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i));
-
- kvm_get_dr(vcpu, 6, &val);
- put_smstate(u32, buf, 0x7fcc, (u32)val);
- kvm_get_dr(vcpu, 7, &val);
- put_smstate(u32, buf, 0x7fc8, (u32)val);
-
- kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
- put_smstate(u32, buf, 0x7fc4, seg.selector);
- put_smstate(u32, buf, 0x7f64, seg.base);
- put_smstate(u32, buf, 0x7f60, seg.limit);
- put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
-
- kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
- put_smstate(u32, buf, 0x7fc0, seg.selector);
- put_smstate(u32, buf, 0x7f80, seg.base);
- put_smstate(u32, buf, 0x7f7c, seg.limit);
- put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
-
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
- put_smstate(u32, buf, 0x7f74, dt.address);
- put_smstate(u32, buf, 0x7f70, dt.size);
-
- static_call(kvm_x86_get_idt)(vcpu, &dt);
- put_smstate(u32, buf, 0x7f58, dt.address);
- put_smstate(u32, buf, 0x7f54, dt.size);
-
- for (i = 0; i < 6; i++)
- enter_smm_save_seg_32(vcpu, buf, i);
-
- put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
-
- /* revision id */
- put_smstate(u32, buf, 0x7efc, 0x00020000);
- put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
-}
-
-#ifdef CONFIG_X86_64
-static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
-{
- struct desc_ptr dt;
- struct kvm_segment seg;
- unsigned long val;
- int i;
-
- for (i = 0; i < 16; i++)
- put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i));
-
- put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
- put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
-
- kvm_get_dr(vcpu, 6, &val);
- put_smstate(u64, buf, 0x7f68, val);
- kvm_get_dr(vcpu, 7, &val);
- put_smstate(u64, buf, 0x7f60, val);
-
- put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
- put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
- put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
-
- put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
-
- /* revision id */
- put_smstate(u32, buf, 0x7efc, 0x00020064);
-
- put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
-
- kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
- put_smstate(u16, buf, 0x7e90, seg.selector);
- put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
- put_smstate(u32, buf, 0x7e94, seg.limit);
- put_smstate(u64, buf, 0x7e98, seg.base);
-
- static_call(kvm_x86_get_idt)(vcpu, &dt);
- put_smstate(u32, buf, 0x7e84, dt.size);
- put_smstate(u64, buf, 0x7e88, dt.address);
-
- kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
- put_smstate(u16, buf, 0x7e70, seg.selector);
- put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
- put_smstate(u32, buf, 0x7e74, seg.limit);
- put_smstate(u64, buf, 0x7e78, seg.base);
-
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
- put_smstate(u32, buf, 0x7e64, dt.size);
- put_smstate(u64, buf, 0x7e68, dt.address);
-
- for (i = 0; i < 6; i++)
- enter_smm_save_seg_64(vcpu, buf, i);
-}
-#endif
-
-static void enter_smm(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment cs, ds;
- struct desc_ptr dt;
- unsigned long cr0;
- char buf[512];
-
- memset(buf, 0, 512);
-#ifdef CONFIG_X86_64
- if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- enter_smm_save_state_64(vcpu, buf);
- else
-#endif
- enter_smm_save_state_32(vcpu, buf);
-
- /*
- * Give enter_smm() a chance to make ISA-specific changes to the vCPU
- * state (e.g. leave guest mode) after we've saved the state into the
- * SMM state-save area.
- */
- static_call(kvm_x86_enter_smm)(vcpu, buf);
-
- kvm_smm_changed(vcpu, true);
- kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
-
- if (static_call(kvm_x86_get_nmi_mask)(vcpu))
- vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
- else
- static_call(kvm_x86_set_nmi_mask)(vcpu, true);
-
- kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
- kvm_rip_write(vcpu, 0x8000);
-
- cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
- static_call(kvm_x86_set_cr0)(vcpu, cr0);
- vcpu->arch.cr0 = cr0;
-
- static_call(kvm_x86_set_cr4)(vcpu, 0);
-
- /* Undocumented: IDT limit is set to zero on entry to SMM. */
- dt.address = dt.size = 0;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
-
- kvm_set_dr(vcpu, 7, DR7_FIXED_1);
-
- cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
- cs.base = vcpu->arch.smbase;
-
- ds.selector = 0;
- ds.base = 0;
-
- cs.limit = ds.limit = 0xffffffff;
- cs.type = ds.type = 0x3;
- cs.dpl = ds.dpl = 0;
- cs.db = ds.db = 0;
- cs.s = ds.s = 1;
- cs.l = ds.l = 0;
- cs.g = ds.g = 1;
- cs.avl = ds.avl = 0;
- cs.present = ds.present = 1;
- cs.unusable = ds.unusable = 0;
- cs.padding = ds.padding = 0;
-
- kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
-
-#ifdef CONFIG_X86_64
- if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- static_call(kvm_x86_set_efer)(vcpu, 0);
-#endif
-
- kvm_update_cpuid_runtime(vcpu);
- kvm_mmu_reset_context(vcpu);
-}
-
-static void process_smi(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.smi_pending = true;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-}
-
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
unsigned long *vcpu_bitmap)
{
@@ -10510,20 +10222,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_immediate_exit = false;
- /* Forbid vmenter if vcpu dirty ring is soft-full */
- if (unlikely(vcpu->kvm->dirty_ring_size &&
- kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
- vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
- trace_kvm_dirty_ring_exit(vcpu);
- r = 0;
- goto out;
- }
-
if (kvm_request_pending(vcpu)) {
if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
r = -EIO;
goto out;
}
+
+ if (kvm_dirty_ring_check_request(vcpu)) {
+ r = 0;
+ goto out;
+ }
+
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
r = 0;
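Aside: the open-coded soft-full test deleted above moves behind kvm_dirty_ring_check_request(), now evaluated only when requests are pending. A sketch of the equivalent logic, reconstructed from the deleted lines (the real helper lives in the generic dirty-ring code and may differ in detail):

    /* Sketch only, based on the removed open-coded check. */
    static bool dirty_ring_soft_full_exit(struct kvm_vcpu *vcpu)
    {
            if (!vcpu->kvm->dirty_ring_size ||
                !kvm_dirty_ring_soft_full(&vcpu->dirty_ring))
                    return false;

            vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
            trace_kvm_dirty_ring_exit(vcpu);
            return true;    /* caller bails out of vcpu_enter_guest() with r = 0 */
    }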
@@ -10547,23 +10256,37 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_mmu_sync_roots(vcpu);
if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
kvm_mmu_load_pgd(vcpu);
- if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
+
+ /*
+ * Note, the order matters here, as flushing "all" TLB entries
+ * also flushes the "current" TLB entries, i.e. servicing the
+ * flush "all" will clear any request to flush "current".
+ */
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
kvm_vcpu_flush_tlb_all(vcpu);
- /* Flushing all ASIDs flushes the current ASID... */
- kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- }
kvm_service_local_tlb_flush_requests(vcpu);
+ /*
+ * Fall back to a "full" guest flush if Hyper-V's precise
+ * flushing fails. Note, Hyper-V's flushing is per-vCPU, but
+ * the flushes are considered "remote" and not "local" because
+ * the requests can be initiated from other vCPUs.
+ */
+ if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) &&
+ kvm_hv_vcpu_flush_tlb(vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
goto out;
}
- if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
- if (is_guest_mode(vcpu)) {
+ if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ if (is_guest_mode(vcpu))
kvm_x86_ops.nested_ops->triple_fault(vcpu);
- } else {
+
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
vcpu->mmio_needed = 0;
r = 0;
@@ -10578,8 +10301,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
record_steal_time(vcpu);
+#ifdef CONFIG_KVM_SMM
if (kvm_check_request(KVM_REQ_SMI, vcpu))
process_smi(vcpu);
+#endif
if (kvm_check_request(KVM_REQ_NMI, vcpu))
process_nmi(vcpu);
if (kvm_check_request(KVM_REQ_PMU, vcpu))
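Aside: the triple-fault rework in this hunk hinges on the difference between kvm_test_request() and kvm_check_request(). A toy model of their semantics (not the kernel implementation) shows why the code now tests first, lets the nested handler consume the request, and only treats a still-pending request as a full shutdown:

    /* Toy model of the request helpers' semantics, not the kernel code:
     * testing only reads the bit, checking also clears it. */
    #include <stdbool.h>

    struct toy_vcpu { unsigned long requests; };

    static bool toy_test_request(int req, struct toy_vcpu *v)
    {
            return v->requests & (1ul << req);
    }

    static bool toy_check_request(int req, struct toy_vcpu *v)
    {
            if (!toy_test_request(req, v))
                    return false;
            v->requests &= ~(1ul << req);    /* consumed */
            return true;
    }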
@@ -11827,7 +11552,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
- kvm_gpc_init(&vcpu->arch.pv_time);
+ kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -11893,6 +11618,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
kvm_async_pf_hash_reset(vcpu);
+
+ vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
kvm_pmu_init(vcpu);
vcpu->arch.pending_external_vector = -1;
@@ -11997,8 +11724,18 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
WARN_ON_ONCE(!init_event &&
(old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
+ /*
+ * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
+ * possible to INIT the vCPU while L2 is active. Force the vCPU back
+ * into L1 as EFER.SVME is cleared on INIT (along with all other EFER
+ * bits), i.e. virtualization is disabled.
+ */
+ if (is_guest_mode(vcpu))
+ kvm_leave_nested(vcpu);
+
kvm_lapic_reset(vcpu, init_event);
+ WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu));
vcpu->arch.hflags = 0;
vcpu->arch.smi_pending = 0;
@@ -12317,7 +12054,6 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
{
return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
{
@@ -12892,10 +12628,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
static_call(kvm_x86_nmi_allowed)(vcpu, false)))
return true;
+#ifdef CONFIG_KVM_SMM
if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
(vcpu->arch.smi_pending &&
static_call(kvm_x86_smi_allowed)(vcpu, false)))
return true;
+#endif
if (kvm_arch_interrupt_allowed(vcpu) &&
(kvm_cpu_has_interrupt(vcpu) ||
@@ -12936,7 +12674,9 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
return true;
if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+#ifdef CONFIG_KVM_SMM
kvm_test_request(KVM_REQ_SMI, vcpu) ||
+#endif
kvm_test_request(KVM_REQ_EVENT, vcpu))
return true;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 829d3134c1eb..9de72586f406 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -27,6 +27,7 @@ struct kvm_caps {
u64 supported_mce_cap;
u64 supported_xcr0;
u64 supported_xss;
+ u64 supported_perf_cap;
};
void kvm_spurious_fault(void);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 2dae413bd62a..d7af40240248 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -42,13 +42,12 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
int idx = srcu_read_lock(&kvm->srcu);
if (gfn == GPA_INVALID) {
- kvm_gpc_deactivate(kvm, gpc);
+ kvm_gpc_deactivate(gpc);
goto out;
}
do {
- ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa,
- PAGE_SIZE);
+ ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
if (ret)
goto out;
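Aside: every gfn_to_pfn cache call in this file shrinks because the kvm/vcpu/usage arguments now travel with the cache itself. Condensed from the call sites in this patch (not a new API proposal), the resulting usage pattern is:

    /* Condensed from the call sites in this patch. */
    kvm_gpc_init(gpc, kvm, NULL, KVM_HOST_USES_PFN);   /* once, at VM/vCPU init */
    kvm_gpc_activate(gpc, gpa, len);                   /* (re)target the cache  */

    read_lock_irqsave(&gpc->lock, flags);
    while (!kvm_gpc_check(gpc, len)) {                 /* still valid & mapped? */
            read_unlock_irqrestore(&gpc->lock, flags);
            if (kvm_gpc_refresh(gpc, len))             /* may sleep             */
                    return;
            read_lock_irqsave(&gpc->lock, flags);
    }
    /* ... access gpc->khva ... */
    read_unlock_irqrestore(&gpc->lock, flags);

    kvm_gpc_deactivate(gpc);                           /* at teardown           */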
@@ -170,112 +169,45 @@ static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
vcpu->arch.xen.timer.function = xen_timer_callback;
}
-static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
+static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
- u64 now = get_kvmclock_ns(v->kvm);
- u64 delta_ns = now - vx->runstate_entry_time;
- u64 run_delay = current->sched_info.run_delay;
-
- if (unlikely(!vx->runstate_entry_time))
- vx->current_runstate = RUNSTATE_offline;
-
- /*
- * Time waiting for the scheduler isn't "stolen" if the
- * vCPU wasn't running anyway.
- */
- if (vx->current_runstate == RUNSTATE_running) {
- u64 steal_ns = run_delay - vx->last_steal;
-
- delta_ns -= steal_ns;
-
- vx->runstate_times[RUNSTATE_runnable] += steal_ns;
- }
- vx->last_steal = run_delay;
-
- vx->runstate_times[vx->current_runstate] += delta_ns;
- vx->current_runstate = state;
- vx->runstate_entry_time = now;
-}
-
-void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
-{
- struct kvm_vcpu_xen *vx = &v->arch.xen;
- struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
- uint64_t *user_times;
+ struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
+ struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
+ size_t user_len, user_len1, user_len2;
+ struct vcpu_runstate_info rs;
unsigned long flags;
- size_t user_len;
- int *user_state;
-
- kvm_xen_update_runstate(v, state);
-
- if (!vx->runstate_cache.active)
- return;
-
- if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
- user_len = sizeof(struct vcpu_runstate_info);
- else
- user_len = sizeof(struct compat_vcpu_runstate_info);
-
- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
- user_len)) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- /* When invoked from kvm_sched_out() we cannot sleep */
- if (state == RUNSTATE_runnable)
- return;
-
- if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
- return;
-
- read_lock_irqsave(&gpc->lock, flags);
- }
+ size_t times_ofs;
+ uint8_t *update_bit = NULL;
+ uint64_t entry_time;
+ uint64_t *rs_times;
+ int *rs_state;
/*
* The only difference between 32-bit and 64-bit versions of the
- * runstate struct us the alignment of uint64_t in 32-bit, which
+ * runstate struct is the alignment of uint64_t in 32-bit, which
* means that the 64-bit version has an additional 4 bytes of
- * padding after the first field 'state'.
- *
- * So we use 'int __user *user_state' to point to the state field,
- * and 'uint64_t __user *user_times' for runstate_entry_time. So
- * the actual array of time[] in each state starts at user_times[1].
+ * padding after the first field 'state'. Let's be really really
+ * paranoid about that, and matching it with our internal data
+ * structures that we memcpy into it...
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
#ifdef CONFIG_X86_64
+ /*
+ * The 64-bit structure has 4 bytes of padding before 'state_entry_time'
+ * so each subsequent field is shifted by 4, and it's 4 bytes longer.
+ */
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
offsetof(struct compat_vcpu_runstate_info, time) + 4);
+ BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4);
#endif
-
- user_state = gpc->khva;
-
- if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
- user_times = gpc->khva + offsetof(struct vcpu_runstate_info,
- state_entry_time);
- else
- user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info,
- state_entry_time);
-
/*
- * First write the updated state_entry_time at the appropriate
- * location determined by 'offset'.
- */
- BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
- sizeof(user_times[0]));
- BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
- sizeof(user_times[0]));
-
- user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
- smp_wmb();
-
- /*
- * Next, write the new runstate. This is in the *same* place
- * for 32-bit and 64-bit guests, asserted here for paranoia.
+ * The state field is in the same place at the start of both structs,
+ * and is the same size (int) as vx->current_runstate.
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
offsetof(struct compat_vcpu_runstate_info, state));
@@ -284,34 +216,238 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));
- *user_state = vx->current_runstate;
+ /*
+ * The state_entry_time field is 64 bits in both versions, and the
+ * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
+ * is little-endian means that it's in the last *byte* of the word.
+ * That detail is important later.
+ */
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
+ sizeof(uint64_t));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
+ sizeof(uint64_t));
+ BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);
/*
- * Write the actual runstate times immediately after the
- * runstate_entry_time.
+ * The time array is four 64-bit quantities in both versions, matching
+ * the vx->runstate_times and immediately following state_entry_time.
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
- offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
+ offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
- offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
+ offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof_field(struct compat_vcpu_runstate_info, time));
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof(vx->runstate_times));
- memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
- smp_wmb();
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+ user_len = sizeof(struct vcpu_runstate_info);
+ times_ofs = offsetof(struct vcpu_runstate_info,
+ state_entry_time);
+ } else {
+ user_len = sizeof(struct compat_vcpu_runstate_info);
+ times_ofs = offsetof(struct compat_vcpu_runstate_info,
+ state_entry_time);
+ }
+
+ /*
+ * There are basically no alignment constraints. The guest can set it
+ * up so it crosses from one page to the next, and at arbitrary byte
+ * alignment (and the 32-bit ABI doesn't align the 64-bit integers
+ * anyway, even if the overall struct had been 64-bit aligned).
+ */
+ if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
+ user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
+ user_len2 = user_len - user_len1;
+ } else {
+ user_len1 = user_len;
+ user_len2 = 0;
+ }
+ BUG_ON(user_len1 + user_len2 != user_len);
+
+ retry:
+ /*
+ * Attempt to obtain the GPC lock on *both* (if there are two)
+ * gfn_to_pfn caches that cover the region.
+ */
+ read_lock_irqsave(&gpc1->lock, flags);
+ while (!kvm_gpc_check(gpc1, user_len1)) {
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ /* When invoked from kvm_sched_out() we cannot sleep */
+ if (atomic)
+ return;
+
+ if (kvm_gpc_refresh(gpc1, user_len1))
+ return;
+
+ read_lock_irqsave(&gpc1->lock, flags);
+ }
+
+ if (likely(!user_len2)) {
+ /*
+ * Set up three pointers directly to the runstate_info
+ * struct in the guest (via the GPC).
+ *
+ * • @rs_state → state field
+ * • @rs_times → state_entry_time field.
+ * • @update_bit → last byte of state_entry_time, which
+ * contains the XEN_RUNSTATE_UPDATE bit.
+ */
+ rs_state = gpc1->khva;
+ rs_times = gpc1->khva + times_ofs;
+ if (v->kvm->arch.xen.runstate_update_flag)
+ update_bit = ((void *)(&rs_times[1])) - 1;
+ } else {
+ /*
+ * The guest's runstate_info is split across two pages and we
+ * need to hold and validate both GPCs simultaneously. We can
+ * declare a lock ordering GPC1 > GPC2 because nothing else
+ * takes them more than one at a time.
+ */
+ read_lock(&gpc2->lock);
+
+ if (!kvm_gpc_check(gpc2, user_len2)) {
+ read_unlock(&gpc2->lock);
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ /* When invoked from kvm_sched_out() we cannot sleep */
+ if (atomic)
+ return;
+
+ /*
+ * Use kvm_gpc_activate() here because if the runstate
+ * area was configured in 32-bit mode and only extends
+ * to the second page now because the guest changed to
+ * 64-bit mode, the second GPC won't have been set up.
+ */
+ if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1,
+ user_len2))
+ return;
+
+ /*
+ * We dropped the lock on GPC1 so we have to go all the
+ * way back and revalidate that too.
+ */
+ goto retry;
+ }
+
+ /*
+ * In this case, the runstate_info struct will be assembled on
+ * the kernel stack (compat or not as appropriate) and will
+ * be copied to GPC1/GPC2 with a dual memcpy. Set up the three
+ * rs pointers accordingly.
+ */
+ rs_times = &rs.state_entry_time;
+
+ /*
+ * The rs_state pointer points to the start of what we'll
+ * copy to the guest, which in the case of a compat guest
+ * is the 32-bit field that the compiler thinks is padding.
+ */
+ rs_state = ((void *)rs_times) - times_ofs;
+
+ /*
+ * The update_bit is still directly in the guest memory,
+ * via one GPC or the other.
+ */
+ if (v->kvm->arch.xen.runstate_update_flag) {
+ if (user_len1 >= times_ofs + sizeof(uint64_t))
+ update_bit = gpc1->khva + times_ofs +
+ sizeof(uint64_t) - 1;
+ else
+ update_bit = gpc2->khva + times_ofs +
+ sizeof(uint64_t) - 1 - user_len1;
+ }
+
+#ifdef CONFIG_X86_64
+ /*
+ * Don't leak kernel memory through the padding in the 64-bit
+ * version of the struct.
+ */
+ memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time));
+#endif
+ }
+
+ /*
+ * First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the
+ * state_entry_time field, directly in the guest. We need to set
+ * that (and write-barrier) before writing to the rest of the
+ * structure, and clear it last. Just as Xen does, we address the
+ * single *byte* in which it resides because it might be in a
+ * different cache line to the rest of the 64-bit word, due to
+ * the (lack of) alignment constraints.
+ */
+ entry_time = vx->runstate_entry_time;
+ if (update_bit) {
+ entry_time |= XEN_RUNSTATE_UPDATE;
+ *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56;
+ smp_wmb();
+ }
/*
- * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
- * runstate_entry_time field.
+ * Now assemble the actual structure, either on our kernel stack
+ * or directly in the guest according to how the rs_state and
+ * rs_times pointers were set up above.
*/
- user_times[0] &= ~XEN_RUNSTATE_UPDATE;
+ *rs_state = vx->current_runstate;
+ rs_times[0] = entry_time;
+ memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
+
+ /* For the split case, we have to then copy it to the guest. */
+ if (user_len2) {
+ memcpy(gpc1->khva, rs_state, user_len1);
+ memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2);
+ }
smp_wmb();
- read_unlock_irqrestore(&gpc->lock, flags);
+ /* Finally, clear the XEN_RUNSTATE_UPDATE bit. */
+ if (update_bit) {
+ entry_time &= ~XEN_RUNSTATE_UPDATE;
+ *update_bit = entry_time >> 56;
+ smp_wmb();
+ }
- mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+ if (user_len2)
+ read_unlock(&gpc2->lock);
+
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
+ if (user_len2)
+ mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
+}
+
+void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
+{
+ struct kvm_vcpu_xen *vx = &v->arch.xen;
+ u64 now = get_kvmclock_ns(v->kvm);
+ u64 delta_ns = now - vx->runstate_entry_time;
+ u64 run_delay = current->sched_info.run_delay;
+
+ if (unlikely(!vx->runstate_entry_time))
+ vx->current_runstate = RUNSTATE_offline;
+
+ /*
+ * Time waiting for the scheduler isn't "stolen" if the
+ * vCPU wasn't running anyway.
+ */
+ if (vx->current_runstate == RUNSTATE_running) {
+ u64 steal_ns = run_delay - vx->last_steal;
+
+ delta_ns -= steal_ns;
+
+ vx->runstate_times[RUNSTATE_runnable] += steal_ns;
+ }
+ vx->last_steal = run_delay;
+
+ vx->runstate_times[vx->current_runstate] += delta_ns;
+ vx->current_runstate = state;
+ vx->runstate_entry_time = now;
+
+ if (vx->runstate_cache.active)
+ kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
}
static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
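Aside: two arithmetic details above deserve spelling out. The runstate area may straddle a page boundary, so the copy is split into user_len1/user_len2 at the page edge; and because x86 is little-endian and XEN_RUNSTATE_UPDATE is bit 63 of state_entry_time, the flag occupies exactly the last byte of that field, so a single byte store of (entry_time >> 56) toggles it. A standalone sketch of both calculations, assuming 4 KiB pages and the 48-byte 64-bit struct asserted by the BUILD_BUG_ONs above:

    /* Standalone sketch of the split and update-byte arithmetic. */
    #include <stdint.h>
    #include <stdio.h>

    #define TOY_PAGE_SIZE        4096ul
    #define TOY_PAGE_MASK        (~(TOY_PAGE_SIZE - 1))
    #define TOY_RUNSTATE_UPDATE  (1ull << 63)

    int main(void)
    {
            uint64_t gpa = 0x10ff8;           /* 8 bytes before a page edge */
            size_t len = 0x2c + 4;            /* 64-bit vcpu_runstate_info  */
            size_t len1 = len, len2 = 0;

            if ((gpa & ~TOY_PAGE_MASK) + len >= TOY_PAGE_SIZE) {
                    len1 = TOY_PAGE_SIZE - (gpa & ~TOY_PAGE_MASK);
                    len2 = len - len1;
            }
            printf("copy %zu bytes via GPC1, %zu bytes via GPC2\n", len1, len2);

            /* The update flag occupies only the top byte of the LE u64. */
            uint64_t entry_time = 0x0123456789abcdefull;
            uint8_t top = (entry_time | TOY_RUNSTATE_UPDATE) >> 56;
            printf("byte written to *update_bit: 0x%02x\n", top);   /* 0x81 */
            return 0;
    }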
@@ -352,12 +488,10 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
* little more honest about it.
*/
read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info))) {
+ while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
read_unlock_irqrestore(&gpc->lock, flags);
- if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info)))
+ if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
return;
read_lock_irqsave(&gpc->lock, flags);
@@ -417,8 +551,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info))) {
+ while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
read_unlock_irqrestore(&gpc->lock, flags);
/*
@@ -432,8 +565,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
if (in_atomic() || !task_is_running(current))
return 1;
- if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info))) {
+ if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) {
/*
* If this failed, userspace has screwed up the
* vcpu_info mapping. No interrupts for you.
@@ -493,6 +625,17 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
r = 0;
break;
+ case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ mutex_lock(&kvm->lock);
+ kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
+ mutex_unlock(&kvm->lock);
+ r = 0;
+ break;
+
default:
break;
}
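Aside: a sketch of how a VMM might flip the new attribute from userspace. The attribute type and the u.runstate_update_flag field are taken from the hunk above; the surrounding uapi plumbing (KVM_XEN_HVM_SET_ATTR on the VM fd) is assumed, so treat this as illustrative:

    /* Illustrative userspace snippet; error handling elided. */
    #include <linux/kvm.h>
    #include <sys/ioctl.h>
    #include <string.h>

    static int enable_runstate_update_flag(int vm_fd)
    {
            struct kvm_xen_hvm_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.type = KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG;
            attr.u.runstate_update_flag = 1;

            /* Fails with EOPNOTSUPP when sched_info is unavailable. */
            return ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr);
    }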
@@ -530,6 +673,15 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
r = 0;
break;
+ case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag;
+ r = 0;
+ break;
+
default:
break;
}
@@ -554,15 +706,13 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
offsetof(struct compat_vcpu_info, time));
if (data->u.gpa == GPA_INVALID) {
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
r = 0;
break;
}
- r = kvm_gpc_activate(vcpu->kvm,
- &vcpu->arch.xen.vcpu_info_cache, NULL,
- KVM_HOST_USES_PFN, data->u.gpa,
- sizeof(struct vcpu_info));
+ r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
+ data->u.gpa, sizeof(struct vcpu_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -570,37 +720,65 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
if (data->u.gpa == GPA_INVALID) {
- kvm_gpc_deactivate(vcpu->kvm,
- &vcpu->arch.xen.vcpu_time_info_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
r = 0;
break;
}
- r = kvm_gpc_activate(vcpu->kvm,
- &vcpu->arch.xen.vcpu_time_info_cache,
- NULL, KVM_HOST_USES_PFN, data->u.gpa,
+ r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache,
+ data->u.gpa,
sizeof(struct pvclock_vcpu_time_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
break;
- case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
+ size_t sz, sz1, sz2;
+
if (!sched_info_on()) {
r = -EOPNOTSUPP;
break;
}
if (data->u.gpa == GPA_INVALID) {
- kvm_gpc_deactivate(vcpu->kvm,
- &vcpu->arch.xen.runstate_cache);
r = 0;
+ deactivate_out:
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
break;
}
- r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
- NULL, KVM_HOST_USES_PFN, data->u.gpa,
- sizeof(struct vcpu_runstate_info));
- break;
+ /*
+ * If the guest switches to 64-bit mode after setting the runstate
+ * address, that's actually OK. kvm_xen_update_runstate_guest()
+ * will cope.
+ */
+ if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
+ sz = sizeof(struct vcpu_runstate_info);
+ else
+ sz = sizeof(struct compat_vcpu_runstate_info);
+
+ /* How much fits in the (first) page? */
+ sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
+ r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache,
+ data->u.gpa, sz1);
+ if (r)
+ goto deactivate_out;
+
+ /* Either map the second page, or deactivate the second GPC */
+ if (sz1 >= sz) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
+ } else {
+ sz2 = sz - sz1;
+ BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK);
+ r = kvm_gpc_activate(&vcpu->arch.xen.runstate2_cache,
+ data->u.gpa + sz1, sz2);
+ if (r)
+ goto deactivate_out;
+ }
+ kvm_xen_update_runstate_guest(vcpu, false);
+ break;
+ }
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
if (!sched_info_on()) {
r = -EOPNOTSUPP;
@@ -693,6 +871,8 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
if (data->u.runstate.state <= RUNSTATE_offline)
kvm_xen_update_runstate(vcpu, data->u.runstate.state);
+ else if (vcpu->arch.xen.runstate_cache.active)
+ kvm_xen_update_runstate_guest(vcpu, false);
r = 0;
break;
@@ -954,6 +1134,14 @@ static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
}
+static inline int max_evtchn_port(struct kvm *kvm)
+{
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+ return EVTCHN_2L_NR_CHANNELS;
+ else
+ return COMPAT_EVTCHN_2L_NR_CHANNELS;
+}
+
static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
evtchn_port_t *ports)
{
@@ -964,9 +1152,9 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
bool ret = true;
int idx, i;
- read_lock_irqsave(&gpc->lock, flags);
idx = srcu_read_lock(&kvm->srcu);
- if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gpc_check(gpc, PAGE_SIZE))
goto out_rcu;
ret = false;
@@ -986,8 +1174,8 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
}
out_rcu:
- srcu_read_unlock(&kvm->srcu, idx);
read_unlock_irqrestore(&gpc->lock, flags);
+ srcu_read_unlock(&kvm->srcu, idx);
return ret;
}
@@ -1000,20 +1188,45 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
evtchn_port_t port, *ports;
gpa_t gpa;
- if (!longmode || !lapic_in_kernel(vcpu) ||
+ if (!lapic_in_kernel(vcpu) ||
!(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
return false;
idx = srcu_read_lock(&vcpu->kvm->srcu);
gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
- if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &sched_poll,
- sizeof(sched_poll))) {
+ if (!gpa) {
*r = -EFAULT;
return true;
}
+ if (IS_ENABLED(CONFIG_64BIT) && !longmode) {
+ struct compat_sched_poll sp32;
+
+ /* Sanity check that the compat struct definition is correct */
+ BUILD_BUG_ON(sizeof(sp32) != 16);
+
+ if (kvm_vcpu_read_guest(vcpu, gpa, &sp32, sizeof(sp32))) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ /*
+ * This is a 32-bit pointer to an array of evtchn_port_t which
+ * are uint32_t, so once it's converted no further compat
+ * handling is needed.
+ */
+ sched_poll.ports = (void *)(unsigned long)(sp32.ports);
+ sched_poll.nr_ports = sp32.nr_ports;
+ sched_poll.timeout = sp32.timeout;
+ } else {
+ if (kvm_vcpu_read_guest(vcpu, gpa, &sched_poll,
+ sizeof(sched_poll))) {
+ *r = -EFAULT;
+ return true;
+ }
+ }
+
if (unlikely(sched_poll.nr_ports > 1)) {
/* Xen (unofficially) limits number of pollers to 128 */
if (sched_poll.nr_ports > 128) {
@@ -1042,6 +1255,10 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
*r = -EFAULT;
goto out;
}
+ if (ports[i] >= max_evtchn_port(vcpu->kvm)) {
+ *r = -EINVAL;
+ goto out;
+ }
}
if (sched_poll.nr_ports == 1)
@@ -1215,6 +1432,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
bool longmode;
u64 input, params[6], r = -ENOSYS;
bool handled = false;
+ u8 cpl;
input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -1242,9 +1460,17 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
params[5] = (u64)kvm_r9_read(vcpu);
}
#endif
- trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
+ cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2],
params[3], params[4], params[5]);
+ /*
+ * Only allow hypercall acceleration for CPL0. The rare hypercalls that
+ * are permitted in guest userspace can be handled by the VMM.
+ */
+ if (unlikely(cpl > 0))
+ goto handle_in_userspace;
+
switch (input) {
case __HYPERVISOR_xen_version:
if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
@@ -1279,10 +1505,11 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
if (handled)
return kvm_xen_hypercall_set_result(vcpu, r);
+handle_in_userspace:
vcpu->run->exit_reason = KVM_EXIT_XEN;
vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
vcpu->run->xen.u.hcall.longmode = longmode;
- vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ vcpu->run->xen.u.hcall.cpl = cpl;
vcpu->run->xen.u.hcall.input = input;
vcpu->run->xen.u.hcall.params[0] = params[0];
vcpu->run->xen.u.hcall.params[1] = params[1];
@@ -1297,14 +1524,6 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
return 0;
}
-static inline int max_evtchn_port(struct kvm *kvm)
-{
- if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
- return EVTCHN_2L_NR_CHANNELS;
- else
- return COMPAT_EVTCHN_2L_NR_CHANNELS;
-}
-
static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
{
int poll_evtchn = vcpu->arch.xen.poll_evtchn;
@@ -1357,7 +1576,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
idx = srcu_read_lock(&kvm->srcu);
read_lock_irqsave(&gpc->lock, flags);
- if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+ if (!kvm_gpc_check(gpc, PAGE_SIZE))
goto out_rcu;
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
@@ -1391,7 +1610,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
gpc = &vcpu->arch.xen.vcpu_info_cache;
read_lock_irqsave(&gpc->lock, flags);
- if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) {
+ if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
/*
* Could not access the vcpu_info. Set the bit in-kernel
* and prod the vCPU to deliver it for itself.
@@ -1489,7 +1708,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
break;
idx = srcu_read_lock(&kvm->srcu);
- rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE);
+ rc = kvm_gpc_refresh(gpc, PAGE_SIZE);
srcu_read_unlock(&kvm->srcu, idx);
} while(!rc);
@@ -1819,9 +2038,14 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
- kvm_gpc_init(&vcpu->arch.xen.runstate_cache);
- kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache);
- kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache);
+ kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
}
void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
@@ -1829,9 +2053,10 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
if (kvm_xen_timer_enabled(vcpu))
kvm_xen_stop_timer(vcpu);
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache);
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
del_timer_sync(&vcpu->arch.xen.poll_timer);
}
@@ -1839,7 +2064,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
void kvm_xen_init_vm(struct kvm *kvm)
{
idr_init(&kvm->arch.xen.evtchn_ports);
- kvm_gpc_init(&kvm->arch.xen.shinfo_cache);
+ kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN);
}
void kvm_xen_destroy_vm(struct kvm *kvm)
@@ -1847,7 +2072,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm)
struct evtchnfd *evtchnfd;
int i;
- kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache);
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
if (!evtchnfd->deliver.port.port)
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index 532a535a9e99..ea33d80a0c51 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -143,11 +143,11 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
#include <asm/xen/interface.h>
#include <xen/interface/vcpu.h>
-void kvm_xen_update_runstate_guest(struct kvm_vcpu *vcpu, int state);
+void kvm_xen_update_runstate(struct kvm_vcpu *vcpu, int state);
static inline void kvm_xen_runstate_set_running(struct kvm_vcpu *vcpu)
{
- kvm_xen_update_runstate_guest(vcpu, RUNSTATE_running);
+ kvm_xen_update_runstate(vcpu, RUNSTATE_running);
}
static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
@@ -162,7 +162,7 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(!vcpu->preempted))
return;
- kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
+ kvm_xen_update_runstate(vcpu, RUNSTATE_runnable);
}
/* 32-bit compatibility definitions, also used natively in 32-bit build */
@@ -207,4 +207,11 @@ struct compat_vcpu_runstate_info {
uint64_t time[4];
} __attribute__((packed));
+struct compat_sched_poll {
+ /* This is actually a guest virtual address which points to ports. */
+ uint32_t ports;
+ unsigned int nr_ports;
+ uint64_t timeout;
+};
+
#endif /* __ARCH_X86_KVM_XEN_H__ */