summaryrefslogtreecommitdiff
path: root/arch/x86/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-05-16 00:46:43 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2024-05-16 00:46:43 +0300
commitf4b0c4b508364fde023e4f7b9f23f7e38c663dfe (patch)
treed10d9c6602dcd1d2d50effe18ce63edc4d4bb706 /arch/x86/include
parent2e9250022e9f2c9cde3b98fd26dcad1c2a9aedf3 (diff)
parentcba23f333fedf8e39743b0c9787b45a5bd7d03af (diff)
downloadlinux-f4b0c4b508364fde023e4f7b9f23f7e38c663dfe.tar.xz
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini: "ARM: - Move a lot of state that was previously stored on a per vcpu basis into a per-CPU area, because it is only pertinent to the host while the vcpu is loaded. This results in better state tracking, and a smaller vcpu structure. - Add full handling of the ERET/ERETAA/ERETAB instructions in nested virtualisation. The last two instructions also require emulating part of the pointer authentication extension. As a result, the trap handling of pointer authentication has been greatly simplified. - Turn the global (and not very scalable) LPI translation cache into a per-ITS, scalable cache, making non directly injected LPIs much cheaper to make visible to the vcpu. - A batch of pKVM patches, mostly fixes and cleanups, as the upstreaming process seems to be resuming. Fingers crossed! - Allocate PPIs and SGIs outside of the vcpu structure, allowing for smaller EL2 mapping and some flexibility in implementing more or less than 32 private IRQs. - Purge stale mpidr_data if a vcpu is created after the MPIDR map has been created. - Preserve vcpu-specific ID registers across a vcpu reset. - Various minor cleanups and improvements. LoongArch: - Add ParaVirt IPI support - Add software breakpoint support - Add mmio trace events support RISC-V: - Support guest breakpoints using ebreak - Introduce per-VCPU mp_state_lock and reset_cntx_lock - Virtualize SBI PMU snapshot and counter overflow interrupts - New selftests for SBI PMU and Guest ebreak - Some preparatory work for both TDX and SNP page fault handling. This also cleans up the page fault path, so that the priorities of various kinds of fauls (private page, no memory, write to read-only slot, etc.) are easier to follow. x86: - Minimize amount of time that shadow PTEs remain in the special REMOVED_SPTE state. This is a state where the mmu_lock is held for reading but concurrent accesses to the PTE have to spin; shortening its use allows other vCPUs to repopulate the zapped region while the zapper finishes tearing down the old, defunct page tables. - Advertise the max mappable GPA in the "guest MAXPHYADDR" CPUID field, which is defined by hardware but left for software use. This lets KVM communicate its inability to map GPAs that set bits 51:48 on hosts without 5-level nested page tables. Guest firmware is expected to use the information when mapping BARs; this avoids that they end up at a legal, but unmappable, GPA. - Fixed a bug where KVM would not reject accesses to MSR that aren't supposed to exist given the vCPU model and/or KVM configuration. - As usual, a bunch of code cleanups. x86 (AMD): - Implement a new and improved API to initialize SEV and SEV-ES VMs, which will also be extendable to SEV-SNP. The new API specifies the desired encryption in KVM_CREATE_VM and then separately initializes the VM. The new API also allows customizing the desired set of VMSA features; the features affect the measurement of the VM's initial state, and therefore enabling them cannot be done tout court by the hypervisor. While at it, the new API includes two bugfixes that couldn't be applied to the old one without a flag day in userspace or without affecting the initial measurement. When a SEV-ES VM is created with the new VM type, KVM_GET_REGS/KVM_SET_REGS and friends are rejected once the VMSA has been encrypted. Also, the FPU and AVX state will be synchronized and encrypted too. - Support for GHCB version 2 as applicable to SEV-ES guests. This, once more, is only accessible when using the new KVM_SEV_INIT2 flow for initialization of SEV-ES VMs. x86 (Intel): - An initial bunch of prerequisite patches for Intel TDX were merged. They generally don't do anything interesting. The only somewhat user visible change is a new debugging mode that checks that KVM's MMU never triggers a #VE virtualization exception in the guest. - Clear vmcs.EXIT_QUALIFICATION when synthesizing an EPT Misconfig VM-Exit to L1, as per the SDM. Generic: - Use vfree() instead of kvfree() for allocations that always use vcalloc() or __vcalloc(). - Remove .change_pte() MMU notifier - the changes to non-KVM code are small and Andrew Morton asked that I also take those through the KVM tree. The callback was only ever implemented by KVM (which was also the original user of MMU notifiers) but it had been nonfunctional ever since calls to set_pte_at_notify were wrapped with invalidate_range_start and invalidate_range_end... in 2012. Selftests: - Enhance the demand paging test to allow for better reporting and stressing of UFFD performance. - Convert the steal time test to generate TAP-friendly output. - Fix a flaky false positive in the xen_shinfo_test due to comparing elapsed time across two different clock domains. - Skip the MONITOR/MWAIT test if the host doesn't actually support MWAIT. - Avoid unnecessary use of "sudo" in the NX hugepage test wrapper shell script, to play nice with running in a minimal userspace environment. - Allow skipping the RSEQ test's sanity check that the vCPU was able to complete a reasonable number of KVM_RUNs, as the assert can fail on a completely valid setup. If the test is run on a large-ish system that is otherwise idle, and the test isn't affined to a low-ish number of CPUs, the vCPU task can be repeatedly migrated to CPUs that are in deep sleep states, which results in the vCPU having very little net runtime before the next migration due to high wakeup latencies. - Define _GNU_SOURCE for all selftests to fix a warning that was introduced by a change to kselftest_harness.h late in the 6.9 cycle, and because forcing every test to #define _GNU_SOURCE is painful. - Provide a global pseudo-RNG instance for all tests, so that library code can generate random, but determinstic numbers. - Use the global pRNG to randomly force emulation of select writes from guest code on x86, e.g. to help validate KVM's emulation of locked accesses. - Allocate and initialize x86's GDT, IDT, TSS, segments, and default exception handlers at VM creation, instead of forcing tests to manually trigger the related setup. Documentation: - Fix a goof in the KVM_CREATE_GUEST_MEMFD documentation" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (225 commits) selftests/kvm: remove dead file KVM: selftests: arm64: Test vCPU-scoped feature ID registers KVM: selftests: arm64: Test that feature ID regs survive a reset KVM: selftests: arm64: Store expected register value in set_id_regs KVM: selftests: arm64: Rename helper in set_id_regs to imply VM scope KVM: arm64: Only reset vCPU-scoped feature ID regs once KVM: arm64: Reset VM feature ID regs from kvm_reset_sys_regs() KVM: arm64: Rename is_id_reg() to imply VM scope KVM: arm64: Destroy mpidr_data for 'late' vCPU creation KVM: arm64: Use hVHE in pKVM by default on CPUs with VHE support KVM: arm64: Fix hvhe/nvhe early alias parsing KVM: SEV: Allow per-guest configuration of GHCB protocol version KVM: SEV: Add GHCB handling for termination requests KVM: SEV: Add GHCB handling for Hypervisor Feature Support requests KVM: SEV: Add support to handle AP reset MSR protocol KVM: x86: Explicitly zero kvm_caps during vendor module load KVM: x86: Fully re-initialize supported_mce_cap on vendor module load KVM: x86: Fully re-initialize supported_vm_types on vendor module load KVM: x86/mmu: Sanity check that __kvm_faultin_pfn() doesn't create noslot pfns KVM: x86/mmu: Initialize kvm_page_fault's pfn and hva to error values ...
Diffstat (limited to 'arch/x86/include')
-rw-r--r--arch/x86/include/asm/fpu/api.h3
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h63
-rw-r--r--arch/x86/include/asm/sev-common.h8
-rw-r--r--arch/x86/include/asm/vmx.h13
-rw-r--r--arch/x86/include/uapi/asm/kvm.h22
6 files changed, 80 insertions, 30 deletions
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index a2be3aefff9f..f86ad3335529 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -143,6 +143,9 @@ extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfe
extern u64 xstate_get_guest_group_perm(void);
+extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
+
+
/* KVM specific functions */
extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu);
extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu);
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 110d7f29ca9a..5187fcf4b610 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -121,6 +121,7 @@ KVM_X86_OP(enter_smm)
KVM_X86_OP(leave_smm)
KVM_X86_OP(enable_smi_window)
#endif
+KVM_X86_OP_OPTIONAL(dev_get_attr)
KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6efd1497b026..ece45b3f6f20 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -254,28 +254,31 @@ enum x86_intercept_stage;
KVM_GUESTDBG_INJECT_DB | \
KVM_GUESTDBG_BLOCKIRQ)
+#define PFERR_PRESENT_MASK BIT(0)
+#define PFERR_WRITE_MASK BIT(1)
+#define PFERR_USER_MASK BIT(2)
+#define PFERR_RSVD_MASK BIT(3)
+#define PFERR_FETCH_MASK BIT(4)
+#define PFERR_PK_MASK BIT(5)
+#define PFERR_SGX_MASK BIT(15)
+#define PFERR_GUEST_RMP_MASK BIT_ULL(31)
+#define PFERR_GUEST_FINAL_MASK BIT_ULL(32)
+#define PFERR_GUEST_PAGE_MASK BIT_ULL(33)
+#define PFERR_GUEST_ENC_MASK BIT_ULL(34)
+#define PFERR_GUEST_SIZEM_MASK BIT_ULL(35)
+#define PFERR_GUEST_VMPL_MASK BIT_ULL(36)
-#define PFERR_PRESENT_BIT 0
-#define PFERR_WRITE_BIT 1
-#define PFERR_USER_BIT 2
-#define PFERR_RSVD_BIT 3
-#define PFERR_FETCH_BIT 4
-#define PFERR_PK_BIT 5
-#define PFERR_SGX_BIT 15
-#define PFERR_GUEST_FINAL_BIT 32
-#define PFERR_GUEST_PAGE_BIT 33
-#define PFERR_IMPLICIT_ACCESS_BIT 48
-
-#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT)
-#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT)
-#define PFERR_USER_MASK BIT(PFERR_USER_BIT)
-#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT)
-#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT)
-#define PFERR_PK_MASK BIT(PFERR_PK_BIT)
-#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT)
-#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT)
-#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT)
-#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT)
+/*
+ * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks
+ * when emulating instructions that triggers implicit access.
+ */
+#define PFERR_IMPLICIT_ACCESS BIT_ULL(48)
+/*
+ * PRIVATE_ACCESS is a KVM-defined flag us to indicate that a fault occurred
+ * when the guest was accessing private memory.
+ */
+#define PFERR_PRIVATE_ACCESS BIT_ULL(49)
+#define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
PFERR_WRITE_MASK | \
@@ -994,9 +997,6 @@ struct kvm_vcpu_arch {
u64 msr_kvm_poll_control;
- /* set at EPT violation at this point */
- unsigned long exit_qualification;
-
/* pv related host specific info */
struct {
bool pv_unhalted;
@@ -1280,12 +1280,14 @@ enum kvm_apicv_inhibit {
};
struct kvm_arch {
- unsigned long vm_type;
unsigned long n_used_mmu_pages;
unsigned long n_requested_mmu_pages;
unsigned long n_max_mmu_pages;
unsigned int indirect_shadow_pages;
u8 mmu_valid_gen;
+ u8 vm_type;
+ bool has_private_mem;
+ bool has_protected_state;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
@@ -1312,6 +1314,8 @@ struct kvm_arch {
*/
spinlock_t mmu_unsync_pages_lock;
+ u64 shadow_mmio_value;
+
struct iommu_domain *iommu_domain;
bool iommu_noncoherent;
#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
@@ -1779,6 +1783,7 @@ struct kvm_x86_ops {
void (*enable_smi_window)(struct kvm_vcpu *vcpu);
#endif
+ int (*dev_get_attr)(u32 group, u64 attr, u64 *val);
int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
@@ -1844,6 +1849,7 @@ struct kvm_arch_async_pf {
gfn_t gfn;
unsigned long cr3;
bool direct_map;
+ u64 error_code;
};
extern u32 __read_mostly kvm_nr_uret_msrs;
@@ -2140,6 +2146,10 @@ static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
}
+unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+ unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3,
+ int op_64_bit, int cpl);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
@@ -2153,8 +2163,9 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
+
#ifdef CONFIG_KVM_PRIVATE_MEM
-#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM)
+#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
#else
#define kvm_arch_has_private_mem(kvm) false
#endif
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index b463fcbd4b90..5a8246dd532f 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -54,8 +54,10 @@
(((unsigned long)fn) << 32))
/* AP Reset Hold */
-#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
-#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
+#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+#define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12
+#define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0)
/* GHCB GPA Register */
#define GHCB_MSR_REG_GPA_REQ 0x012
@@ -99,6 +101,8 @@ enum psc_op {
/* GHCB Hypervisor Feature Request/Response */
#define GHCB_MSR_HV_FT_REQ 0x080
#define GHCB_MSR_HV_FT_RESP 0x081
+#define GHCB_MSR_HV_FT_POS 12
+#define GHCB_MSR_HV_FT_MASK GENMASK_ULL(51, 0)
#define GHCB_MSR_HV_FT_RESP_VAL(v) \
/* GHCBData[63:12] */ \
(((u64)(v) & GENMASK_ULL(63, 12)) >> 12)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 4dba17363008..d77a31039f24 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -71,6 +71,7 @@
#define SECONDARY_EXEC_ENCLS_EXITING VMCS_CONTROL_BIT(ENCLS_EXITING)
#define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING)
#define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
+#define SECONDARY_EXEC_EPT_VIOLATION_VE VMCS_CONTROL_BIT(EPT_VIOLATION_VE)
#define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES)
#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
@@ -226,6 +227,8 @@ enum vmcs_field {
VMREAD_BITMAP_HIGH = 0x00002027,
VMWRITE_BITMAP = 0x00002028,
VMWRITE_BITMAP_HIGH = 0x00002029,
+ VE_INFORMATION_ADDRESS = 0x0000202A,
+ VE_INFORMATION_ADDRESS_HIGH = 0x0000202B,
XSS_EXIT_BITMAP = 0x0000202C,
XSS_EXIT_BITMAP_HIGH = 0x0000202D,
ENCLS_EXITING_BITMAP = 0x0000202E,
@@ -514,6 +517,7 @@ enum vmcs_field {
#define VMX_EPT_IPAT_BIT (1ull << 6)
#define VMX_EPT_ACCESS_BIT (1ull << 8)
#define VMX_EPT_DIRTY_BIT (1ull << 9)
+#define VMX_EPT_SUPPRESS_VE_BIT (1ull << 63)
#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \
VMX_EPT_WRITABLE_MASK | \
VMX_EPT_EXECUTABLE_MASK)
@@ -630,4 +634,13 @@ enum vmx_l1d_flush_state {
extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
+struct vmx_ve_information {
+ u32 exit_reason;
+ u32 delivery;
+ u64 exit_qualification;
+ u64 guest_linear_address;
+ u64 guest_physical_address;
+ u16 eptp_index;
+};
+
#endif
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index ef11aa4cab42..9fae1b73b529 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -457,8 +457,13 @@ struct kvm_sync_regs {
#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001
-/* attributes for system fd (group 0) */
-#define KVM_X86_XCOMP_GUEST_SUPP 0
+/* vendor-independent attributes for system fd (group 0) */
+#define KVM_X86_GRP_SYSTEM 0
+# define KVM_X86_XCOMP_GUEST_SUPP 0
+
+/* vendor-specific groups and attributes for system fd */
+#define KVM_X86_GRP_SEV 1
+# define KVM_X86_SEV_VMSA_FEATURES 0
struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
@@ -689,6 +694,9 @@ enum sev_cmd_id {
/* Guest Migration Extension */
KVM_SEV_SEND_CANCEL,
+ /* Second time is the charm; improved versions of the above ioctls. */
+ KVM_SEV_INIT2,
+
KVM_SEV_NR_MAX,
};
@@ -700,6 +708,14 @@ struct kvm_sev_cmd {
__u32 sev_fd;
};
+struct kvm_sev_init {
+ __u64 vmsa_features;
+ __u32 flags;
+ __u16 ghcb_version;
+ __u16 pad1;
+ __u32 pad2[8];
+};
+
struct kvm_sev_launch_start {
__u32 handle;
__u32 policy;
@@ -856,5 +872,7 @@ struct kvm_hyperv_eventfd {
#define KVM_X86_DEFAULT_VM 0
#define KVM_X86_SW_PROTECTED_VM 1
+#define KVM_X86_SEV_VM 2
+#define KVM_X86_SEV_ES_VM 3
#endif /* _ASM_X86_KVM_H */